summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/Makefile3
-rw-r--r--mm/allocpercpu.c26
-rw-r--r--mm/bootmem.c948
-rw-r--r--mm/bounce.c2
-rw-r--r--mm/filemap.c432
-rw-r--r--mm/filemap_xip.c70
-rw-r--r--mm/fremap.c3
-rw-r--r--mm/highmem.c6
-rw-r--r--mm/hugetlb.c1681
-rw-r--r--mm/internal.h61
-rw-r--r--mm/madvise.c4
-rw-r--r--mm/memcontrol.c385
-rw-r--r--mm/memory.c397
-rw-r--r--mm/memory_hotplug.c80
-rw-r--r--mm/mempolicy.c16
-rw-r--r--mm/migrate.c65
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mm_init.c152
-rw-r--r--mm/mmap.c189
-rw-r--r--mm/mmu_notifier.c277
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/mprotect.c21
-rw-r--r--mm/mremap.c6
-rw-r--r--mm/nommu.c25
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/page-writeback.c25
-rw-r--r--mm/page_alloc.c288
-rw-r--r--mm/page_isolation.c13
-rw-r--r--mm/pdflush.c4
-rw-r--r--mm/quicklist.c9
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c69
-rw-r--r--mm/shmem.c106
-rw-r--r--mm/shmem_acl.c2
-rw-r--r--mm/slab.c35
-rw-r--r--mm/slob.c28
-rw-r--r--mm/slub.c148
-rw-r--r--mm/sparse-vmemmap.c2
-rw-r--r--mm/sparse.c116
-rw-r--r--mm/swap.c17
-rw-r--r--mm/swap_state.c40
-rw-r--r--mm/swapfile.c65
-rw-r--r--mm/tiny-shmem.c26
-rw-r--r--mm/truncate.c16
-rw-r--r--mm/util.c70
-rw-r--r--mm/vmalloc.c26
-rw-r--r--mm/vmscan.c93
-rw-r--r--mm/vmstat.c22
49 files changed, 4336 insertions, 1758 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa819d628c..0bd9c2dbb2a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -129,7 +129,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC64 || SUPERH)
+ depends on (IA64 || X86 || PPC64 || SUPERH || S390)
comment "Memory hotplug is currently incompatible with Software Suspend"
depends on SPARSEMEM && HOTPLUG && HIBERNATION
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
config MIGRATION
bool "Page migration"
def_bool y
- depends on NUMA
+ depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful for
@@ -199,9 +199,12 @@ config BOUNCE
config NR_QUICK
int
depends on QUICKLIST
- default "2" if SUPERH
+ default "2" if SUPERH || AVR32
default "1"
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46..da4ccf015ae 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o $(mmu-y)
+ page_isolation.o mm_init.o $(mmu-y)
obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index f4026bae6ee..4297bc41bfd 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -1,7 +1,7 @@
/*
* linux/mm/allocpercpu.c
*
- * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
+ * Separated from slab.c August 11, 2006 Christoph Lameter
*/
#include <linux/mm.h>
#include <linux/module.h>
@@ -18,27 +18,28 @@
* Depopulating per-cpu data for a cpu going offline would be a typical
* use case. You need to register a cpu hotplug handler for that purpose.
*/
-void percpu_depopulate(void *__pdata, int cpu)
+static void percpu_depopulate(void *__pdata, int cpu)
{
struct percpu_data *pdata = __percpu_disguise(__pdata);
kfree(pdata->ptrs[cpu]);
pdata->ptrs[cpu] = NULL;
}
-EXPORT_SYMBOL_GPL(percpu_depopulate);
/**
* percpu_depopulate_mask - depopulate per-cpu data for some cpu's
* @__pdata: per-cpu data to depopulate
* @mask: depopulate per-cpu data for cpu's selected through mask bits
*/
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
{
int cpu;
- for_each_cpu_mask(cpu, *mask)
+ for_each_cpu_mask_nr(cpu, *mask)
percpu_depopulate(__pdata, cpu);
}
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+#define percpu_depopulate_mask(__pdata, mask) \
+ __percpu_depopulate_mask((__pdata), &(mask))
/**
* percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
* use case. You need to register a cpu hotplug handler for that purpose.
* Per-cpu object is populated with zeroed buffer.
*/
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
{
struct percpu_data *pdata = __percpu_disguise(__pdata);
int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
pdata->ptrs[cpu] = kzalloc(size, gfp);
return pdata->ptrs[cpu];
}
-EXPORT_SYMBOL_GPL(percpu_populate);
/**
* percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate);
*
* Per-cpu objects are populated with zeroed buffers.
*/
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
- cpumask_t *mask)
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+ cpumask_t *mask)
{
cpumask_t populated;
int cpu;
cpus_clear(populated);
- for_each_cpu_mask(cpu, *mask)
+ for_each_cpu_mask_nr(cpu, *mask)
if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
__percpu_depopulate_mask(__pdata, &populated);
return -ENOMEM;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
cpu_set(cpu, populated);
return 0;
}
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+ __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
/**
* percpu_alloc_mask - initial setup of per-cpu data
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8fb927392b..ad8eec6e44a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
/*
- * linux/mm/bootmem.c
+ * bootmem - A boot-time physical memory allocator and configurator
*
* Copyright (C) 1999 Ingo Molnar
- * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ * 1999 Kanoj Sarcar, SGI
+ * 2008 Johannes Weiner
*
- * simple boot-time physical memory area allocator and
- * free memory collector. It's used to deal with reserved
- * system memory and memory holes as well.
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
*/
#include <linux/init.h>
#include <linux/pfn.h>
@@ -19,15 +19,10 @@
#include "internal.h"
-/*
- * Access to this subsystem has to be serialized externally. (this is
- * true for the boot process anyway)
- */
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-static LIST_HEAD(bdata_list);
#ifdef CONFIG_CRASH_DUMP
/*
* If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
unsigned long saved_max_pfn;
#endif
-/* return the number of _pages_ that will be allocated for the boot bitmap */
-unsigned long __init bootmem_bootmap_pages(unsigned long pages)
+bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
+
+static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
+
+static int bootmem_debug;
+
+static int __init bootmem_debug_setup(char *buf)
{
- unsigned long mapsize;
+ bootmem_debug = 1;
+ return 0;
+}
+early_param("bootmem_debug", bootmem_debug_setup);
- mapsize = (pages+7)/8;
- mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
- mapsize >>= PAGE_SHIFT;
+#define bdebug(fmt, args...) ({ \
+ if (unlikely(bootmem_debug)) \
+ printk(KERN_INFO \
+ "bootmem::%s " fmt, \
+ __FUNCTION__, ## args); \
+})
- return mapsize;
+static unsigned long __init bootmap_bytes(unsigned long pages)
+{
+ unsigned long bytes = (pages + 7) / 8;
+
+ return ALIGN(bytes, sizeof(long));
}
-/*
- * link bdata in order
+/**
+ * bootmem_bootmap_pages - calculate bitmap size in pages
+ * @pages: number of pages the bitmap has to represent
*/
-static void __init link_bootmem(bootmem_data_t *bdata)
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
{
- bootmem_data_t *ent;
+ unsigned long bytes = bootmap_bytes(pages);
- if (list_empty(&bdata_list)) {
- list_add(&bdata->list, &bdata_list);
- return;
- }
- /* insert in order */
- list_for_each_entry(ent, &bdata_list, list) {
- if (bdata->node_boot_start < ent->node_boot_start) {
- list_add_tail(&bdata->list, &ent->list);
- return;
- }
- }
- list_add_tail(&bdata->list, &bdata_list);
+ return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
}
/*
- * Given an initialised bdata, it returns the size of the boot bitmap
+ * link bdata in order
*/
-static unsigned long __init get_mapsize(bootmem_data_t *bdata)
+static void __init link_bootmem(bootmem_data_t *bdata)
{
- unsigned long mapsize;
- unsigned long start = PFN_DOWN(bdata->node_boot_start);
- unsigned long end = bdata->node_low_pfn;
+ struct list_head *iter;
- mapsize = ((end - start) + 7) / 8;
- return ALIGN(mapsize, sizeof(long));
+ list_for_each(iter, &bdata_list) {
+ bootmem_data_t *ent;
+
+ ent = list_entry(iter, bootmem_data_t, list);
+ if (bdata->node_min_pfn < ent->node_min_pfn)
+ break;
+ }
+ list_add_tail(&bdata->list, iter);
}
/*
* Called once to set up the allocator itself.
*/
-static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
+static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
unsigned long mapstart, unsigned long start, unsigned long end)
{
- bootmem_data_t *bdata = pgdat->bdata;
unsigned long mapsize;
+ mminit_validate_memmodel_limits(&start, &end);
bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
- bdata->node_boot_start = PFN_PHYS(start);
+ bdata->node_min_pfn = start;
bdata->node_low_pfn = end;
link_bootmem(bdata);
@@ -100,427 +104,484 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
* Initially all pages are reserved - setup_arch() has to
* register free RAM areas explicitly.
*/
- mapsize = get_mapsize(bdata);
+ mapsize = bootmap_bytes(end - start);
memset(bdata->node_bootmem_map, 0xff, mapsize);
+ bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
+ bdata - bootmem_node_data, start, mapstart, end, mapsize);
+
return mapsize;
}
-/*
- * Marks a particular physical memory range as unallocatable. Usable RAM
- * might be used for boot-time allocations - or it might get added
- * to the free page pool later on.
+/**
+ * init_bootmem_node - register a node as boot memory
+ * @pgdat: node to register
+ * @freepfn: pfn where the bitmap for this node is to be placed
+ * @startpfn: first pfn on the node
+ * @endpfn: first pfn after the node
+ *
+ * Returns the number of bytes needed to hold the bitmap for this node.
*/
-static int __init can_reserve_bootmem_core(bootmem_data_t *bdata,
- unsigned long addr, unsigned long size, int flags)
+unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
+ unsigned long startpfn, unsigned long endpfn)
{
- unsigned long sidx, eidx;
- unsigned long i;
+ return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
+}
- BUG_ON(!size);
+/**
+ * init_bootmem - register boot memory
+ * @start: pfn where the bitmap is to be placed
+ * @pages: number of available physical pages
+ *
+ * Returns the number of bytes needed to hold the bitmap.
+ */
+unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+{
+ max_low_pfn = pages;
+ min_low_pfn = start;
+ return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
+}
- /* out of range, don't hold other */
- if (addr + size < bdata->node_boot_start ||
- PFN_DOWN(addr) > bdata->node_low_pfn)
+static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+{
+ int aligned;
+ struct page *page;
+ unsigned long start, end, pages, count = 0;
+
+ if (!bdata->node_bootmem_map)
return 0;
+ start = bdata->node_min_pfn;
+ end = bdata->node_low_pfn;
+
/*
- * Round up to index to the range.
+ * If the start is aligned to the machines wordsize, we might
+ * be able to free pages in bulks of that order.
*/
- if (addr > bdata->node_boot_start)
- sidx= PFN_DOWN(addr - bdata->node_boot_start);
- else
- sidx = 0;
+ aligned = !(start & (BITS_PER_LONG - 1));
- eidx = PFN_UP(addr + size - bdata->node_boot_start);
- if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
- eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
+ bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
+ bdata - bootmem_node_data, start, end, aligned);
- for (i = sidx; i < eidx; i++) {
- if (test_bit(i, bdata->node_bootmem_map)) {
- if (flags & BOOTMEM_EXCLUSIVE)
- return -EBUSY;
+ while (start < end) {
+ unsigned long *map, idx, vec;
+
+ map = bdata->node_bootmem_map;
+ idx = start - bdata->node_min_pfn;
+ vec = ~map[idx / BITS_PER_LONG];
+
+ if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+ int order = ilog2(BITS_PER_LONG);
+
+ __free_pages_bootmem(pfn_to_page(start), order);
+ count += BITS_PER_LONG;
+ } else {
+ unsigned long off = 0;
+
+ while (vec && off < BITS_PER_LONG) {
+ if (vec & 1) {
+ page = pfn_to_page(start + off);
+ __free_pages_bootmem(page, 0);
+ count++;
+ }
+ vec >>= 1;
+ off++;
+ }
}
+ start += BITS_PER_LONG;
}
- return 0;
+ page = virt_to_page(bdata->node_bootmem_map);
+ pages = bdata->node_low_pfn - bdata->node_min_pfn;
+ pages = bootmem_bootmap_pages(pages);
+ count += pages;
+ while (pages--)
+ __free_pages_bootmem(page++, 0);
+
+ bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
+ return count;
}
-static void __init reserve_bootmem_core(bootmem_data_t *bdata,
- unsigned long addr, unsigned long size, int flags)
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
- unsigned long sidx, eidx;
- unsigned long i;
+ register_page_bootmem_info_node(pgdat);
+ return free_all_bootmem_core(pgdat->bdata);
+}
- BUG_ON(!size);
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem(void)
+{
+ return free_all_bootmem_core(NODE_DATA(0)->bdata);
+}
+
+static void __init __free(bootmem_data_t *bdata,
+ unsigned long sidx, unsigned long eidx)
+{
+ unsigned long idx;
- /* out of range */
- if (addr + size < bdata->node_boot_start ||
- PFN_DOWN(addr) > bdata->node_low_pfn)
- return;
+ bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
+ sidx + bdata->node_min_pfn,
+ eidx + bdata->node_min_pfn);
- /*
- * Round up to index to the range.
- */
- if (addr > bdata->node_boot_start)
- sidx= PFN_DOWN(addr - bdata->node_boot_start);
- else
- sidx = 0;
+ if (bdata->hint_idx > sidx)
+ bdata->hint_idx = sidx;
- eidx = PFN_UP(addr + size - bdata->node_boot_start);
- if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
- eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
+ for (idx = sidx; idx < eidx; idx++)
+ if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
+ BUG();
+}
- for (i = sidx; i < eidx; i++) {
- if (test_and_set_bit(i, bdata->node_bootmem_map)) {
-#ifdef CONFIG_DEBUG_BOOTMEM
- printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
-#endif
+static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
+ unsigned long eidx, int flags)
+{
+ unsigned long idx;
+ int exclusive = flags & BOOTMEM_EXCLUSIVE;
+
+ bdebug("nid=%td start=%lx end=%lx flags=%x\n",
+ bdata - bootmem_node_data,
+ sidx + bdata->node_min_pfn,
+ eidx + bdata->node_min_pfn,
+ flags);
+
+ for (idx = sidx; idx < eidx; idx++)
+ if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
+ if (exclusive) {
+ __free(bdata, sidx, idx);
+ return -EBUSY;
+ }
+ bdebug("silent double reserve of PFN %lx\n",
+ idx + bdata->node_min_pfn);
}
- }
+ return 0;
}
-static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
- unsigned long size)
+static int __init mark_bootmem_node(bootmem_data_t *bdata,
+ unsigned long start, unsigned long end,
+ int reserve, int flags)
{
unsigned long sidx, eidx;
- unsigned long i;
- BUG_ON(!size);
+ bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
+ bdata - bootmem_node_data, start, end, reserve, flags);
- /* out range */
- if (addr + size < bdata->node_boot_start ||
- PFN_DOWN(addr) > bdata->node_low_pfn)
- return;
- /*
- * round down end of usable mem, partially free pages are
- * considered reserved.
- */
+ BUG_ON(start < bdata->node_min_pfn);
+ BUG_ON(end > bdata->node_low_pfn);
- if (addr >= bdata->node_boot_start && addr < bdata->last_success)
- bdata->last_success = addr;
+ sidx = start - bdata->node_min_pfn;
+ eidx = end - bdata->node_min_pfn;
- /*
- * Round up to index to the range.
- */
- if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
- sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
+ if (reserve)
+ return __reserve(bdata, sidx, eidx, flags);
else
- sidx = 0;
+ __free(bdata, sidx, eidx);
+ return 0;
+}
- eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
- if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
- eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
+static int __init mark_bootmem(unsigned long start, unsigned long end,
+ int reserve, int flags)
+{
+ unsigned long pos;
+ bootmem_data_t *bdata;
- for (i = sidx; i < eidx; i++) {
- if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
- BUG();
+ pos = start;
+ list_for_each_entry(bdata, &bdata_list, list) {
+ int err;
+ unsigned long max;
+
+ if (pos < bdata->node_min_pfn ||
+ pos >= bdata->node_low_pfn) {
+ BUG_ON(pos != start);
+ continue;
+ }
+
+ max = min(bdata->node_low_pfn, end);
+
+ err = mark_bootmem_node(bdata, pos, max, reserve, flags);
+ if (reserve && err) {
+ mark_bootmem(start, pos, 0, 0);
+ return err;
+ }
+
+ if (max == end)
+ return 0;
+ pos = bdata->node_low_pfn;
}
+ BUG();
}
-/*
- * We 'merge' subsequent allocations to save space. We might 'lose'
- * some fraction of a page if allocations cannot be satisfied due to
- * size constraints on boxes where there is physical RAM space
- * fragmentation - in these cases (mostly large memory boxes) this
- * is not a problem.
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
*
- * On low memory boxes we get it right in 100% of the cases.
+ * Partial pages will be considered reserved and left as they are.
*
- * alignment has to be a power of 2 value.
- *
- * NOTE: This function is _not_ reentrant.
+ * The range must reside completely on the specified node.
*/
-void * __init
-__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
- unsigned long align, unsigned long goal, unsigned long limit)
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size)
{
- unsigned long areasize, preferred;
- unsigned long i, start = 0, incr, eidx, end_pfn;
- void *ret;
- unsigned long node_boot_start;
- void *node_bootmem_map;
-
- if (!size) {
- printk("__alloc_bootmem_core(): zero-sized request\n");
- BUG();
- }
- BUG_ON(align & (align-1));
+ unsigned long start, end;
- /* on nodes without memory - bootmem_map is NULL */
- if (!bdata->node_bootmem_map)
- return NULL;
+ start = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
- /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */
- node_boot_start = bdata->node_boot_start;
- node_bootmem_map = bdata->node_bootmem_map;
- if (align) {
- node_boot_start = ALIGN(bdata->node_boot_start, align);
- if (node_boot_start > bdata->node_boot_start)
- node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
- PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
- }
-
- if (limit && node_boot_start >= limit)
- return NULL;
-
- end_pfn = bdata->node_low_pfn;
- limit = PFN_DOWN(limit);
- if (limit && end_pfn > limit)
- end_pfn = limit;
+ mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
+}
- eidx = end_pfn - PFN_DOWN(node_boot_start);
+/**
+ * free_bootmem - mark a page range as usable
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+void __init free_bootmem(unsigned long addr, unsigned long size)
+{
+ unsigned long start, end;
- /*
- * We try to allocate bootmem pages above 'goal'
- * first, then we try to allocate lower pages.
- */
- preferred = 0;
- if (goal && PFN_DOWN(goal) < end_pfn) {
- if (goal > node_boot_start)
- preferred = goal - node_boot_start;
-
- if (bdata->last_success > node_boot_start &&
- bdata->last_success - node_boot_start >= preferred)
- if (!limit || (limit && limit > bdata->last_success))
- preferred = bdata->last_success - node_boot_start;
- }
+ start = PFN_UP(addr);
+ end = PFN_DOWN(addr + size);
- preferred = PFN_DOWN(ALIGN(preferred, align));
- areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
- incr = align >> PAGE_SHIFT ? : 1;
+ mark_bootmem(start, end, 0, 0);
+}
-restart_scan:
- for (i = preferred; i < eidx;) {
- unsigned long j;
+/**
+ * reserve_bootmem_node - mark a page range as reserved
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must reside completely on the specified node.
+ */
+int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size, int flags)
+{
+ unsigned long start, end;
- i = find_next_zero_bit(node_bootmem_map, eidx, i);
- i = ALIGN(i, incr);
- if (i >= eidx)
- break;
- if (test_bit(i, node_bootmem_map)) {
- i += incr;
- continue;
- }
- for (j = i + 1; j < i + areasize; ++j) {
- if (j >= eidx)
- goto fail_block;
- if (test_bit(j, node_bootmem_map))
- goto fail_block;
- }
- start = i;
- goto found;
- fail_block:
- i = ALIGN(j, incr);
- if (i == j)
- i += incr;
- }
+ start = PFN_DOWN(physaddr);
+ end = PFN_UP(physaddr + size);
- if (preferred > 0) {
- preferred = 0;
- goto restart_scan;
- }
- return NULL;
+ return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
+}
-found:
- bdata->last_success = PFN_PHYS(start) + node_boot_start;
- BUG_ON(start >= eidx);
+#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
+/**
+ * reserve_bootmem - mark a page range as usable
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+ int flags)
+{
+ unsigned long start, end;
- /*
- * Is the next page of the previous allocation-end the start
- * of this allocation's buffer? If yes then we can 'merge'
- * the previous partial page with this allocation.
- */
- if (align < PAGE_SIZE &&
- bdata->last_offset && bdata->last_pos+1 == start) {
- unsigned long offset, remaining_size;
- offset = ALIGN(bdata->last_offset, align);
- BUG_ON(offset > PAGE_SIZE);
- remaining_size = PAGE_SIZE - offset;
- if (size < remaining_size) {
- areasize = 0;
- /* last_pos unchanged */
- bdata->last_offset = offset + size;
- ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
- offset + node_boot_start);
- } else {
- remaining_size = size - remaining_size;
- areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
- ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
- offset + node_boot_start);
- bdata->last_pos = start + areasize - 1;
- bdata->last_offset = remaining_size;
- }
- bdata->last_offset &= ~PAGE_MASK;
- } else {
- bdata->last_pos = start + areasize - 1;
- bdata->last_offset = size & ~PAGE_MASK;
- ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
- }
+ start = PFN_DOWN(addr);
+ end = PFN_UP(addr + size);
- /*
- * Reserve the area now:
- */
- for (i = start; i < start + areasize; i++)
- if (unlikely(test_and_set_bit(i, node_bootmem_map)))
- BUG();
- memset(ret, 0, size);
- return ret;
+ return mark_bootmem(start, end, 1, flags);
}
+#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
-static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
+static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
+ unsigned long step)
{
- struct page *page;
- unsigned long pfn;
- bootmem_data_t *bdata = pgdat->bdata;
- unsigned long i, count, total = 0;
- unsigned long idx;
- unsigned long *map;
- int gofast = 0;
-
- BUG_ON(!bdata->node_bootmem_map);
-
- count = 0;
- /* first extant page of the node */
- pfn = PFN_DOWN(bdata->node_boot_start);
- idx = bdata->node_low_pfn - pfn;
- map = bdata->node_bootmem_map;
- /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
- if (bdata->node_boot_start == 0 ||
- ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
- gofast = 1;
- for (i = 0; i < idx; ) {
- unsigned long v = ~map[i / BITS_PER_LONG];
-
- if (gofast && v == ~0UL) {
- int order;
-
- page = pfn_to_page(pfn);
- count += BITS_PER_LONG;
- order = ffs(BITS_PER_LONG) - 1;
- __free_pages_bootmem(page, order);
- i += BITS_PER_LONG;
- page += BITS_PER_LONG;
- } else if (v) {
- unsigned long m;
-
- page = pfn_to_page(pfn);
- for (m = 1; m && i < idx; m<<=1, page++, i++) {
- if (v & m) {
- count++;
- __free_pages_bootmem(page, 0);
- }
- }
- } else {
- i += BITS_PER_LONG;
- }
- pfn += BITS_PER_LONG;
- }
- total += count;
+ unsigned long base = bdata->node_min_pfn;
/*
- * Now free the allocator bitmap itself, it's not
- * needed anymore:
+ * Align the index with respect to the node start so that the
+ * combination of both satisfies the requested alignment.
*/
- page = virt_to_page(bdata->node_bootmem_map);
- count = 0;
- idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
- for (i = 0; i < idx; i++, page++) {
- __free_pages_bootmem(page, 0);
- count++;
- }
- total += count;
- bdata->node_bootmem_map = NULL;
- return total;
+ return ALIGN(base + idx, step) - base;
}
-unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
- unsigned long startpfn, unsigned long endpfn)
+static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
+ unsigned long align)
{
- return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
-}
+ unsigned long base = PFN_PHYS(bdata->node_min_pfn);
-void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
- unsigned long size, int flags)
-{
- int ret;
+ /* Same as align_idx for byte offsets */
- ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
- if (ret < 0)
- return;
- reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
+ return ALIGN(base + off, align) - base;
}
-void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
- unsigned long size)
+static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
{
- free_bootmem_core(pgdat->bdata, physaddr, size);
-}
+ unsigned long fallback = 0;
+ unsigned long min, max, start, sidx, midx, step;
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
-{
- register_page_bootmem_info_node(pgdat);
- return free_all_bootmem_core(pgdat);
-}
+ BUG_ON(!size);
+ BUG_ON(align & (align - 1));
+ BUG_ON(limit && goal + size > limit);
-unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
-{
- max_low_pfn = pages;
- min_low_pfn = start;
- return init_bootmem_core(NODE_DATA(0), start, 0, pages);
-}
+ if (!bdata->node_bootmem_map)
+ return NULL;
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-int __init reserve_bootmem(unsigned long addr, unsigned long size,
- int flags)
-{
- bootmem_data_t *bdata;
- int ret;
+ bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+ bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ align, goal, limit);
- list_for_each_entry(bdata, &bdata_list, list) {
- ret = can_reserve_bootmem_core(bdata, addr, size, flags);
- if (ret < 0)
- return ret;
+ min = bdata->node_min_pfn;
+ max = bdata->node_low_pfn;
+
+ goal >>= PAGE_SHIFT;
+ limit >>= PAGE_SHIFT;
+
+ if (limit && max > limit)
+ max = limit;
+ if (max <= min)
+ return NULL;
+
+ step = max(align >> PAGE_SHIFT, 1UL);
+
+ if (goal && min < goal && goal < max)
+ start = ALIGN(goal, step);
+ else
+ start = ALIGN(min, step);
+
+ sidx = start - bdata->node_min_pfn;
+ midx = max - bdata->node_min_pfn;
+
+ if (bdata->hint_idx > sidx) {
+ /*
+ * Handle the valid case of sidx being zero and still
+ * catch the fallback below.
+ */
+ fallback = sidx + 1;
+ sidx = align_idx(bdata, bdata->hint_idx, step);
}
- list_for_each_entry(bdata, &bdata_list, list)
- reserve_bootmem_core(bdata, addr, size, flags);
- return 0;
-}
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
+ while (1) {
+ int merge;
+ void *region;
+ unsigned long eidx, i, start_off, end_off;
+find_block:
+ sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
+ sidx = align_idx(bdata, sidx, step);
+ eidx = sidx + PFN_UP(size);
-void __init free_bootmem(unsigned long addr, unsigned long size)
-{
- bootmem_data_t *bdata;
- list_for_each_entry(bdata, &bdata_list, list)
- free_bootmem_core(bdata, addr, size);
-}
+ if (sidx >= midx || eidx > midx)
+ break;
-unsigned long __init free_all_bootmem(void)
-{
- return free_all_bootmem_core(NODE_DATA(0));
+ for (i = sidx; i < eidx; i++)
+ if (test_bit(i, bdata->node_bootmem_map)) {
+ sidx = align_idx(bdata, i, step);
+ if (sidx == i)
+ sidx += step;
+ goto find_block;
+ }
+
+ if (bdata->last_end_off & (PAGE_SIZE - 1) &&
+ PFN_DOWN(bdata->last_end_off) + 1 == sidx)
+ start_off = align_off(bdata, bdata->last_end_off, align);
+ else
+ start_off = PFN_PHYS(sidx);
+
+ merge = PFN_DOWN(start_off) < sidx;
+ end_off = start_off + size;
+
+ bdata->last_end_off = end_off;
+ bdata->hint_idx = PFN_UP(end_off);
+
+ /*
+ * Reserve the area now:
+ */
+ if (__reserve(bdata, PFN_DOWN(start_off) + merge,
+ PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
+ BUG();
+
+ region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
+ start_off);
+ memset(region, 0, size);
+ return region;
+ }
+
+ if (fallback) {
+ sidx = align_idx(bdata, fallback - 1, step);
+ fallback = 0;
+ goto find_block;
+ }
+
+ return NULL;
}
-void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
- unsigned long goal)
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
{
bootmem_data_t *bdata;
- void *ptr;
+restart:
list_for_each_entry(bdata, &bdata_list, list) {
- ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
- if (ptr)
- return ptr;
+ void *region;
+
+ if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
+ continue;
+ if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
+ break;
+
+ region = alloc_bootmem_core(bdata, size, align, goal, limit);
+ if (region)
+ return region;
+ }
+
+ if (goal) {
+ goal = 0;
+ goto restart;
}
+
return NULL;
}
-void * __init __alloc_bootmem(unsigned long size, unsigned long align,
- unsigned long goal)
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+ unsigned long goal)
{
- void *mem = __alloc_bootmem_nopanic(size,align,goal);
+ return ___alloc_bootmem_nopanic(size, align, goal, 0);
+}
+
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
if (mem)
return mem;
@@ -532,78 +593,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return NULL;
}
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ return ___alloc_bootmem(size, align, goal, 0);
+}
-void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
+static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
{
void *ptr;
- ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+ ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
if (ptr)
return ptr;
- return __alloc_bootmem(size, align, goal);
+ return ___alloc_bootmem(size, align, goal, limit);
+}
+
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
}
#ifdef CONFIG_SPARSEMEM
+/**
+ * alloc_bootmem_section - allocate boot memory from a specific section
+ * @size: size of the request in bytes
+ * @section_nr: sparse map section to allocate from
+ *
+ * Return NULL on failure.
+ */
void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
- void *ptr;
- unsigned long limit, goal, start_nr, end_nr, pfn;
- struct pglist_data *pgdat;
+ bootmem_data_t *bdata;
+ unsigned long pfn, goal, limit;
pfn = section_nr_to_pfn(section_nr);
- goal = PFN_PHYS(pfn);
- limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1;
- pgdat = NODE_DATA(early_pfn_to_nid(pfn));
- ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
- limit);
+ goal = pfn << PAGE_SHIFT;
+ limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+ bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
- if (!ptr)
- return NULL;
+ return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+}
+#endif
- start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr)));
- end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size));
- if (start_nr != section_nr || end_nr != section_nr) {
- printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n",
- section_nr);
- free_bootmem_core(pgdat->bdata, __pa(ptr), size);
- ptr = NULL;
- }
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ void *ptr;
- return ptr;
+ ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+ if (ptr)
+ return ptr;
+
+ return __alloc_bootmem_nopanic(size, align, goal);
}
-#endif
#ifndef ARCH_LOW_ADDRESS_LIMIT
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
#endif
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
{
- bootmem_data_t *bdata;
- void *ptr;
-
- list_for_each_entry(bdata, &bdata_list, list) {
- ptr = __alloc_bootmem_core(bdata, size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
- if (ptr)
- return ptr;
- }
-
- /*
- * Whoops, we cannot satisfy the allocation request.
- */
- printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
- panic("Out of low memory");
- return NULL;
+ return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
}
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
+ return ___alloc_bootmem_node(pgdat->bdata, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
}
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019..06722c40305 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
/*
* Data-less bio, nothing to bounce
*/
- if (bio_empty_barrier(*bio_orig))
+ if (!bio_has_data(*bio_orig))
return;
/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6a7d34874..876bc595d0f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
#include <asm/mman.h>
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs);
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
/*
* Remove a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe. The caller must hold the mapping's tree_lock.
*/
void __remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
- mem_cgroup_uncharge_page(page);
+ mem_cgroup_uncharge_cache_page(page);
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
@@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page)
BUG_ON(!PageLocked(page));
- write_lock_irq(&mapping->tree_lock);
+ spin_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
}
static int sync_page(void *word)
@@ -236,11 +233,12 @@ int filemap_fdatawrite(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
/**
* filemap_flush - mostly a non-blocking flush
@@ -444,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
}
/**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
* @mapping: the page's address_space
* @offset: page index
* @gfp_mask: page allocation mode
*
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
+ * This function is used to add a page to the pagecache. It must be locked.
* This function does not add the page to the LRU. The caller must do that.
*/
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- int error = mem_cgroup_cache_charge(page, current->mm,
+ int error;
+
+ VM_BUG_ON(!PageLocked(page));
+
+ error = mem_cgroup_cache_charge(page, current->mm,
gfp_mask & ~__GFP_HIGHMEM);
if (error)
goto out;
error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
- write_lock_irq(&mapping->tree_lock);
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
- if (!error) {
- page_cache_get(page);
- SetPageLocked(page);
- page->mapping = mapping;
- page->index = offset;
+ if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
- } else
- mem_cgroup_uncharge_page(page);
+ } else {
+ page->mapping = NULL;
+ mem_cgroup_uncharge_cache_page(page);
+ page_cache_release(page);
+ }
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
radix_tree_preload_end();
} else
- mem_cgroup_uncharge_page(page);
+ mem_cgroup_uncharge_cache_page(page);
out:
return error;
}
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
@@ -556,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
* The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
+ * test_and_set_bit() to lock the page; the second mb is necessary to enforce
+ * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
+ * races with a parallel wait_on_page_locked()).
*/
void unlock_page(struct page *page)
{
smp_mb__before_clear_bit();
- if (!TestClearPageLocked(page))
+ if (!test_and_clear_bit(PG_locked, &page->flags))
BUG();
smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
@@ -635,15 +637,35 @@ void __lock_page_nosync(struct page *page)
* Is there a pagecache struct page at the given (mapping, offset) tuple?
* If yes, increment its refcount and return it; if no, return NULL.
*/
-struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
{
+ void **pagep;
struct page *page;
- read_lock_irq(&mapping->tree_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page)
- page_cache_get(page);
- read_unlock_irq(&mapping->tree_lock);
+ rcu_read_lock();
+repeat:
+ page = NULL;
+ pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+ if (pagep) {
+ page = radix_tree_deref_slot(pagep);
+ if (unlikely(!page || page == RADIX_TREE_RETRY))
+ goto repeat;
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+ }
+ rcu_read_unlock();
+
return page;
}
EXPORT_SYMBOL(find_get_page);
@@ -658,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
*
* Returns zero if the page was not present. find_lock_page() may sleep.
*/
-struct page *find_lock_page(struct address_space *mapping,
- pgoff_t offset)
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
{
struct page *page;
repeat:
- read_lock_irq(&mapping->tree_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = find_get_page(mapping, offset);
if (page) {
- page_cache_get(page);
- if (TestSetPageLocked(page)) {
- read_unlock_irq(&mapping->tree_lock);
- __lock_page(page);
-
- /* Has the page been truncated while we slept? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- page_cache_release(page);
- goto repeat;
- }
- VM_BUG_ON(page->index != offset);
- goto out;
+ lock_page(page);
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
}
+ VM_BUG_ON(page->index != offset);
}
- read_unlock_irq(&mapping->tree_lock);
-out:
return page;
}
EXPORT_SYMBOL(find_lock_page);
@@ -749,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
{
unsigned int i;
unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+ (void ***)pages, start, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ /*
+ * this can only trigger if nr_found == 1, making livelock
+ * a non issue.
+ */
+ if (unlikely(page == RADIX_TREE_RETRY))
+ goto restart;
- read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup(&mapping->page_tree,
- (void **)pages, start, nr_pages);
- for (i = 0; i < ret; i++)
- page_cache_get(pages[i]);
- read_unlock_irq(&mapping->tree_lock);
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ ret++;
+ }
+ rcu_read_unlock();
return ret;
}
@@ -776,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
{
unsigned int i;
unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+ (void ***)pages, index, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ /*
+ * this can only trigger if nr_found == 1, making livelock
+ * a non issue.
+ */
+ if (unlikely(page == RADIX_TREE_RETRY))
+ goto restart;
- read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup(&mapping->page_tree,
- (void **)pages, index, nr_pages);
- for (i = 0; i < ret; i++) {
- if (pages[i]->mapping == NULL || pages[i]->index != index)
+ if (page->mapping == NULL || page->index != index)
break;
- page_cache_get(pages[i]);
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ ret++;
index++;
}
- read_unlock_irq(&mapping->tree_lock);
- return i;
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);
@@ -808,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
{
unsigned int i;
unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+ (void ***)pages, *index, nr_pages, tag);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ /*
+ * this can only trigger if nr_found == 1, making livelock
+ * a non issue.
+ */
+ if (unlikely(page == RADIX_TREE_RETRY))
+ goto restart;
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ ret++;
+ }
+ rcu_read_unlock();
- read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
- (void **)pages, *index, nr_pages, tag);
- for (i = 0; i < ret; i++)
- page_cache_get(pages[i]);
if (ret)
*index = pages[ret - 1]->index + 1;
- read_unlock_irq(&mapping->tree_lock);
+
return ret;
}
EXPORT_SYMBOL(find_get_pages_tag);
@@ -840,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
struct page *page = find_get_page(mapping, index);
if (page) {
- if (!TestSetPageLocked(page))
+ if (trylock_page(page))
return page;
page_cache_release(page);
return NULL;
@@ -932,8 +1023,17 @@ find_page:
ra, filp, page,
index, last_index - index);
}
- if (!PageUptodate(page))
- goto page_not_up_to_date;
+ if (!PageUptodate(page)) {
+ if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+ !mapping->a_ops->is_partially_uptodate)
+ goto page_not_up_to_date;
+ if (!trylock_page(page))
+ goto page_not_up_to_date;
+ if (!mapping->a_ops->is_partially_uptodate(page,
+ desc, offset))
+ goto page_not_up_to_date_locked;
+ unlock_page(page);
+ }
page_ok:
/*
* i_size must be checked after we know the page is Uptodate.
@@ -1003,6 +1103,7 @@ page_not_up_to_date:
if (lock_page_killable(page))
goto readpage_eio;
+page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
@@ -1199,42 +1300,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
mapping = filp->f_mapping;
inode = mapping->host;
- retval = 0;
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
if (pos < size) {
- retval = generic_file_direct_IO(READ, iocb,
- iov, pos, nr_segs);
+ retval = filemap_write_and_wait(mapping);
+ if (!retval) {
+ retval = mapping->a_ops->direct_IO(READ, iocb,
+ iov, pos, nr_segs);
+ }
if (retval > 0)
*ppos = pos + retval;
- }
- if (likely(retval != 0)) {
- file_accessed(filp);
- goto out;
+ if (retval) {
+ file_accessed(filp);
+ goto out;
+ }
}
}
- retval = 0;
- if (count) {
- for (seg = 0; seg < nr_segs; seg++) {
- read_descriptor_t desc;
+ for (seg = 0; seg < nr_segs; seg++) {
+ read_descriptor_t desc;
- desc.written = 0;
- desc.arg.buf = iov[seg].iov_base;
- desc.count = iov[seg].iov_len;
- if (desc.count == 0)
- continue;
- desc.error = 0;
- do_generic_file_read(filp,ppos,&desc,file_read_actor);
- retval += desc.written;
- if (desc.error) {
- retval = retval ?: desc.error;
- break;
- }
- if (desc.count > 0)
- break;
+ desc.written = 0;
+ desc.arg.buf = iov[seg].iov_base;
+ desc.count = iov[seg].iov_len;
+ if (desc.count == 0)
+ continue;
+ desc.error = 0;
+ do_generic_file_read(filp, ppos, &desc, file_read_actor);
+ retval += desc.written;
+ if (desc.error) {
+ retval = retval ?: desc.error;
+ break;
}
+ if (desc.count > 0)
+ break;
}
out:
return retval;
@@ -1668,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
return notify_change(dentry, &newattrs);
}
-int remove_suid(struct dentry *dentry)
+int file_remove_suid(struct file *file)
{
+ struct dentry *dentry = file->f_path.dentry;
int killsuid = should_remove_suid(dentry);
int killpriv = security_inode_need_killpriv(dentry);
int error = 0;
@@ -1683,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
return error;
}
-EXPORT_SYMBOL(remove_suid);
+EXPORT_SYMBOL(file_remove_suid);
static size_t __iovec_copy_from_user_inatomic(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
@@ -1778,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
* The !iov->iov_len check ensures we skip over unlikely
* zero-length segments (without overruning the iovec).
*/
- while (bytes || unlikely(!iov->iov_len && i->count)) {
+ while (bytes || unlikely(i->count && !iov->iov_len)) {
int copy;
copy = min(bytes, iov->iov_len - base);
@@ -2003,11 +2104,62 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t written;
+ size_t write_len;
+ pgoff_t end;
if (count != ocount)
*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
- written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+ /*
+ * Unmap all mmappings of the file up-front.
+ *
+ * This will cause any pte dirty bits to be propagated into the
+ * pageframes for the subsequent filemap_write_and_wait().
+ */
+ write_len = iov_length(iov, *nr_segs);
+ end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+ if (mapping_mapped(mapping))
+ unmap_mapping_range(mapping, pos, write_len, 0);
+
+ written = filemap_write_and_wait(mapping);
+ if (written)
+ goto out;
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ if (mapping->nrpages) {
+ written = invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
+ /*
+ * If a page can not be invalidated, return 0 to fall back
+ * to buffered write.
+ */
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ goto out;
+ }
+ }
+
+ written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+
+ /*
+ * Finally, try again to invalidate clean pages which might have been
+ * cached by non-direct readahead, or faulted in by get_user_pages()
+ * if the source of the write was an mmap'ed region of the file
+ * we're writing. Either one is a pretty crazy thing to do,
+ * so we don't support it 100%. If this invalidation
+ * fails, tough, the write still worked...
+ */
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
+ }
+
if (written > 0) {
loff_t end = pos + written;
if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2023,6 +2175,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
* i_mutex is held, which protects generic_osync_inode() from
* livelocking. AIO O_DIRECT ops attempt to sync metadata here.
*/
+out:
if ((written >= 0 || written == -EIOCBQUEUED) &&
((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
@@ -2394,7 +2547,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
- err = remove_suid(file->f_path.dentry);
+ err = file_remove_suid(file);
if (err)
goto out;
@@ -2510,66 +2663,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
}
EXPORT_SYMBOL(generic_file_aio_write);
-/*
- * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
- * went wrong during pagecache shootdown.
- */
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- ssize_t retval;
- size_t write_len;
- pgoff_t end = 0; /* silence gcc */
-
- /*
- * If it's a write, unmap all mmappings of the file up-front. This
- * will cause any pte dirty bits to be propagated into the pageframes
- * for the subsequent filemap_write_and_wait().
- */
- if (rw == WRITE) {
- write_len = iov_length(iov, nr_segs);
- end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
- if (mapping_mapped(mapping))
- unmap_mapping_range(mapping, offset, write_len, 0);
- }
-
- retval = filemap_write_and_wait(mapping);
- if (retval)
- goto out;
-
- /*
- * After a write we want buffered reads to be sure to go to disk to get
- * the new data. We invalidate clean cached page from the region we're
- * about to write. We do this *before* the write so that we can return
- * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
- */
- if (rw == WRITE && mapping->nrpages) {
- retval = invalidate_inode_pages2_range(mapping,
- offset >> PAGE_CACHE_SHIFT, end);
- if (retval)
- goto out;
- }
-
- retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
-
- /*
- * Finally, try again to invalidate clean pages which might have been
- * cached by non-direct readahead, or faulted in by get_user_pages()
- * if the source of the write was an mmap'ed region of the file
- * we're writing. Either one is a pretty crazy thing to do,
- * so we don't support it 100%. If this invalidation
- * fails, tough, the write still worked...
- */
- if (rw == WRITE && mapping->nrpages) {
- invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
- }
-out:
- return retval;
-}
-
/**
* try_to_release_page() - release old fs-specific metadata on a page
*
@@ -2581,9 +2674,8 @@ out:
* Otherwise return zero.
*
* The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
*
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9..b5167dfb2f2 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,7 +13,10 @@
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
@@ -21,22 +24,18 @@
* We do use our own empty page to avoid interference with other users
* of ZERO_PAGE(), such as /dev/zero
*/
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
static struct page *__xip_sparse_page;
+/* called under xip_sparse_mutex */
static struct page *xip_sparse_page(void)
{
if (!__xip_sparse_page) {
struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
- if (page) {
- static DEFINE_SPINLOCK(xip_alloc_lock);
- spin_lock(&xip_alloc_lock);
- if (!__xip_sparse_page)
- __xip_sparse_page = page;
- else
- __free_page(page);
- spin_unlock(&xip_alloc_lock);
- }
+ if (page)
+ __xip_sparse_page = page;
}
return __xip_sparse_page;
}
@@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping,
pte_t pteval;
spinlock_t *ptl;
struct page *page;
+ unsigned count;
+ int locked = 0;
+
+ count = read_seqcount_begin(&xip_sparse_seq);
page = __xip_sparse_page;
if (!page)
return;
+retry:
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 1);
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
+ pteval = ptep_clear_flush_notify(vma, address, pte);
page_remove_rmap(page, vma);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
@@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
}
}
spin_unlock(&mapping->i_mmap_lock);
+
+ if (locked) {
+ mutex_unlock(&xip_sparse_mutex);
+ } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+ mutex_lock(&xip_sparse_mutex);
+ locked = 1;
+ goto retry;
+ }
}
/*
@@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int error;
/* XXX: are VM_FAULT_ codes OK? */
-
+again:
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;
@@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int err;
/* maybe shared writable, allocate new block */
+ mutex_lock(&xip_sparse_mutex);
error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
&xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
if (error)
return VM_FAULT_SIGBUS;
/* unmap sparse mappings at pgoff from all other vmas */
@@ -251,14 +265,34 @@ found:
BUG_ON(err);
return VM_FAULT_NOPAGE;
} else {
+ int err, ret = VM_FAULT_OOM;
+
+ mutex_lock(&xip_sparse_mutex);
+ write_seqcount_begin(&xip_sparse_seq);
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(!error)) {
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+ goto again;
+ }
+ if (error != -ENODATA)
+ goto out;
/* not shared and writable, use xip_sparse_page() */
page = xip_sparse_page();
if (!page)
- return VM_FAULT_OOM;
+ goto out;
+ err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+ page);
+ if (err == -ENOMEM)
+ goto out;
- page_cache_get(page);
- vmf->page = page;
- return 0;
+ ret = VM_FAULT_NOPAGE;
+out:
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+
+ return ret;
}
}
@@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf,
&xip_mem, &xip_pfn);
if (status == -ENODATA) {
/* we allocate a new page unmap it */
+ mutex_lock(&xip_sparse_mutex);
status = a_ops->get_xip_mem(mapping, index, 1,
&xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
if (!status)
/* unmap page at pgoff from all other vmas */
__xip_unmap(mapping, index);
@@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
if (count == 0)
goto out_backing;
- ret = remove_suid(filp->f_path.dentry);
+ ret = file_remove_suid(filp);
if (ret)
goto out_backing;
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a..7881638e4a1 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
spin_unlock(&mapping->i_mmap_lock);
}
+ mmu_notifier_invalidate_range_start(mm, start, start + size);
err = populate_range(mm, vma, start, size, pgoff);
+ mmu_notifier_invalidate_range_end(mm, start, start + size);
if (!err && !(flags & MAP_NONBLOCK)) {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
diff --git a/mm/highmem.c b/mm/highmem.c
index 7da4a7b6af1..b36b83b920f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -40,6 +40,7 @@
#ifdef CONFIG_HIGHMEM
unsigned long totalhigh_pages __read_mostly;
+EXPORT_SYMBOL(totalhigh_pages);
unsigned int nr_free_highpages (void)
{
@@ -69,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
static void flush_all_zero_pkmaps(void)
{
int i;
+ int need_flush = 0;
flush_cache_kmaps();
@@ -100,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
&pkmap_page_table[i]);
set_page_address(page, NULL);
+ need_flush = 1;
}
- flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+ if (need_flush)
+ flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
/**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef2..67a71191136 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,43 +9,357 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
+#include <linux/bootmem.h>
+#include <linux/sysfs.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/io.h>
#include <linux/hugetlb.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
-static unsigned long surplus_huge_pages;
-static unsigned long nr_overcommit_huge_pages;
-unsigned long max_huge_pages;
-unsigned long sysctl_overcommit_huge_pages;
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static unsigned int nr_huge_pages_node[MAX_NUMNODES];
-static unsigned int free_huge_pages_node[MAX_NUMNODES];
-static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-static int hugetlb_next_nid;
+
+static int max_hstate;
+unsigned int default_hstate_idx;
+struct hstate hstates[HUGE_MAX_HSTATE];
+
+__initdata LIST_HEAD(huge_boot_pages);
+
+/* for command line parsing */
+static struct hstate * __initdata parsed_hstate;
+static unsigned long __initdata default_hstate_max_huge_pages;
+static unsigned long __initdata default_hstate_size;
+
+#define for_each_hstate(h) \
+ for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
static DEFINE_SPINLOCK(hugetlb_lock);
-static void clear_huge_page(struct page *page, unsigned long addr)
+/*
+ * Region tracking -- allows tracking of reservations and instantiated pages
+ * across the pages in a mapping.
+ *
+ * The region data structures are protected by a combination of the mmap_sem
+ * and the hugetlb_instantion_mutex. To access or modify a region the caller
+ * must either hold the mmap_sem for write, or the mmap_sem for read and
+ * the hugetlb_instantiation mutex:
+ *
+ * down_write(&mm->mmap_sem);
+ * or
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
+ */
+struct file_region {
+ struct list_head link;
+ long from;
+ long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+ struct file_region *rg, *nrg, *trg;
+
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+
+ /* Check for and consume any regions we now overlap with. */
+ nrg = rg;
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ break;
+
+ /* If this area reaches higher then extend our area to
+ * include it completely. If this is not the first area
+ * which we intend to reuse, free it. */
+ if (rg->to > t)
+ t = rg->to;
+ if (rg != nrg) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ }
+ nrg->from = f;
+ nrg->to = t;
+ return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+ struct file_region *rg, *nrg;
+ long chg = 0;
+
+ /* Locate the region we are before or in. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* If we are below the current region then a new region is required.
+ * Subtle, allocate a new region at the position but make it zero
+ * size such that we can guarantee to record the reservation. */
+ if (&rg->link == head || t < rg->from) {
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ nrg->from = f;
+ nrg->to = f;
+ INIT_LIST_HEAD(&nrg->link);
+ list_add(&nrg->link, rg->link.prev);
+
+ return t - f;
+ }
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+ chg = t - f;
+
+ /* Check for and consume any regions we now overlap with. */
+ list_for_each_entry(rg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ return chg;
+
+ /* We overlap with this area, if it extends futher than
+ * us then we must extend ourselves. Account for its
+ * existing reservation. */
+ if (rg->to > t) {
+ chg += rg->to - t;
+ t = rg->to;
+ }
+ chg -= rg->to - rg->from;
+ }
+ return chg;
+}
+
+static long region_truncate(struct list_head *head, long end)
+{
+ struct file_region *rg, *trg;
+ long chg = 0;
+
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (end <= rg->to)
+ break;
+ if (&rg->link == head)
+ return 0;
+
+ /* If we are in the middle of a region then adjust it. */
+ if (end > rg->from) {
+ chg = rg->to - end;
+ rg->to = end;
+ rg = list_entry(rg->link.next, typeof(*rg), link);
+ }
+
+ /* Drop any remaining regions. */
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ chg += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ return chg;
+}
+
+static long region_count(struct list_head *head, long f, long t)
+{
+ struct file_region *rg;
+ long chg = 0;
+
+ /* Locate each segment we overlap with, and count that overlap. */
+ list_for_each_entry(rg, head, link) {
+ int seg_from;
+ int seg_to;
+
+ if (rg->to <= f)
+ continue;
+ if (rg->from >= t)
+ break;
+
+ seg_from = max(rg->from, f);
+ seg_to = min(rg->to, t);
+
+ chg += seg_to - seg_from;
+ }
+
+ return chg;
+}
+
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in pagecache page units; huge pages here.
+ */
+static pgoff_t vma_hugecache_offset(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return ((address - vma->vm_start) >> huge_page_shift(h)) +
+ (vma->vm_pgoff >> huge_page_order(h));
+}
+
+/*
+ * Flags for MAP_PRIVATE reservations. These are stored in the bottom
+ * bits of the reservation map pointer, which are always clear due to
+ * alignment.
+ */
+#define HPAGE_RESV_OWNER (1UL << 0)
+#define HPAGE_RESV_UNMAPPED (1UL << 1)
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
+
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ *
+ * The private mapping reservation is represented in a subtly different
+ * manner to a shared mapping. A shared mapping has a region map associated
+ * with the underlying file, this region map represents the backing file
+ * pages which have ever had a reservation assigned which this persists even
+ * after the page is instantiated. A private mapping has a region map
+ * associated with the original mmap which is attached to all VMAs which
+ * reference it, this region map represents those offsets which have consumed
+ * reservation ie. where pages have been instantiated.
+ */
+static unsigned long get_vma_private_data(struct vm_area_struct *vma)
+{
+ return (unsigned long)vma->vm_private_data;
+}
+
+static void set_vma_private_data(struct vm_area_struct *vma,
+ unsigned long value)
+{
+ vma->vm_private_data = (void *)value;
+}
+
+struct resv_map {
+ struct kref refs;
+ struct list_head regions;
+};
+
+struct resv_map *resv_map_alloc(void)
+{
+ struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
+ if (!resv_map)
+ return NULL;
+
+ kref_init(&resv_map->refs);
+ INIT_LIST_HEAD(&resv_map->regions);
+
+ return resv_map;
+}
+
+void resv_map_release(struct kref *ref)
+{
+ struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+
+ /* Clear out any active regions before we release the map. */
+ region_truncate(&resv_map->regions, 0);
+ kfree(resv_map);
+}
+
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ if (!(vma->vm_flags & VM_SHARED))
+ return (struct resv_map *)(get_vma_private_data(vma) &
+ ~HPAGE_RESV_MASK);
+ return 0;
+}
+
+static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+ set_vma_private_data(vma, (get_vma_private_data(vma) &
+ HPAGE_RESV_MASK) | (unsigned long)map);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+ set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+
+ return (get_vma_private_data(vma) & flag) != 0;
+}
+
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct hstate *h,
+ struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_NORESERVE)
+ return;
+
+ if (vma->vm_flags & VM_SHARED) {
+ /* Shared mappings always use reserves */
+ h->resv_huge_pages--;
+ } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ /*
+ * Only the process that called mmap() has reserves for
+ * private mappings.
+ */
+ h->resv_huge_pages--;
+ }
+}
+
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ if (!(vma->vm_flags & VM_SHARED))
+ vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static int vma_has_reserves(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHARED)
+ return 1;
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ return 1;
+ return 0;
+}
+
+static void clear_huge_page(struct page *page,
+ unsigned long addr, unsigned long sz)
{
int i;
might_sleep();
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+ for (i = 0; i < sz/PAGE_SIZE; i++) {
cond_resched();
clear_user_highpage(page + i, addr + i * PAGE_SIZE);
}
@@ -55,42 +369,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
+ struct hstate *h = hstate_vma(vma);
might_sleep();
- for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+ for (i = 0; i < pages_per_huge_page(h); i++) {
cond_resched();
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
-static void enqueue_huge_page(struct page *page)
+static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
- list_add(&page->lru, &hugepage_freelists[nid]);
- free_huge_pages++;
- free_huge_pages_node[nid]++;
+ list_add(&page->lru, &h->hugepage_freelists[nid]);
+ h->free_huge_pages++;
+ h->free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct hstate *h)
{
int nid;
struct page *page = NULL;
for (nid = 0; nid < MAX_NUMNODES; ++nid) {
- if (!list_empty(&hugepage_freelists[nid])) {
- page = list_entry(hugepage_freelists[nid].next,
+ if (!list_empty(&h->hugepage_freelists[nid])) {
+ page = list_entry(h->hugepage_freelists[nid].next,
struct page, lru);
list_del(&page->lru);
- free_huge_pages--;
- free_huge_pages_node[nid]--;
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
break;
}
}
return page;
}
-static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
- unsigned long address)
+static struct page *dequeue_huge_page_vma(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long address, int avoid_reserve)
{
int nid;
struct page *page = NULL;
@@ -101,18 +417,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
struct zone *zone;
struct zoneref *z;
+ /*
+ * A child process with MAP_PRIVATE mappings created by their parent
+ * have no page reserves. This check ensures that reservations are
+ * not "stolen". The child may still get SIGKILLed
+ */
+ if (!vma_has_reserves(vma) &&
+ h->free_huge_pages - h->resv_huge_pages == 0)
+ return NULL;
+
+ /* If reserves cannot be used, ensure enough pages are in the pool */
+ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ return NULL;
+
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
nid = zone_to_nid(zone);
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
- !list_empty(&hugepage_freelists[nid])) {
- page = list_entry(hugepage_freelists[nid].next,
+ !list_empty(&h->hugepage_freelists[nid])) {
+ page = list_entry(h->hugepage_freelists[nid].next,
struct page, lru);
list_del(&page->lru);
- free_huge_pages--;
- free_huge_pages_node[nid]--;
- if (vma && vma->vm_flags & VM_MAYSHARE)
- resv_huge_pages--;
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+
+ if (!avoid_reserve)
+ decrement_hugepage_resv_vma(h, vma);
+
break;
}
}
@@ -120,12 +451,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
return page;
}
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- nr_huge_pages--;
- nr_huge_pages_node[page_to_nid(page)]--;
- for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+
+ h->nr_huge_pages--;
+ h->nr_huge_pages_node[page_to_nid(page)]--;
+ for (i = 0; i < pages_per_huge_page(h); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1<< PG_writeback);
@@ -133,11 +465,27 @@ static void update_and_free_page(struct page *page)
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
arch_release_hugepage(page);
- __free_pages(page, HUGETLB_PAGE_ORDER);
+ __free_pages(page, huge_page_order(h));
+}
+
+struct hstate *size_to_hstate(unsigned long size)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ if (huge_page_size(h) == size)
+ return h;
+ }
+ return NULL;
}
static void free_huge_page(struct page *page)
{
+ /*
+ * Can't pass hstate in here because it is called from the
+ * compound page destructor.
+ */
+ struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
struct address_space *mapping;
@@ -147,12 +495,12 @@ static void free_huge_page(struct page *page)
INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
- if (surplus_huge_pages_node[nid]) {
- update_and_free_page(page);
- surplus_huge_pages--;
- surplus_huge_pages_node[nid]--;
+ if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ update_and_free_page(h, page);
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
} else {
- enqueue_huge_page(page);
+ enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
if (mapping)
@@ -164,7 +512,7 @@ static void free_huge_page(struct page *page)
* balanced by operating on them in a round-robin fashion.
* Returns 1 if an adjustment was made.
*/
-static int adjust_pool_surplus(int delta)
+static int adjust_pool_surplus(struct hstate *h, int delta)
{
static int prev_nid;
int nid = prev_nid;
@@ -177,15 +525,15 @@ static int adjust_pool_surplus(int delta)
nid = first_node(node_online_map);
/* To shrink on this node, there must be a surplus page */
- if (delta < 0 && !surplus_huge_pages_node[nid])
+ if (delta < 0 && !h->surplus_huge_pages_node[nid])
continue;
/* Surplus cannot exceed the total number of pages */
- if (delta > 0 && surplus_huge_pages_node[nid] >=
- nr_huge_pages_node[nid])
+ if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+ h->nr_huge_pages_node[nid])
continue;
- surplus_huge_pages += delta;
- surplus_huge_pages_node[nid] += delta;
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[nid] += delta;
ret = 1;
break;
} while (nid != prev_nid);
@@ -194,59 +542,74 @@ static int adjust_pool_surplus(int delta)
return ret;
}
-static struct page *alloc_fresh_huge_page_node(int nid)
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+ set_compound_page_dtor(page, free_huge_page);
+ spin_lock(&hugetlb_lock);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+ spin_unlock(&hugetlb_lock);
+ put_page(page); /* free it into the hugepage allocator */
+}
+
+static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
+ if (h->order >= MAX_ORDER)
+ return NULL;
+
page = alloc_pages_node(nid,
htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
- HUGETLB_PAGE_ORDER);
+ huge_page_order(h));
if (page) {
if (arch_prepare_hugepage(page)) {
- __free_pages(page, HUGETLB_PAGE_ORDER);
+ __free_pages(page, huge_page_order(h));
return NULL;
}
- set_compound_page_dtor(page, free_huge_page);
- spin_lock(&hugetlb_lock);
- nr_huge_pages++;
- nr_huge_pages_node[nid]++;
- spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
+ prep_new_huge_page(h, page, nid);
}
return page;
}
-static int alloc_fresh_huge_page(void)
+/*
+ * Use a helper variable to find the next node and then
+ * copy it back to hugetlb_next_nid afterwards:
+ * otherwise there's a window in which a racer might
+ * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+ * But we don't need to use a spin_lock here: it really
+ * doesn't matter if occasionally a racer chooses the
+ * same nid as we do. Move nid forward in the mask even
+ * if we just successfully allocated a hugepage so that
+ * the next caller gets hugepages on the next node.
+ */
+static int hstate_next_node(struct hstate *h)
+{
+ int next_nid;
+ next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+ if (next_nid == MAX_NUMNODES)
+ next_nid = first_node(node_online_map);
+ h->hugetlb_next_nid = next_nid;
+ return next_nid;
+}
+
+static int alloc_fresh_huge_page(struct hstate *h)
{
struct page *page;
int start_nid;
int next_nid;
int ret = 0;
- start_nid = hugetlb_next_nid;
+ start_nid = h->hugetlb_next_nid;
do {
- page = alloc_fresh_huge_page_node(hugetlb_next_nid);
+ page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
if (page)
ret = 1;
- /*
- * Use a helper variable to find the next node and then
- * copy it back to hugetlb_next_nid afterwards:
- * otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_node.
- * But we don't need to use a spin_lock here: it really
- * doesn't matter if occasionally a racer chooses the
- * same nid as we do. Move nid forward in the mask even
- * if we just successfully allocated a hugepage so that
- * the next caller gets hugepages on the next node.
- */
- next_nid = next_node(hugetlb_next_nid, node_online_map);
- if (next_nid == MAX_NUMNODES)
- next_nid = first_node(node_online_map);
- hugetlb_next_nid = next_nid;
- } while (!page && hugetlb_next_nid != start_nid);
+ next_nid = hstate_next_node(h);
+ } while (!page && h->hugetlb_next_nid != start_nid);
if (ret)
count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -256,12 +619,15 @@ static int alloc_fresh_huge_page(void)
return ret;
}
-static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
- unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
{
struct page *page;
unsigned int nid;
+ if (h->order >= MAX_ORDER)
+ return NULL;
+
/*
* Assume we will successfully allocate the surplus page to
* prevent racing processes from causing the surplus to exceed
@@ -286,18 +652,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
* per-node value is checked there.
*/
spin_lock(&hugetlb_lock);
- if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
spin_unlock(&hugetlb_lock);
return NULL;
} else {
- nr_huge_pages++;
- surplus_huge_pages++;
+ h->nr_huge_pages++;
+ h->surplus_huge_pages++;
}
spin_unlock(&hugetlb_lock);
page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
__GFP_REPEAT|__GFP_NOWARN,
- HUGETLB_PAGE_ORDER);
+ huge_page_order(h));
+
+ if (page && arch_prepare_hugepage(page)) {
+ __free_pages(page, huge_page_order(h));
+ return NULL;
+ }
spin_lock(&hugetlb_lock);
if (page) {
@@ -312,12 +683,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
/*
* We incremented the global counters already
*/
- nr_huge_pages_node[nid]++;
- surplus_huge_pages_node[nid]++;
+ h->nr_huge_pages_node[nid]++;
+ h->surplus_huge_pages_node[nid]++;
__count_vm_event(HTLB_BUDDY_PGALLOC);
} else {
- nr_huge_pages--;
- surplus_huge_pages--;
+ h->nr_huge_pages--;
+ h->surplus_huge_pages--;
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
}
spin_unlock(&hugetlb_lock);
@@ -329,16 +700,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
* Increase the hugetlb pool such that it can accomodate a reservation
* of size 'delta'.
*/
-static int gather_surplus_pages(int delta)
+static int gather_surplus_pages(struct hstate *h, int delta)
{
struct list_head surplus_list;
struct page *page, *tmp;
int ret, i;
int needed, allocated;
- needed = (resv_huge_pages + delta) - free_huge_pages;
+ needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
- resv_huge_pages += delta;
+ h->resv_huge_pages += delta;
return 0;
}
@@ -349,7 +720,7 @@ static int gather_surplus_pages(int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(NULL, 0);
+ page = alloc_buddy_huge_page(h, NULL, 0);
if (!page) {
/*
* We were not able to allocate enough pages to
@@ -370,7 +741,8 @@ retry:
* because either resv_huge_pages or free_huge_pages may have changed.
*/
spin_lock(&hugetlb_lock);
- needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
+ needed = (h->resv_huge_pages + delta) -
+ (h->free_huge_pages + allocated);
if (needed > 0)
goto retry;
@@ -383,7 +755,7 @@ retry:
* before they are reserved.
*/
needed += allocated;
- resv_huge_pages += delta;
+ h->resv_huge_pages += delta;
ret = 0;
free:
/* Free the needed pages to the hugetlb pool */
@@ -391,7 +763,7 @@ free:
if ((--needed) < 0)
break;
list_del(&page->lru);
- enqueue_huge_page(page);
+ enqueue_huge_page(h, page);
}
/* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +791,8 @@ free:
* allocated to satisfy the reservation must be explicitly freed if they were
* never used.
*/
-static void return_unused_surplus_pages(unsigned long unused_resv_pages)
+static void return_unused_surplus_pages(struct hstate *h,
+ unsigned long unused_resv_pages)
{
static int nid = -1;
struct page *page;
@@ -434,157 +807,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
unsigned long remaining_iterations = num_online_nodes();
/* Uncommit the reservation */
- resv_huge_pages -= unused_resv_pages;
+ h->resv_huge_pages -= unused_resv_pages;
- nr_pages = min(unused_resv_pages, surplus_huge_pages);
+ /* Cannot return gigantic pages currently */
+ if (h->order >= MAX_ORDER)
+ return;
+
+ nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
while (remaining_iterations-- && nr_pages) {
nid = next_node(nid, node_online_map);
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
- if (!surplus_huge_pages_node[nid])
+ if (!h->surplus_huge_pages_node[nid])
continue;
- if (!list_empty(&hugepage_freelists[nid])) {
- page = list_entry(hugepage_freelists[nid].next,
+ if (!list_empty(&h->hugepage_freelists[nid])) {
+ page = list_entry(h->hugepage_freelists[nid].next,
struct page, lru);
list_del(&page->lru);
- update_and_free_page(page);
- free_huge_pages--;
- free_huge_pages_node[nid]--;
- surplus_huge_pages--;
- surplus_huge_pages_node[nid]--;
+ update_and_free_page(h, page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
nr_pages--;
remaining_iterations = num_online_nodes();
}
}
}
+/*
+ * Determine if the huge page at addr within the vma has an associated
+ * reservation. Where it does not we will need to logically increase
+ * reservation and actually increase quota before an allocation can occur.
+ * Where any new reservation would be required the reservation change is
+ * prepared, but not committed. Once the page has been quota'd allocated
+ * an instantiated the change should be committed via vma_commit_reservation.
+ * No action is required on failure.
+ */
+static int vma_needs_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ if (vma->vm_flags & VM_SHARED) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ return region_chg(&inode->i_mapping->private_list,
+ idx, idx + 1);
+
+ } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ return 1;
+
+ } else {
+ int err;
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
-static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
- unsigned long addr)
+ err = region_chg(&reservations->regions, idx, idx + 1);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+}
+static void vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
- spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(vma, addr);
- spin_unlock(&hugetlb_lock);
- return page ? page : ERR_PTR(-VM_FAULT_OOM);
+ if (vma->vm_flags & VM_SHARED) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ region_add(&inode->i_mapping->private_list, idx, idx + 1);
+
+ } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ /* Mark this page used in the map. */
+ region_add(&reservations->regions, idx, idx + 1);
+ }
}
-static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
- unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
{
- struct page *page = NULL;
+ struct hstate *h = hstate_vma(vma);
+ struct page *page;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned int chg;
- if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ /*
+ * Processes that did not create the mapping will have no reserves and
+ * will not have accounted against quota. Check that the quota can be
+ * made before satisfying the allocation
+ * MAP_NORESERVE mappings may also need pages and quota allocated
+ * if no reserve mapping overlaps.
+ */
+ chg = vma_needs_reservation(h, vma, addr);
+ if (chg < 0)
+ return ERR_PTR(chg);
+ if (chg)
+ if (hugetlb_get_quota(inode->i_mapping, chg))
+ return ERR_PTR(-ENOSPC);
spin_lock(&hugetlb_lock);
- if (free_huge_pages > resv_huge_pages)
- page = dequeue_huge_page_vma(vma, addr);
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
spin_unlock(&hugetlb_lock);
+
if (!page) {
- page = alloc_buddy_huge_page(vma, addr);
+ page = alloc_buddy_huge_page(h, vma, addr);
if (!page) {
- hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+ hugetlb_put_quota(inode->i_mapping, chg);
return ERR_PTR(-VM_FAULT_OOM);
}
}
+
+ set_page_refcounted(page);
+ set_page_private(page, (unsigned long) mapping);
+
+ vma_commit_reservation(h, vma, addr);
+
return page;
}
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
- unsigned long addr)
+__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
{
- struct page *page;
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct huge_bootmem_page *m;
+ int nr_nodes = nodes_weight(node_online_map);
- if (vma->vm_flags & VM_MAYSHARE)
- page = alloc_huge_page_shared(vma, addr);
- else
- page = alloc_huge_page_private(vma, addr);
+ while (nr_nodes) {
+ void *addr;
+
+ addr = __alloc_bootmem_node_nopanic(
+ NODE_DATA(h->hugetlb_next_nid),
+ huge_page_size(h), huge_page_size(h), 0);
- if (!IS_ERR(page)) {
- set_page_refcounted(page);
- set_page_private(page, (unsigned long) mapping);
+ if (addr) {
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ */
+ m = addr;
+ if (m)
+ goto found;
+ }
+ hstate_next_node(h);
+ nr_nodes--;
}
- return page;
+ return 0;
+
+found:
+ BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+ /* Put them into a private list first because mem_map is not up yet */
+ list_add(&m->list, &huge_boot_pages);
+ m->hstate = h;
+ return 1;
}
-static int __init hugetlb_init(void)
+/* Put bootmem huge pages into the standard lists after mem_map is up */
+static void __init gather_bootmem_prealloc(void)
{
- unsigned long i;
-
- if (HPAGE_SHIFT == 0)
- return 0;
-
- for (i = 0; i < MAX_NUMNODES; ++i)
- INIT_LIST_HEAD(&hugepage_freelists[i]);
+ struct huge_bootmem_page *m;
+
+ list_for_each_entry(m, &huge_boot_pages, list) {
+ struct page *page = virt_to_page(m);
+ struct hstate *h = m->hstate;
+ __ClearPageReserved(page);
+ WARN_ON(page_count(page) != 1);
+ prep_compound_page(page, h->order);
+ prep_new_huge_page(h, page, page_to_nid(page));
+ }
+}
- hugetlb_next_nid = first_node(node_online_map);
+static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+{
+ unsigned long i;
- for (i = 0; i < max_huge_pages; ++i) {
- if (!alloc_fresh_huge_page())
+ for (i = 0; i < h->max_huge_pages; ++i) {
+ if (h->order >= MAX_ORDER) {
+ if (!alloc_bootmem_huge_page(h))
+ break;
+ } else if (!alloc_fresh_huge_page(h))
break;
}
- max_huge_pages = free_huge_pages = nr_huge_pages = i;
- printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
- return 0;
+ h->max_huge_pages = i;
}
-module_init(hugetlb_init);
-static int __init hugetlb_setup(char *s)
+static void __init hugetlb_init_hstates(void)
{
- if (sscanf(s, "%lu", &max_huge_pages) <= 0)
- max_huge_pages = 0;
- return 1;
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ /* oversize hugepages were init'ed in early boot */
+ if (h->order < MAX_ORDER)
+ hugetlb_hstate_alloc_pages(h);
+ }
}
-__setup("hugepages=", hugetlb_setup);
-static unsigned int cpuset_mems_nr(unsigned int *array)
+static char * __init memfmt(char *buf, unsigned long n)
{
- int node;
- unsigned int nr = 0;
-
- for_each_node_mask(node, cpuset_current_mems_allowed)
- nr += array[node];
+ if (n >= (1UL << 30))
+ sprintf(buf, "%lu GB", n >> 30);
+ else if (n >= (1UL << 20))
+ sprintf(buf, "%lu MB", n >> 20);
+ else
+ sprintf(buf, "%lu KB", n >> 10);
+ return buf;
+}
- return nr;
+static void __init report_hugepages(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ char buf[32];
+ printk(KERN_INFO "HugeTLB registered %s page size, "
+ "pre-allocated %ld pages\n",
+ memfmt(buf, huge_page_size(h)),
+ h->free_huge_pages);
+ }
}
-#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
-static void try_to_free_low(unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count)
{
int i;
+ if (h->order >= MAX_ORDER)
+ return;
+
for (i = 0; i < MAX_NUMNODES; ++i) {
struct page *page, *next;
- list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
- if (count >= nr_huge_pages)
+ struct list_head *freel = &h->hugepage_freelists[i];
+ list_for_each_entry_safe(page, next, freel, lru) {
+ if (count >= h->nr_huge_pages)
return;
if (PageHighMem(page))
continue;
list_del(&page->lru);
- update_and_free_page(page);
- free_huge_pages--;
- free_huge_pages_node[page_to_nid(page)]--;
+ update_and_free_page(h, page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[page_to_nid(page)]--;
}
}
}
#else
-static inline void try_to_free_low(unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count)
{
}
#endif
-#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
-static unsigned long set_max_huge_pages(unsigned long count)
+#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
{
unsigned long min_count, ret;
+ if (h->order >= MAX_ORDER)
+ return h->max_huge_pages;
+
/*
* Increase the pool size
* First take pages out of surplus state. Then make up the
@@ -597,20 +1082,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
* within all the constraints specified by the sysctls.
*/
spin_lock(&hugetlb_lock);
- while (surplus_huge_pages && count > persistent_huge_pages) {
- if (!adjust_pool_surplus(-1))
+ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, -1))
break;
}
- while (count > persistent_huge_pages) {
- int ret;
+ while (count > persistent_huge_pages(h)) {
/*
* If this allocation races such that we no longer need the
* page, free_huge_page will handle it by freeing the page
* and reducing the surplus.
*/
spin_unlock(&hugetlb_lock);
- ret = alloc_fresh_huge_page();
+ ret = alloc_fresh_huge_page(h);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -632,31 +1116,305 @@ static unsigned long set_max_huge_pages(unsigned long count)
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
*/
- min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
+ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
min_count = max(count, min_count);
- try_to_free_low(min_count);
- while (min_count < persistent_huge_pages) {
- struct page *page = dequeue_huge_page();
+ try_to_free_low(h, min_count);
+ while (min_count < persistent_huge_pages(h)) {
+ struct page *page = dequeue_huge_page(h);
if (!page)
break;
- update_and_free_page(page);
+ update_and_free_page(h, page);
}
- while (count < persistent_huge_pages) {
- if (!adjust_pool_surplus(1))
+ while (count < persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, 1))
break;
}
out:
- ret = persistent_huge_pages;
+ ret = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
return ret;
}
+#define HSTATE_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define HSTATE_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static struct kobject *hugepages_kobj;
+static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+
+static struct hstate *kobj_to_hstate(struct kobject *kobj)
+{
+ int i;
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (hstate_kobjs[i] == kobj)
+ return &hstates[i];
+ BUG();
+ return NULL;
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj);
+ return sprintf(buf, "%lu\n", h->nr_huge_pages);
+}
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long input;
+ struct hstate *h = kobj_to_hstate(kobj);
+
+ err = strict_strtoul(buf, 10, &input);
+ if (err)
+ return 0;
+
+ h->max_huge_pages = set_max_huge_pages(h, input);
+
+ return count;
+}
+HSTATE_ATTR(nr_hugepages);
+
+static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj);
+ return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long input;
+ struct hstate *h = kobj_to_hstate(kobj);
+
+ err = strict_strtoul(buf, 10, &input);
+ if (err)
+ return 0;
+
+ spin_lock(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = input;
+ spin_unlock(&hugetlb_lock);
+
+ return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+static ssize_t free_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj);
+ return sprintf(buf, "%lu\n", h->free_huge_pages);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+static ssize_t resv_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj);
+ return sprintf(buf, "%lu\n", h->resv_huge_pages);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+static ssize_t surplus_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj);
+ return sprintf(buf, "%lu\n", h->surplus_huge_pages);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+static struct attribute *hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &nr_overcommit_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &resv_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+ NULL,
+};
+
+static struct attribute_group hstate_attr_group = {
+ .attrs = hstate_attrs,
+};
+
+static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+{
+ int retval;
+
+ hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
+ hugepages_kobj);
+ if (!hstate_kobjs[h - hstates])
+ return -ENOMEM;
+
+ retval = sysfs_create_group(hstate_kobjs[h - hstates],
+ &hstate_attr_group);
+ if (retval)
+ kobject_put(hstate_kobjs[h - hstates]);
+
+ return retval;
+}
+
+static void __init hugetlb_sysfs_init(void)
+{
+ struct hstate *h;
+ int err;
+
+ hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+ if (!hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h);
+ if (err)
+ printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
+ h->name);
+ }
+}
+
+static void __exit hugetlb_exit(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ kobject_put(hstate_kobjs[h - hstates]);
+ }
+
+ kobject_put(hugepages_kobj);
+}
+module_exit(hugetlb_exit);
+
+static int __init hugetlb_init(void)
+{
+ /* Some platform decide whether they support huge pages at boot
+ * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+ * there is no such support
+ */
+ if (HPAGE_SHIFT == 0)
+ return 0;
+
+ if (!size_to_hstate(default_hstate_size)) {
+ default_hstate_size = HPAGE_SIZE;
+ if (!size_to_hstate(default_hstate_size))
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ }
+ default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+ if (default_hstate_max_huge_pages)
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+
+ hugetlb_init_hstates();
+
+ gather_bootmem_prealloc();
+
+ report_hugepages();
+
+ hugetlb_sysfs_init();
+
+ return 0;
+}
+module_init(hugetlb_init);
+
+/* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_add_hstate(unsigned order)
+{
+ struct hstate *h;
+ unsigned long i;
+
+ if (size_to_hstate(PAGE_SIZE << order)) {
+ printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
+ return;
+ }
+ BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(order == 0);
+ h = &hstates[max_hstate++];
+ h->order = order;
+ h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+ h->nr_huge_pages = 0;
+ h->free_huge_pages = 0;
+ for (i = 0; i < MAX_NUMNODES; ++i)
+ INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ h->hugetlb_next_nid = first_node(node_online_map);
+ snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
+ huge_page_size(h)/1024);
+
+ parsed_hstate = h;
+}
+
+static int __init hugetlb_nrpages_setup(char *s)
+{
+ unsigned long *mhp;
+ static unsigned long *last_mhp;
+
+ /*
+ * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * so this hugepages= parameter goes to the "default hstate".
+ */
+ if (!max_hstate)
+ mhp = &default_hstate_max_huge_pages;
+ else
+ mhp = &parsed_hstate->max_huge_pages;
+
+ if (mhp == last_mhp) {
+ printk(KERN_WARNING "hugepages= specified twice without "
+ "interleaving hugepagesz=, ignoring\n");
+ return 1;
+ }
+
+ if (sscanf(s, "%lu", mhp) <= 0)
+ *mhp = 0;
+
+ /*
+ * Global state is always initialized later in hugetlb_init.
+ * But we need to allocate >= MAX_ORDER hstates here early to still
+ * use the bootmem allocator.
+ */
+ if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+ hugetlb_hstate_alloc_pages(parsed_hstate);
+
+ last_mhp = mhp;
+
+ return 1;
+}
+__setup("hugepages=", hugetlb_nrpages_setup);
+
+static int __init hugetlb_default_setup(char *s)
+{
+ default_hstate_size = memparse(s, &s);
+ return 1;
+}
+__setup("default_hugepagesz=", hugetlb_default_setup);
+
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+ int node;
+ unsigned int nr = 0;
+
+ for_each_node_mask(node, cpuset_current_mems_allowed)
+ nr += array[node];
+
+ return nr;
+}
+
+#ifdef CONFIG_SYSCTL
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer,
size_t *length, loff_t *ppos)
{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp;
+
+ if (!write)
+ tmp = h->max_huge_pages;
+
+ table->data = &tmp;
+ table->maxlen = sizeof(unsigned long);
proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
- max_huge_pages = set_max_huge_pages(max_huge_pages);
+
+ if (write)
+ h->max_huge_pages = set_max_huge_pages(h, tmp);
+
return 0;
}
@@ -676,10 +1434,22 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer,
size_t *length, loff_t *ppos)
{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp;
+
+ if (!write)
+ tmp = h->nr_overcommit_huge_pages;
+
+ table->data = &tmp;
+ table->maxlen = sizeof(unsigned long);
proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
- spin_lock(&hugetlb_lock);
- nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
- spin_unlock(&hugetlb_lock);
+
+ if (write) {
+ spin_lock(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = tmp;
+ spin_unlock(&hugetlb_lock);
+ }
+
return 0;
}
@@ -687,34 +1457,118 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
int hugetlb_report_meminfo(char *buf)
{
+ struct hstate *h = &default_hstate;
return sprintf(buf,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
"HugePages_Rsvd: %5lu\n"
"HugePages_Surp: %5lu\n"
"Hugepagesize: %5lu kB\n",
- nr_huge_pages,
- free_huge_pages,
- resv_huge_pages,
- surplus_huge_pages,
- HPAGE_SIZE/1024);
+ h->nr_huge_pages,
+ h->free_huge_pages,
+ h->resv_huge_pages,
+ h->surplus_huge_pages,
+ 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
int hugetlb_report_node_meminfo(int nid, char *buf)
{
+ struct hstate *h = &default_hstate;
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n"
"Node %d HugePages_Surp: %5u\n",
- nid, nr_huge_pages_node[nid],
- nid, free_huge_pages_node[nid],
- nid, surplus_huge_pages_node[nid]);
+ nid, h->nr_huge_pages_node[nid],
+ nid, h->free_huge_pages_node[nid],
+ nid, h->surplus_huge_pages_node[nid]);
}
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
- return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
+ struct hstate *h = &default_hstate;
+ return h->nr_huge_pages * pages_per_huge_page(h);
+}
+
+static int hugetlb_acct_memory(struct hstate *h, long delta)
+{
+ int ret = -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+ /*
+ * When cpuset is configured, it breaks the strict hugetlb page
+ * reservation as the accounting is done on a global variable. Such
+ * reservation is completely rubbish in the presence of cpuset because
+ * the reservation is not checked against page availability for the
+ * current cpuset. Application can still potentially OOM'ed by kernel
+ * with lack of free htlb page in cpuset that the task is in.
+ * Attempt to enforce strict accounting with cpuset is almost
+ * impossible (or too ugly) because cpuset is too fluid that
+ * task or memory node can be dynamically moved between cpusets.
+ *
+ * The change of semantics for shared hugetlb mapping with cpuset is
+ * undesirable. However, in order to preserve some of the semantics,
+ * we fall back to check against current free page availability as
+ * a best attempt and hopefully to minimize the impact of changing
+ * semantics that cpuset has.
+ */
+ if (delta > 0) {
+ if (gather_surplus_pages(h, delta) < 0)
+ goto out;
+
+ if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ return_unused_surplus_pages(h, delta);
+ goto out;
+ }
+ }
+
+ ret = 0;
+ if (delta < 0)
+ return_unused_surplus_pages(h, (unsigned long) -delta);
+
+out:
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+
+static void hugetlb_vm_op_open(struct vm_area_struct *vma)
+{
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ /*
+ * This new VMA should share its siblings reservation map if present.
+ * The VMA will only ever have a valid reservation map pointer where
+ * it is being copied for another still existing VMA. As that VMA
+ * has a reference to the reservation map it cannot dissappear until
+ * after this open call completes. It is therefore safe to take a
+ * new reference here without additional locking.
+ */
+ if (reservations)
+ kref_get(&reservations->refs);
+}
+
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct resv_map *reservations = vma_resv_map(vma);
+ unsigned long reserve;
+ unsigned long start;
+ unsigned long end;
+
+ if (reservations) {
+ start = vma_hugecache_offset(h, vma, vma->vm_start);
+ end = vma_hugecache_offset(h, vma, vma->vm_end);
+
+ reserve = (end - start) -
+ region_count(&reservations->regions, start, end);
+
+ kref_put(&reservations->refs, resv_map_release);
+
+ if (reserve) {
+ hugetlb_acct_memory(h, -reserve);
+ hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+ }
+ }
}
/*
@@ -731,6 +1585,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
+ .open = hugetlb_vm_op_open,
+ .close = hugetlb_vm_op_close,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1625,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct page *ptepage;
unsigned long addr;
int cow;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
- dst_pte = huge_pte_alloc(dst, addr);
+ dst_pte = huge_pte_alloc(dst, addr, sz);
if (!dst_pte)
goto nomem;
@@ -804,7 +1662,7 @@ nomem:
}
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
+ unsigned long end, struct page *ref_page)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -812,6 +1670,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
pte_t pte;
struct page *page;
struct page *tmp;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+
/*
* A page gathering list, protected by per file i_mmap_lock. The
* lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1681,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
LIST_HEAD(page_list);
WARN_ON(!is_vm_hugetlb_page(vma));
- BUG_ON(start & ~HPAGE_MASK);
- BUG_ON(end & ~HPAGE_MASK);
+ BUG_ON(start & ~huge_page_mask(h));
+ BUG_ON(end & ~huge_page_mask(h));
+ mmu_notifier_invalidate_range_start(mm, start, end);
spin_lock(&mm->page_table_lock);
- for (address = start; address < end; address += HPAGE_SIZE) {
+ for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
@@ -832,6 +1694,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
if (huge_pmd_unshare(mm, &address, ptep))
continue;
+ /*
+ * If a reference page is supplied, it is because a specific
+ * page is being unmapped, not a range. Ensure the page we
+ * are about to unmap is the actual page of interest.
+ */
+ if (ref_page) {
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte))
+ continue;
+ page = pte_page(pte);
+ if (page != ref_page)
+ continue;
+
+ /*
+ * Mark the VMA as having unmapped its page so that
+ * future faults in this VMA will fail rather than
+ * looking like data was lost
+ */
+ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ }
+
pte = huge_ptep_get_and_clear(mm, address, ptep);
if (huge_pte_none(pte))
continue;
@@ -843,6 +1726,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
}
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
list_del(&page->lru);
put_page(page);
@@ -850,31 +1734,71 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
+ unsigned long end, struct page *ref_page)
{
+ spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ __unmap_hugepage_range(vma, start, end, ref_page);
+ spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+}
+
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mappping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+int unmap_ref_private(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct page *page,
+ unsigned long address)
+{
+ struct vm_area_struct *iter_vma;
+ struct address_space *mapping;
+ struct prio_tree_iter iter;
+ pgoff_t pgoff;
+
/*
- * It is undesirable to test vma->vm_file as it should be non-null
- * for valid hugetlb area. However, vm_file will be NULL in the error
- * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
- * do_mmap_pgoff() nullifies vma->vm_file before calling this function
- * to clean up. Since no pte has actually been setup, it is safe to
- * do nothing in this case.
+ * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+ * from page cache lookup which is in HPAGE_SIZE units.
*/
- if (vma->vm_file) {
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
- __unmap_hugepage_range(vma, start, end);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ address = address & huge_page_mask(hstate_vma(vma));
+ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
+ + (vma->vm_pgoff >> PAGE_SHIFT);
+ mapping = (struct address_space *)page_private(page);
+
+ vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ /* Do not unmap the current VMA */
+ if (iter_vma == vma)
+ continue;
+
+ /*
+ * Unmap the page from other VMAs without their own reserves.
+ * They get marked to be SIGKILLed if they fault in these
+ * areas. This is because a future no-page fault on this VMA
+ * could insert a zeroed page instead of the data existing
+ * from the time of fork. This would look like data corruption
+ */
+ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+ unmap_hugepage_range(iter_vma,
+ address, address + HPAGE_SIZE,
+ page);
}
+
+ return 1;
}
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, pte_t pte)
+ unsigned long address, pte_t *ptep, pte_t pte,
+ struct page *pagecache_page)
{
+ struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
int avoidcopy;
+ int outside_reserve = 0;
old_page = pte_page(pte);
+retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1807,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
+ /*
+ * If the process that created a MAP_PRIVATE mapping is about to
+ * perform a COW due to a shared page count, attempt to satisfy
+ * the allocation without using the existing reserves. The pagecache
+ * page is used to determine if the reserve at this address was
+ * consumed or not. If reserves were used, a partial faulted mapping
+ * at the time of fork() could consume its reserves on COW instead
+ * of the full address range.
+ */
+ if (!(vma->vm_flags & VM_SHARED) &&
+ is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+ old_page != pagecache_page)
+ outside_reserve = 1;
+
page_cache_get(old_page);
- new_page = alloc_huge_page(vma, address);
+ new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
page_cache_release(old_page);
+
+ /*
+ * If a process owning a MAP_PRIVATE mapping fails to COW,
+ * it is due to references held by a child and an insufficient
+ * huge page pool. To guarantee the original mappers
+ * reliability, unmap the page from child processes. The child
+ * may get SIGKILLed if it later faults.
+ */
+ if (outside_reserve) {
+ BUG_ON(huge_pte_none(pte));
+ if (unmap_ref_private(mm, vma, old_page, address)) {
+ BUG_ON(page_count(old_page) != 1);
+ BUG_ON(huge_pte_none(pte));
+ goto retry_avoidcopy;
+ }
+ WARN_ON_ONCE(1);
+ }
+
return -PTR_ERR(new_page);
}
@@ -896,7 +1852,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
__SetPageUptodate(new_page);
spin_lock(&mm->page_table_lock);
- ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1866,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ return find_lock_page(mapping, idx);
+}
+
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, int write_access)
{
+ struct hstate *h = hstate_vma(vma);
int ret = VM_FAULT_SIGBUS;
- unsigned long idx;
+ pgoff_t idx;
unsigned long size;
struct page *page;
struct address_space *mapping;
pte_t new_pte;
+ /*
+ * Currently, we are forced to kill the process in the event the
+ * original mapper has unmapped pages from the child due to a failed
+ * COW. Warn that such a situation has occured as it may not be obvious
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+ printk(KERN_WARNING
+ "PID %d killed due to inadequate hugepage pool\n",
+ current->pid);
+ return ret;
+ }
+
mapping = vma->vm_file->f_mapping;
- idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ idx = vma_hugecache_offset(h, vma, address);
/*
* Use page lock to guard against racing truncation
@@ -931,15 +1912,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
- page = alloc_huge_page(vma, address);
+ page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
ret = -PTR_ERR(page);
goto out;
}
- clear_huge_page(page, address);
+ clear_huge_page(page, address, huge_page_size(h));
__SetPageUptodate(page);
if (vma->vm_flags & VM_SHARED) {
@@ -955,14 +1936,26 @@ retry:
}
spin_lock(&inode->i_lock);
- inode->i_blocks += BLOCKS_PER_HUGEPAGE;
+ inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
} else
lock_page(page);
}
+ /*
+ * If we are going to COW a private mapping later, we examine the
+ * pending reservations for this page now. This will ensure that
+ * any allocations necessary to record that reservation occur outside
+ * the spinlock.
+ */
+ if (write_access && !(vma->vm_flags & VM_SHARED))
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+
spin_lock(&mm->page_table_lock);
- size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
@@ -976,7 +1969,7 @@ retry:
if (write_access && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
}
spin_unlock(&mm->page_table_lock);
@@ -986,6 +1979,7 @@ out:
backout:
spin_unlock(&mm->page_table_lock);
+backout_unlocked:
unlock_page(page);
put_page(page);
goto out;
@@ -997,9 +1991,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *ptep;
pte_t entry;
int ret;
+ struct page *pagecache_page = NULL;
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+ struct hstate *h = hstate_vma(vma);
- ptep = huge_pte_alloc(mm, address);
+ ptep = huge_pte_alloc(mm, address, huge_page_size(h));
if (!ptep)
return VM_FAULT_OOM;
@@ -1012,23 +2008,58 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
- mutex_unlock(&hugetlb_instantiation_mutex);
- return ret;
+ goto out_unlock;
}
ret = 0;
+ /*
+ * If we are going to COW the mapping later, we examine the pending
+ * reservations for this page now. This will ensure that any
+ * allocations necessary to record that reservation occur outside the
+ * spinlock. For private mappings, we also lookup the pagecache
+ * page now as it is used to determine if a reservation has been
+ * consumed.
+ */
+ if (write_access && !pte_write(entry)) {
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto out_unlock;
+ }
+
+ if (!(vma->vm_flags & VM_SHARED))
+ pagecache_page = hugetlbfs_pagecache_page(h,
+ vma, address);
+ }
+
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (likely(pte_same(entry, huge_ptep_get(ptep))))
if (write_access && !pte_write(entry))
- ret = hugetlb_cow(mm, vma, address, ptep, entry);
+ ret = hugetlb_cow(mm, vma, address, ptep, entry,
+ pagecache_page);
spin_unlock(&mm->page_table_lock);
+
+ if (pagecache_page) {
+ unlock_page(pagecache_page);
+ put_page(pagecache_page);
+ }
+
+out_unlock:
mutex_unlock(&hugetlb_instantiation_mutex);
return ret;
}
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i,
@@ -1037,6 +2068,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long pfn_offset;
unsigned long vaddr = *position;
int remainder = *length;
+ struct hstate *h = hstate_vma(vma);
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
@@ -1048,7 +2080,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* each hugepage. We have to make * sure we get the
* first, for the page indexing below to work.
*/
- pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
(write && !pte_write(huge_ptep_get(pte)))) {
@@ -1066,7 +2098,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
- pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+ pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
same_page:
if (pages) {
@@ -1082,7 +2114,7 @@ same_page:
--remainder;
++i;
if (vaddr < vma->vm_end && remainder &&
- pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+ pfn_offset < pages_per_huge_page(h)) {
/*
* We use pfn_offset to avoid touching the pageframes
* of this compound page.
@@ -1104,13 +2136,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long start = address;
pte_t *ptep;
pte_t pte;
+ struct hstate *h = hstate_vma(vma);
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
spin_lock(&mm->page_table_lock);
- for (; address < end; address += HPAGE_SIZE) {
+ for (; address < end; address += huge_page_size(h)) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
@@ -1128,195 +2161,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
flush_tlb_range(vma, start, end);
}
-struct file_region {
- struct list_head link;
- long from;
- long to;
-};
-
-static long region_add(struct list_head *head, long f, long t)
-{
- struct file_region *rg, *nrg, *trg;
-
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
-
- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- break;
-
- /* If this area reaches higher then extend our area to
- * include it completely. If this is not the first area
- * which we intend to reuse, free it. */
- if (rg->to > t)
- t = rg->to;
- if (rg != nrg) {
- list_del(&rg->link);
- kfree(rg);
- }
- }
- nrg->from = f;
- nrg->to = t;
- return 0;
-}
-
-static long region_chg(struct list_head *head, long f, long t)
+int hugetlb_reserve_pages(struct inode *inode,
+ long from, long to,
+ struct vm_area_struct *vma)
{
- struct file_region *rg, *nrg;
- long chg = 0;
-
- /* Locate the region we are before or in. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
-
- /* If we are below the current region then a new region is required.
- * Subtle, allocate a new region at the position but make it zero
- * size such that we can guarantee to record the reservation. */
- if (&rg->link == head || t < rg->from) {
- nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
- if (!nrg)
- return -ENOMEM;
- nrg->from = f;
- nrg->to = f;
- INIT_LIST_HEAD(&nrg->link);
- list_add(&nrg->link, rg->link.prev);
-
- return t - f;
- }
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
- chg = t - f;
-
- /* Check for and consume any regions we now overlap with. */
- list_for_each_entry(rg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- return chg;
-
- /* We overlap with this area, if it extends futher than
- * us then we must extend ourselves. Account for its
- * existing reservation. */
- if (rg->to > t) {
- chg += rg->to - t;
- t = rg->to;
- }
- chg -= rg->to - rg->from;
- }
- return chg;
-}
-
-static long region_truncate(struct list_head *head, long end)
-{
- struct file_region *rg, *trg;
- long chg = 0;
+ long ret, chg;
+ struct hstate *h = hstate_inode(inode);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (end <= rg->to)
- break;
- if (&rg->link == head)
+ if (vma && vma->vm_flags & VM_NORESERVE)
return 0;
- /* If we are in the middle of a region then adjust it. */
- if (end > rg->from) {
- chg = rg->to - end;
- rg->to = end;
- rg = list_entry(rg->link.next, typeof(*rg), link);
- }
-
- /* Drop any remaining regions. */
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- chg += rg->to - rg->from;
- list_del(&rg->link);
- kfree(rg);
- }
- return chg;
-}
-
-static int hugetlb_acct_memory(long delta)
-{
- int ret = -ENOMEM;
-
- spin_lock(&hugetlb_lock);
/*
- * When cpuset is configured, it breaks the strict hugetlb page
- * reservation as the accounting is done on a global variable. Such
- * reservation is completely rubbish in the presence of cpuset because
- * the reservation is not checked against page availability for the
- * current cpuset. Application can still potentially OOM'ed by kernel
- * with lack of free htlb page in cpuset that the task is in.
- * Attempt to enforce strict accounting with cpuset is almost
- * impossible (or too ugly) because cpuset is too fluid that
- * task or memory node can be dynamically moved between cpusets.
- *
- * The change of semantics for shared hugetlb mapping with cpuset is
- * undesirable. However, in order to preserve some of the semantics,
- * we fall back to check against current free page availability as
- * a best attempt and hopefully to minimize the impact of changing
- * semantics that cpuset has.
+ * Shared mappings base their reservation on the number of pages that
+ * are already allocated on behalf of the file. Private mappings need
+ * to reserve the full area even if read-only as mprotect() may be
+ * called to make the mapping read-write. Assume !vma is a shm mapping
*/
- if (delta > 0) {
- if (gather_surplus_pages(delta) < 0)
- goto out;
-
- if (delta > cpuset_mems_nr(free_huge_pages_node)) {
- return_unused_surplus_pages(delta);
- goto out;
- }
- }
-
- ret = 0;
- if (delta < 0)
- return_unused_surplus_pages((unsigned long) -delta);
+ if (!vma || vma->vm_flags & VM_SHARED)
+ chg = region_chg(&inode->i_mapping->private_list, from, to);
+ else {
+ struct resv_map *resv_map = resv_map_alloc();
+ if (!resv_map)
+ return -ENOMEM;
-out:
- spin_unlock(&hugetlb_lock);
- return ret;
-}
+ chg = to - from;
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
-{
- long ret, chg;
+ set_vma_resv_map(vma, resv_map);
+ set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+ }
- chg = region_chg(&inode->i_mapping->private_list, from, to);
if (chg < 0)
return chg;
if (hugetlb_get_quota(inode->i_mapping, chg))
return -ENOSPC;
- ret = hugetlb_acct_memory(chg);
+ ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
hugetlb_put_quota(inode->i_mapping, chg);
return ret;
}
- region_add(&inode->i_mapping->private_list, from, to);
+ if (!vma || vma->vm_flags & VM_SHARED)
+ region_add(&inode->i_mapping->private_list, from, to);
return 0;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
+ struct hstate *h = hstate_inode(inode);
long chg = region_truncate(&inode->i_mapping->private_list, offset);
spin_lock(&inode->i_lock);
- inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
+ inode->i_blocks -= blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
hugetlb_put_quota(inode->i_mapping, (chg - freed));
- hugetlb_acct_memory(-(chg - freed));
+ hugetlb_acct_memory(h, -(chg - freed));
}
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4b..1f43f741697 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
#include <linux/mm.h>
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ unsigned long floor, unsigned long ceiling);
+
+extern void prep_compound_page(struct page *page, unsigned long order);
+
static inline void set_page_count(struct page *page, int v)
{
atomic_set(&page->_count, v);
@@ -59,4 +64,60 @@ static inline unsigned long page_order(struct page *page)
#define __paginginit __init
#endif
+/* Memory initialisation debug and verification */
+enum mminit_level {
+ MMINIT_WARNING,
+ MMINIT_VERIFY,
+ MMINIT_TRACE
+};
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+
+extern int mminit_loglevel;
+
+#define mminit_dprintk(level, prefix, fmt, arg...) \
+do { \
+ if (level < mminit_loglevel) { \
+ printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
+ printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
+ } \
+} while (0)
+
+extern void mminit_verify_pageflags_layout(void);
+extern void mminit_verify_page_links(struct page *page,
+ enum zone_type zone, unsigned long nid, unsigned long pfn);
+extern void mminit_verify_zonelist(void);
+
+#else
+
+static inline void mminit_dprintk(enum mminit_level level,
+ const char *prefix, const char *fmt, ...)
+{
+}
+
+static inline void mminit_verify_pageflags_layout(void)
+{
+}
+
+static inline void mminit_verify_page_links(struct page *page,
+ enum zone_type zone, unsigned long nid, unsigned long pfn)
+{
+}
+
+static inline void mminit_verify_zonelist(void)
+{
+}
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
+#if defined(CONFIG_SPARSEMEM)
+extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn);
+#else
+static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+}
+#endif /* CONFIG_SPARSEMEM */
+
#endif
diff --git a/mm/madvise.c b/mm/madvise.c
index 23a0ec3e0ea..f9349c18a1b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for refill_inactive to actually free
+ * zap_page_range call sets things up for shrink_active_list to actually free
* these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
- * refill_inactive to pick up before reclaiming other pages.
+ * shrink_active_list to pick up before reclaiming other pages.
*
* NB: This interface discards data rather than pushes it out to swap,
* as some implementations do. This has performance implications for
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b..36896f3eb7f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,9 +35,9 @@
#include <asm/uaccess.h>
-struct cgroup_subsys mem_cgroup_subsys;
-static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
-static struct kmem_cache *page_cgroup_cache;
+struct cgroup_subsys mem_cgroup_subsys __read_mostly;
+static struct kmem_cache *page_cgroup_cache __read_mostly;
+#define MEM_CGROUP_RECLAIM_RETRIES 5
/*
* Statistics for memory cgroup.
@@ -166,7 +166,6 @@ struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct page *page;
struct mem_cgroup *mem_cgroup;
- int ref_cnt; /* cached, mapped, migrating */
int flags;
};
#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
@@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
+ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
};
/*
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
+ /*
+ * mm_update_next_owner() may clear mm->owner to NULL
+ * if it races with swapoff, page migration, etc.
+ * So this can be called with p == NULL.
+ */
+ if (unlikely(!p))
+ return NULL;
+
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
}
@@ -296,7 +304,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
- list_del_init(&pc->lru);
+ list_del(&pc->lru);
}
static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
@@ -354,6 +362,9 @@ void mem_cgroup_move_lists(struct page *page, bool active)
struct mem_cgroup_per_zone *mz;
unsigned long flags;
+ if (mem_cgroup_subsys.disabled)
+ return;
+
/*
* We cannot lock_page_cgroup while holding zone's lru_lock,
* because other holders of lock_page_cgroup can be interrupted
@@ -524,7 +535,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype)
+ gfp_t gfp_mask, enum charge_type ctype,
+ struct mem_cgroup *memcg)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
@@ -532,35 +544,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup_per_zone *mz;
- if (mem_cgroup_subsys.disabled)
- return 0;
-
- /*
- * Should page_cgroup's go to their own slab?
- * One could optimize the performance of the charging routine
- * by saving a bit in the page_flags and using it as a lock
- * to see if the cgroup page already has a page_cgroup associated
- * with it
- */
-retry:
- lock_page_cgroup(page);
- pc = page_get_page_cgroup(page);
- /*
- * The page_cgroup exists and
- * the page has already been accounted.
- */
- if (pc) {
- VM_BUG_ON(pc->page != page);
- VM_BUG_ON(pc->ref_cnt <= 0);
-
- pc->ref_cnt++;
- unlock_page_cgroup(page);
- goto done;
- }
- unlock_page_cgroup(page);
-
- pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
- if (pc == NULL)
+ pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
+ if (unlikely(pc == NULL))
goto err;
/*
@@ -569,16 +554,23 @@ retry:
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (!mm)
- mm = &init_mm;
-
- rcu_read_lock();
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- /*
- * For every charge from the cgroup, increment reference count
- */
- css_get(&mem->css);
- rcu_read_unlock();
+ if (likely(!memcg)) {
+ rcu_read_lock();
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem)) {
+ rcu_read_unlock();
+ kmem_cache_free(page_cgroup_cache, pc);
+ return 0;
+ }
+ /*
+ * For every charge from the cgroup, increment reference count
+ */
+ css_get(&mem->css);
+ rcu_read_unlock();
+ } else {
+ mem = memcg;
+ css_get(&memcg->css);
+ }
while (res_counter_charge(&mem->res, PAGE_SIZE)) {
if (!(gfp_mask & __GFP_WAIT))
@@ -603,25 +595,24 @@ retry:
}
}
- pc->ref_cnt = 1;
pc->mem_cgroup = mem;
pc->page = page;
- pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
+ /*
+ * If a page is accounted as a page cache, insert to inactive list.
+ * If anon, insert to active list.
+ */
if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
pc->flags = PAGE_CGROUP_FLAG_CACHE;
+ else
+ pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
lock_page_cgroup(page);
- if (page_get_page_cgroup(page)) {
+ if (unlikely(page_get_page_cgroup(page))) {
unlock_page_cgroup(page);
- /*
- * Another charge has been added to this page already.
- * We take lock_page_cgroup(page) again and read
- * page->cgroup, increment refcnt.... just retry is OK.
- */
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
kmem_cache_free(page_cgroup_cache, pc);
- goto retry;
+ goto done;
}
page_assign_page_cgroup(page, pc);
@@ -642,24 +633,65 @@ err:
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
+ if (mem_cgroup_subsys.disabled)
+ return 0;
+
+ /*
+ * If already mapped, we don't have to account.
+ * If page cache, page->mapping has address_space.
+ * But page->mapping may have out-of-use anon_vma pointer,
+ * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
+ * is NULL.
+ */
+ if (page_mapped(page) || (page->mapping && !PageAnon(page)))
+ return 0;
+ if (unlikely(!mm))
+ mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED);
+ MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- if (!mm)
+ if (mem_cgroup_subsys.disabled)
+ return 0;
+
+ /*
+ * Corner case handling. This is called from add_to_page_cache()
+ * in usual. But some FS (shmem) precharges this page before calling it
+ * and call add_to_page_cache() with GFP_NOWAIT.
+ *
+ * For GFP_NOWAIT case, the page may be pre-charged before calling
+ * add_to_page_cache(). (See shmem.c) check it here and avoid to call
+ * charge twice. (It works but has to pay a bit larger cost.)
+ */
+ if (!(gfp_mask & __GFP_WAIT)) {
+ struct page_cgroup *pc;
+
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ if (pc) {
+ VM_BUG_ON(pc->page != page);
+ VM_BUG_ON(!pc->mem_cgroup);
+ unlock_page_cgroup(page);
+ return 0;
+ }
+ unlock_page_cgroup(page);
+ }
+
+ if (unlikely(!mm))
mm = &init_mm;
+
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_CACHE);
+ MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
}
/*
- * Uncharging is always a welcome operation, we never complain, simply
- * uncharge.
+ * uncharge if !page_mapped(page)
*/
-void mem_cgroup_uncharge_page(struct page *page)
+static void
+__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
struct page_cgroup *pc;
struct mem_cgroup *mem;
@@ -674,98 +706,158 @@ void mem_cgroup_uncharge_page(struct page *page)
*/
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
- if (!pc)
+ if (unlikely(!pc))
goto unlock;
VM_BUG_ON(pc->page != page);
- VM_BUG_ON(pc->ref_cnt <= 0);
- if (--(pc->ref_cnt) == 0) {
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(mz, pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
+ if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+ && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
+ || page_mapped(page)))
+ goto unlock;
- page_assign_page_cgroup(page, NULL);
- unlock_page_cgroup(page);
+ mz = page_cgroup_zoneinfo(pc);
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ __mem_cgroup_remove_list(mz, pc);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
- mem = pc->mem_cgroup;
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- css_put(&mem->css);
+ page_assign_page_cgroup(page, NULL);
+ unlock_page_cgroup(page);
- kmem_cache_free(page_cgroup_cache, pc);
- return;
- }
+ mem = pc->mem_cgroup;
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+ kmem_cache_free(page_cgroup_cache, pc);
+ return;
unlock:
unlock_page_cgroup(page);
}
+void mem_cgroup_uncharge_page(struct page *page)
+{
+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+}
+
+void mem_cgroup_uncharge_cache_page(struct page *page)
+{
+ VM_BUG_ON(page_mapped(page));
+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
+}
+
/*
- * Returns non-zero if a page (under migration) has valid page_cgroup member.
- * Refcnt of page_cgroup is incremented.
+ * Before starting migration, account against new page.
*/
-int mem_cgroup_prepare_migration(struct page *page)
+int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
{
struct page_cgroup *pc;
+ struct mem_cgroup *mem = NULL;
+ enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
+ int ret = 0;
if (mem_cgroup_subsys.disabled)
return 0;
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
- if (pc)
- pc->ref_cnt++;
+ if (pc) {
+ mem = pc->mem_cgroup;
+ css_get(&mem->css);
+ if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
+ ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ }
unlock_page_cgroup(page);
- return pc != NULL;
+ if (mem) {
+ ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
+ ctype, mem);
+ css_put(&mem->css);
+ }
+ return ret;
}
-void mem_cgroup_end_migration(struct page *page)
+/* remove redundant charge if migration failed*/
+void mem_cgroup_end_migration(struct page *newpage)
{
- mem_cgroup_uncharge_page(page);
+ /*
+ * At success, page->mapping is not NULL.
+ * special rollback care is necessary when
+ * 1. at migration failure. (newpage->mapping is cleared in this case)
+ * 2. the newpage was moved but not remapped again because the task
+ * exits and the newpage is obsolete. In this case, the new page
+ * may be a swapcache. So, we just call mem_cgroup_uncharge_page()
+ * always for avoiding mess. The page_cgroup will be removed if
+ * unnecessary. File cache pages is still on radix-tree. Don't
+ * care it.
+ */
+ if (!newpage->mapping)
+ __mem_cgroup_uncharge_common(newpage,
+ MEM_CGROUP_CHARGE_TYPE_FORCE);
+ else if (PageAnon(newpage))
+ mem_cgroup_uncharge_page(newpage);
}
/*
- * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
- * And no race with uncharge() routines because page_cgroup for *page*
- * has extra one reference by mem_cgroup_prepare_migration.
+ * A call to try to shrink memory usage under specified resource controller.
+ * This is typically used for page reclaiming for shmem for reducing side
+ * effect of page allocation from shmem, which is used by some mem_cgroup.
*/
-void mem_cgroup_page_migration(struct page *page, struct page *newpage)
+int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
- struct page_cgroup *pc;
- struct mem_cgroup_per_zone *mz;
- unsigned long flags;
+ struct mem_cgroup *mem;
+ int progress = 0;
+ int retry = MEM_CGROUP_RECLAIM_RETRIES;
- lock_page_cgroup(page);
- pc = page_get_page_cgroup(page);
- if (!pc) {
- unlock_page_cgroup(page);
- return;
+ if (mem_cgroup_subsys.disabled)
+ return 0;
+ if (!mm)
+ return 0;
+
+ rcu_read_lock();
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem)) {
+ rcu_read_unlock();
+ return 0;
}
+ css_get(&mem->css);
+ rcu_read_unlock();
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(mz, pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
+ do {
+ progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+ progress += res_counter_check_under_limit(&mem->res);
+ } while (!progress && --retry);
- page_assign_page_cgroup(page, NULL);
- unlock_page_cgroup(page);
+ css_put(&mem->css);
+ if (!retry)
+ return -ENOMEM;
+ return 0;
+}
- pc->page = newpage;
- lock_page_cgroup(newpage);
- page_assign_page_cgroup(newpage, pc);
+int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
+{
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_add_list(mz, pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
+ int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
+ int progress;
+ int ret = 0;
- unlock_page_cgroup(newpage);
+ while (res_counter_set_limit(&memcg->res, val)) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ if (!retry_count) {
+ ret = -EBUSY;
+ break;
+ }
+ progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
+ if (!progress)
+ retry_count--;
+ }
+ return ret;
}
+
/*
* This routine traverse page_cgroup in given list and drop them all.
- * This routine ignores page_cgroup->ref_cnt.
* *And* this routine doesn't reclaim page itself, just removes page_cgroup.
*/
#define FORCE_UNCHARGE_BATCH (128)
@@ -790,12 +882,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
page = pc->page;
get_page(page);
spin_unlock_irqrestore(&mz->lru_lock, flags);
- mem_cgroup_uncharge_page(page);
- put_page(page);
- if (--count <= 0) {
- count = FORCE_UNCHARGE_BATCH;
+ /*
+ * Check if this page is on LRU. !LRU page can be found
+ * if it's under page migration.
+ */
+ if (PageLRU(page)) {
+ __mem_cgroup_uncharge_common(page,
+ MEM_CGROUP_CHARGE_TYPE_FORCE);
+ put_page(page);
+ if (--count <= 0) {
+ count = FORCE_UNCHARGE_BATCH;
+ cond_resched();
+ }
+ } else
cond_resched();
- }
spin_lock_irqsave(&mz->lru_lock, flags);
}
spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -810,9 +910,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
int ret = -EBUSY;
int node, zid;
- if (mem_cgroup_subsys.disabled)
- return 0;
-
css_get(&mem->css);
/*
* page reclaim code (kswapd etc..) will move pages between
@@ -838,32 +935,34 @@ out:
return ret;
}
-static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
-{
- *tmp = memparse(buf, &buf);
- if (*buf != '\0')
- return -EINVAL;
-
- /*
- * Round up the value to the closest page size
- */
- *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
- return 0;
-}
-
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
cft->private);
}
-
-static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
- struct file *file, const char __user *userbuf,
- size_t nbytes, loff_t *ppos)
+/*
+ * The user of this function is...
+ * RES_LIMIT.
+ */
+static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
+ const char *buffer)
{
- return res_counter_write(&mem_cgroup_from_cont(cont)->res,
- cft->private, userbuf, nbytes, ppos,
- mem_cgroup_write_strategy);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ unsigned long long val;
+ int ret;
+
+ switch (cft->private) {
+ case RES_LIMIT:
+ /* This function does all necessary parse...reuse it */
+ ret = res_counter_memparse_write_strategy(buffer, &val);
+ if (!ret)
+ ret = mem_cgroup_resize_limit(memcg, val);
+ break;
+ default:
+ ret = -EINVAL; /* should be BUG() ? */
+ break;
+ }
+ return ret;
}
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -940,7 +1039,7 @@ static struct cftype mem_cgroup_files[] = {
{
.name = "limit_in_bytes",
.private = RES_LIMIT,
- .write = mem_cgroup_write,
+ .write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
},
{
@@ -1070,8 +1169,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
static int mem_cgroup_populate(struct cgroup_subsys *ss,
struct cgroup *cont)
{
- if (mem_cgroup_subsys.disabled)
- return 0;
return cgroup_add_files(cont, ss, mem_cgroup_files,
ARRAY_SIZE(mem_cgroup_files));
}
@@ -1084,9 +1181,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct mm_struct *mm;
struct mem_cgroup *mem, *old_mem;
- if (mem_cgroup_subsys.disabled)
- return;
-
mm = get_task_mm(p);
if (mm == NULL)
return;
@@ -1094,9 +1188,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
mem = mem_cgroup_from_cont(cont);
old_mem = mem_cgroup_from_cont(old_cont);
- if (mem == old_mem)
- goto out;
-
/*
* Only thread group leaders are allowed to migrate, the mm_struct is
* in effect owned by the leader
diff --git a/mm/memory.c b/mm/memory.c
index 19e0ae9beec..1002f473f49 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -61,6 +62,8 @@
#include <linux/swapops.h>
#include <linux/elf.h>
+#include "internal.h"
+
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
@@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
*
* Must be called with pagetable lock held.
*/
-void free_pgd_range(struct mmu_gather **tlb,
+void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
@@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb,
return;
start = addr;
- pgd = pgd_offset((*tlb)->mm, addr);
+ pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+ free_pud_range(tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long floor, unsigned long ceiling)
{
while (vma) {
@@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
*
* The calling function must still handle the error.
*/
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
+ unsigned long vaddr)
{
printk(KERN_ERR "Bad pte = %08llx, process = %s, "
"vm_flags = %lx, vaddr = %lx\n",
@@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long next;
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
+ int ret;
/*
* Don't copy ptes where a page fault will fill them correctly.
@@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ /*
+ * We need to invalidate the secondary MMU mappings only when
+ * there could be a permission downgrade on the ptes of the
+ * parent mm. And a permission downgrade will only happen if
+ * is_cow_mapping() returns true.
+ */
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier_invalidate_range_start(src_mm, addr, end);
+
+ ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
- if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
- vma, addr, next))
- return -ENOMEM;
+ if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+ vma, addr, next))) {
+ ret = -ENOMEM;
+ break;
+ }
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
- return 0;
+
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier_invalidate_range_end(src_mm,
+ vma->vm_start, end);
+ return ret;
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
unsigned long start = start_addr;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
int fullmm = (*tlbp)->fullmm;
+ struct mm_struct *mm = vma->vm_mm;
+ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long end;
@@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
}
if (unlikely(is_vm_hugetlb_page(vma))) {
- unmap_hugepage_range(vma, start, end);
- zap_work -= (end - start) /
- (HPAGE_SIZE / PAGE_SIZE);
+ /*
+ * It is undesirable to test vma->vm_file as it
+ * should be non-null for valid hugetlb area.
+ * However, vm_file will be NULL in the error
+ * cleanup path of do_mmap_pgoff. When
+ * hugetlbfs ->mmap method fails,
+ * do_mmap_pgoff() nullifies vma->vm_file
+ * before calling this function to clean up.
+ * Since no pte has actually been setup, it is
+ * safe to do nothing in this case.
+ */
+ if (vma->vm_file) {
+ unmap_hugepage_range(vma, start, end, NULL);
+ zap_work -= (end - start) /
+ pages_per_huge_page(hstate_vma(vma));
+ }
+
start = end;
} else
start = unmap_page_range(*tlbp, vma,
@@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
}
}
out:
+ mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
return start; /* which is now the end (or restart) address */
}
@@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
return end;
}
+/**
+ * zap_vma_ptes - remove ptes mapping the vma
+ * @vma: vm_area_struct holding ptes to be zapped
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ *
+ * This function only unmaps ptes assigned to VM_PFNMAP vmas.
+ *
+ * The entire address range must be fully contained within the vma.
+ *
+ * Returns 0 if successful.
+ */
+int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size)
+{
+ if (address < vma->vm_start || address + size > vma->vm_end ||
+ !(vma->vm_flags & VM_PFNMAP))
+ return -1;
+ zap_page_range(vma, address, size, NULL);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(zap_vma_ptes);
+
/*
* Do a quick page-table lookup for a single page.
*/
@@ -982,34 +1043,37 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
goto no_page_table;
pud = pud_offset(pgd, address);
- if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ if (pud_none(*pud))
+ goto no_page_table;
+ if (pud_huge(*pud)) {
+ BUG_ON(flags & FOLL_GET);
+ page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+ goto out;
+ }
+ if (unlikely(pud_bad(*pud)))
goto no_page_table;
-
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
goto no_page_table;
-
if (pmd_huge(*pmd)) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
-
if (unlikely(pmd_bad(*pmd)))
goto no_page_table;
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!ptep)
- goto out;
pte = *ptep;
if (!pte_present(pte))
- goto unlock;
+ goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
page = vm_normal_page(vma, address, pte);
if (unlikely(!page))
- goto unlock;
+ goto bad_page;
if (flags & FOLL_GET)
get_page(page);
@@ -1024,6 +1088,15 @@ unlock:
out:
return page;
+bad_page:
+ pte_unmap_unlock(ptep, ptl);
+ return ERR_PTR(-EFAULT);
+
+no_page:
+ pte_unmap_unlock(ptep, ptl);
+ if (!pte_none(pte))
+ return page;
+ /* Fall through to ZERO_PAGE handling */
no_page_table:
/*
* When core dumping an enormous anonymous area that nobody
@@ -1038,6 +1111,24 @@ no_page_table:
return page;
}
+/* Can we do the FOLL_ANON optimization? */
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+ /*
+ * We don't want to optimize FOLL_ANON for make_pages_present()
+ * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
+ * we want to get the page from the page tables to make sure
+ * that we serialize and update with any other user of that
+ * mapping.
+ */
+ if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
+ return 0;
+ /*
+ * And if we have a fault routine, it's not an anonymous region.
+ */
+ return !vma->vm_ops || !vma->vm_ops->fault;
+}
+
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -1112,8 +1203,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
foll_flags = FOLL_TOUCH;
if (pages)
foll_flags |= FOLL_GET;
- if (!write && !(vma->vm_flags & VM_LOCKED) &&
- (!vma->vm_ops || !vma->vm_ops->fault))
+ if (!write && use_zero_page(vma))
foll_flags |= FOLL_ANON;
do {
@@ -1125,7 +1215,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
* be processed until returning to user space.
*/
if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
- return -ENOMEM;
+ return i ? i : -ENOMEM;
if (write)
foll_flags |= FOLL_WRITE;
@@ -1159,6 +1249,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
}
+ if (IS_ERR(page))
+ return i ? i : PTR_ERR(page);
if (pages) {
pages[i] = page;
@@ -1310,6 +1402,11 @@ out:
*
* This function should only be called from a vm_ops->fault handler, and
* in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
*/
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
@@ -1520,6 +1617,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long next;
int err;
+ BUG_ON(pud_huge(*pud));
+
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
@@ -1561,10 +1660,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + size;
+ unsigned long start = addr, end = addr + size;
int err;
BUG_ON(addr >= end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1572,6 +1672,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1669,15 +1770,26 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *dirty_page = NULL;
old_page = vm_normal_page(vma, address, orig_pte);
- if (!old_page)
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable as we can't do any dirty
+ * accounting on raw pfn maps.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ goto reuse;
goto gotten;
+ }
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
if (PageAnon(old_page)) {
- if (!TestSetPageLocked(old_page)) {
+ if (trylock_page(old_page)) {
reuse = can_share_swap_page(old_page);
unlock_page(old_page);
}
@@ -1723,6 +1835,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
if (reuse) {
+reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1757,7 +1870,6 @@ gotten:
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
- page_remove_rmap(old_page, vma);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
@@ -1773,12 +1885,38 @@ gotten:
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush(vma, address, page_table);
+ ptep_clear_flush_notify(vma, address, page_table);
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
page_add_new_anon_rmap(new_page, vma, address);
+ if (old_page) {
+ /*
+ * Only after switching the pte to the new page may
+ * we remove the mapcount here. Otherwise another
+ * process may come and find the rmap count decremented
+ * before the pte is switched to the new page, and
+ * "reuse" the old page writing into it while our pte
+ * here still points into it and can be read by other
+ * threads.
+ *
+ * The critical issue is to order this
+ * page_remove_rmap with the ptp_clear_flush above.
+ * Those stores are ordered by (if nothing else,)
+ * the barrier present in the atomic_add_negative
+ * in page_remove_rmap.
+ *
+ * Then the TLB flush in ptep_clear_flush ensures that
+ * no process can access the old page before the
+ * decremented mapcount is visible. And the old page
+ * cannot be reused until after the decremented
+ * mapcount is visible. So transitively, TLBs to
+ * old page will be flushed before it can be reused.
+ */
+ page_remove_rmap(old_page, vma);
+ }
+
/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
@@ -2436,59 +2574,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
-
-/*
- * do_no_pfn() tries to create a new page mapping for a page without
- * a struct_page backing it
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- *
- * It is expected that the ->nopfn handler always returns the same pfn
- * for a given virtual mapping.
- *
- * Mark this `noinline' to prevent it from bloating the main pagefault code.
- */
-static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access)
-{
- spinlock_t *ptl;
- pte_t entry;
- unsigned long pfn;
-
- pte_unmap(page_table);
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
- BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-
- pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
-
- BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
-
- if (unlikely(pfn == NOPFN_OOM))
- return VM_FAULT_OOM;
- else if (unlikely(pfn == NOPFN_SIGBUS))
- return VM_FAULT_SIGBUS;
- else if (unlikely(pfn == NOPFN_REFAULT))
- return 0;
-
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
- /* Only go through if we didn't race with anybody else... */
- if (pte_none(*page_table)) {
- entry = pfn_pte(pfn, vma->vm_page_prot);
- if (write_access)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- set_pte_at(mm, address, page_table, entry);
- }
- pte_unmap_unlock(page_table, ptl);
- return 0;
-}
-
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
@@ -2549,9 +2634,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
pte, pmd, write_access, entry);
- if (unlikely(vma->vm_ops->nopfn))
- return do_no_pfn(mm, vma, address, pte,
- pmd, write_access);
}
return do_anonymous_page(mm, vma, address,
pte, pmd, write_access);
@@ -2683,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end)
vma = find_vma(current->mm, addr);
if (!vma)
- return -1;
+ return -ENOMEM;
write = (vma->vm_flags & VM_WRITE) != 0;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
- if (ret < 0)
+ if (ret < 0) {
+ /*
+ SUS require strange return value to mlock
+ - invalid addr generate to ENOMEM.
+ - out of memory should generate EAGAIN.
+ */
+ if (ret == -EFAULT)
+ ret = -ENOMEM;
+ else if (ret == -ENOMEM)
+ ret = -EAGAIN;
return ret;
- return ret == len ? 0 : -1;
+ }
+ return ret == len ? 0 : -ENOMEM;
}
#if !defined(__HAVE_ARCH_GATE_AREA)
@@ -2739,6 +2831,86 @@ int in_gate_area_no_task(unsigned long addr)
#endif /* __HAVE_ARCH_GATE_AREA */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+static resource_size_t follow_phys(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned long *prot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ resource_size_t phys_addr = 0;
+ struct mm_struct *mm = vma->vm_mm;
+
+ VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto no_page_table;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ goto no_page_table;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto no_page_table;
+
+ /* We cannot handle huge page PFN maps. Luckily they don't exist. */
+ if (pmd_huge(*pmd))
+ goto no_page_table;
+
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!ptep)
+ goto out;
+
+ pte = *ptep;
+ if (!pte_present(pte))
+ goto unlock;
+ if ((flags & FOLL_WRITE) && !pte_write(pte))
+ goto unlock;
+ phys_addr = pte_pfn(pte);
+ phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
+
+ *prot = pgprot_val(pte_pgprot(pte));
+
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+out:
+ return phys_addr;
+no_page_table:
+ return 0;
+}
+
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ resource_size_t phys_addr;
+ unsigned long prot = 0;
+ void *maddr;
+ int offset = addr & (PAGE_SIZE-1);
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return -EINVAL;
+
+ phys_addr = follow_phys(vma, addr, write, &prot);
+
+ if (!phys_addr)
+ return -EINVAL;
+
+ maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+ if (write)
+ memcpy_toio(maddr + offset, buf, len);
+ else
+ memcpy_fromio(buf, maddr + offset, len);
+ iounmap(maddr);
+
+ return len;
+}
+#endif
+
/*
* Access another process' address space.
* Source/target buffer must be kernel space,
@@ -2748,7 +2920,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
{
struct mm_struct *mm;
struct vm_area_struct *vma;
- struct page *page;
void *old_buf = buf;
mm = get_task_mm(tsk);
@@ -2760,28 +2931,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
while (len) {
int bytes, ret, offset;
void *maddr;
+ struct page *page = NULL;
ret = get_user_pages(tsk, mm, addr, 1,
write, 1, &page, &vma);
- if (ret <= 0)
- break;
-
- bytes = len;
- offset = addr & (PAGE_SIZE-1);
- if (bytes > PAGE_SIZE-offset)
- bytes = PAGE_SIZE-offset;
-
- maddr = kmap(page);
- if (write) {
- copy_to_user_page(vma, page, addr,
- maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
+ if (ret <= 0) {
+ /*
+ * Check if this is a VM_IO | VM_PFNMAP VMA, which
+ * we can access using slightly different code.
+ */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+ vma = find_vma(mm, addr);
+ if (!vma)
+ break;
+ if (vma->vm_ops && vma->vm_ops->access)
+ ret = vma->vm_ops->access(vma, addr, buf,
+ len, write);
+ if (ret <= 0)
+#endif
+ break;
+ bytes = ret;
} else {
- copy_from_user_page(vma, page, addr,
- buf, maddr + offset, bytes);
+ bytes = len;
+ offset = addr & (PAGE_SIZE-1);
+ if (bytes > PAGE_SIZE-offset)
+ bytes = PAGE_SIZE-offset;
+
+ maddr = kmap(page);
+ if (write) {
+ copy_to_user_page(vma, page, addr,
+ maddr + offset, buf, bytes);
+ set_page_dirty_lock(page);
+ } else {
+ copy_from_user_page(vma, page, addr,
+ buf, maddr + offset, bytes);
+ }
+ kunmap(page);
+ page_cache_release(page);
}
- kunmap(page);
- page_cache_release(page);
len -= bytes;
buf += bytes;
addr += bytes;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe..89fee2dcb03 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res)
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info, struct page *page, int magic)
+static void get_page_bootmem(unsigned long info, struct page *page, int type)
{
- atomic_set(&page->_mapcount, magic);
+ atomic_set(&page->_mapcount, type);
SetPagePrivate(page);
set_page_private(page, info);
atomic_inc(&page->_count);
@@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
void put_page_bootmem(struct page *page)
{
- int magic;
+ int type;
- magic = atomic_read(&page->_mapcount);
- BUG_ON(magic >= -1);
+ type = atomic_read(&page->_mapcount);
+ BUG_ON(type >= -1);
if (atomic_dec_return(&page->_count) == 1) {
ClearPagePrivate(page);
@@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page)
}
-void register_page_bootmem_info_section(unsigned long start_pfn)
+static void register_page_bootmem_info_section(unsigned long start_pfn)
{
unsigned long *usemap, mapsize, section_nr, i;
struct mem_section *ms;
@@ -119,7 +119,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_INFO);
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
@@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
if (need_zonelists_rebuild)
build_all_zonelists();
- vm_total_pages = nr_free_pagecache_pages();
+ else
+ vm_total_pages = nr_free_pagecache_pages();
+
writeback_set_ratelimit();
if (onlined_pages)
@@ -455,7 +457,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
/* we can use NODE_DATA(nid) from here */
/* init node's zones as empty zones, we don't have any present pages.*/
- free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
+ free_area_init_node(nid, zones_size, start_pfn, zholes_size);
return pgdat;
}
@@ -521,6 +523,66 @@ EXPORT_SYMBOL_GPL(add_memory);
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
+ * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
+ * set and the size of the free page is given by page_order(). Using this,
+ * the function determines if the pageblock contains only free pages.
+ * Due to buddy contraints, a free page at least the size of a pageblock will
+ * be located at the start of the pageblock
+ */
+static inline int pageblock_free(struct page *page)
+{
+ return PageBuddy(page) && page_order(page) >= pageblock_order;
+}
+
+/* Return the start of the next active pageblock after a given page */
+static struct page *next_active_pageblock(struct page *page)
+{
+ int pageblocks_stride;
+
+ /* Ensure the starting page is pageblock-aligned */
+ BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+
+ /* Move forward by at least 1 * pageblock_nr_pages */
+ pageblocks_stride = 1;
+
+ /* If the entire pageblock is free, move to the end of free page */
+ if (pageblock_free(page))
+ pageblocks_stride += page_order(page) - pageblock_order;
+
+ return page + (pageblocks_stride * pageblock_nr_pages);
+}
+
+/* Checks if this range of memory is likely to be hot-removable. */
+int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+{
+ int type;
+ struct page *page = pfn_to_page(start_pfn);
+ struct page *end_page = page + nr_pages;
+
+ /* Check the starting page of each pageblock within the range */
+ for (; page < end_page; page = next_active_pageblock(page)) {
+ type = get_pageblock_migratetype(page);
+
+ /*
+ * A pageblock containing MOVABLE or free pages is considered
+ * removable
+ */
+ if (type != MIGRATE_MOVABLE && !pageblock_free(page))
+ return 0;
+
+ /*
+ * A pageblock starting with a PageReserved page is not
+ * considered removable.
+ */
+ if (PageReserved(page))
+ return 0;
+ }
+
+ /* All pageblocks in the memory block are likely to be hot-removable */
+ return 1;
+}
+
+/*
* Confirm all pages in a range [start, end) is belongs to the same zone.
*/
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a37a5034f63..83369058ec1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -729,7 +729,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
} else {
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
- *policy |= pol->flags;
+ /*
+ * Internal mempolicy flags must be masked off before exposing
+ * the policy to userspace.
+ */
+ *policy |= (pol->flags & MPOL_MODE_FLAGS);
}
if (vma) {
@@ -799,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
- LIST_HEAD(pagelist);
int busy = 0;
int err = 0;
nodemask_t tmp;
@@ -1477,7 +1480,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
zl = node_zonelist(interleave_nid(*mpol, vma, addr,
- HPAGE_SHIFT), gfp_flags);
+ huge_page_shift(hstate_vma(vma))), gfp_flags);
} else {
zl = policy_zonelist(gfp_flags, *mpol);
if ((*mpol)->mode == MPOL_BIND)
@@ -2216,9 +2219,12 @@ static void check_huge_range(struct vm_area_struct *vma,
{
unsigned long addr;
struct page *page;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
- for (addr = start; addr < end; addr += HPAGE_SIZE) {
- pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+ for (addr = start; addr < end; addr += sz) {
+ pte_t *ptep = huge_pte_offset(vma->vm_mm,
+ addr & huge_page_mask(h));
pte_t pte;
if (!ptep)
diff --git a/mm/migrate.c b/mm/migrate.c
index 449d77d409f..2a80136b23b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -9,7 +9,7 @@
* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
* Hirokazu Takahashi <taka@valinux.co.jp>
* Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter
*/
#include <linux/migrate.h>
@@ -30,6 +30,7 @@
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
+#include <linux/syscalls.h>
#include "internal.h"
@@ -284,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
page = migration_entry_to_page(entry);
- get_page(page);
+ /*
+ * Once radix-tree replacement of page migration started, page_count
+ * *must* be zero. And, we don't want to call wait_on_page_locked()
+ * against a page without get_page().
+ * So, we use get_page_unless_zero(), here. Even failed, page fault
+ * will occur again.
+ */
+ if (!get_page_unless_zero(page))
+ goto out;
pte_unmap_unlock(ptep, ptl);
wait_on_page_locked(page);
put_page(page);
@@ -304,6 +313,7 @@ out:
static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
+ int expected_count;
void **pslot;
if (!mapping) {
@@ -313,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
return 0;
}
- write_lock_irq(&mapping->tree_lock);
+ spin_lock_irq(&mapping->tree_lock);
pslot = radix_tree_lookup_slot(&mapping->page_tree,
page_index(page));
- if (page_count(page) != 2 + !!PagePrivate(page) ||
+ expected_count = 2 + !!PagePrivate(page);
+ if (page_count(page) != expected_count ||
(struct page *)radix_tree_deref_slot(pslot) != page) {
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ if (!page_freeze_refs(page, expected_count)) {
+ spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -337,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
radix_tree_replace_slot(pslot, newpage);
+ page_unfreeze_refs(page, expected_count);
/*
* Drop cache reference from old page.
* We know this isn't the last reference.
@@ -356,7 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
__dec_zone_page_state(page, NR_FILE_PAGES);
__inc_zone_page_state(newpage, NR_FILE_PAGES);
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
+ if (!PageSwapCache(newpage))
+ mem_cgroup_uncharge_cache_page(page);
return 0;
}
@@ -586,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
* establishing additional references. We are the only one
* holding a reference to the new page at this point.
*/
- if (TestSetPageLocked(newpage))
+ if (!trylock_page(newpage))
BUG();
/* Prepare mapping for the new page.*/
@@ -610,7 +629,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
rc = fallback_migrate_page(mapping, newpage, page);
if (!rc) {
- mem_cgroup_page_migration(page, newpage);
remove_migration_ptes(page, newpage);
} else
newpage->mapping = NULL;
@@ -640,8 +658,16 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
/* page was freed from under us. So we are done. */
goto move_newpage;
+ charge = mem_cgroup_prepare_migration(page, newpage);
+ if (charge == -ENOMEM) {
+ rc = -ENOMEM;
+ goto move_newpage;
+ }
+ /* prepare cgroup just returns 0 or -ENOMEM */
+ BUG_ON(charge);
+
rc = -EAGAIN;
- if (TestSetPageLocked(page)) {
+ if (!trylock_page(page)) {
if (!force)
goto move_newpage;
lock_page(page);
@@ -691,19 +717,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
goto rcu_unlock;
}
- charge = mem_cgroup_prepare_migration(page);
/* Establish migration ptes or remove ptes */
try_to_unmap(page, 1);
if (!page_mapped(page))
rc = move_to_new_page(newpage, page);
- if (rc) {
+ if (rc)
remove_migration_ptes(page, page);
- if (charge)
- mem_cgroup_end_migration(page);
- } else if (charge)
- mem_cgroup_end_migration(newpage);
rcu_unlock:
if (rcu_locked)
rcu_read_unlock();
@@ -724,6 +745,8 @@ unlock:
}
move_newpage:
+ if (!charge)
+ mem_cgroup_end_migration(newpage);
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
@@ -865,6 +888,11 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
goto set_status;
page = follow_page(vma, pp->addr, FOLL_GET);
+
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto set_status;
+
err = -ENOENT;
if (!page)
goto set_status;
@@ -928,6 +956,11 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
goto set_status;
page = follow_page(vma, pm->addr, 0);
+
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto set_status;
+
err = -ENOENT;
/* Use PageReserved to check for zero page */
if (!page || PageReserved(page))
@@ -1060,7 +1093,6 @@ out2:
mmput(mm);
return err;
}
-#endif
/*
* Call migration functions in the vma_ops that may prepare
@@ -1082,3 +1114,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
}
return err;
}
+#endif
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6..01fbe93eff5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -78,8 +78,6 @@ success:
mm->locked_vm -= pages;
out:
- if (ret == -ENOMEM)
- ret = -EAGAIN;
return ret;
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 00000000000..4e0e26591df
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
+/*
+ * mm_init.c - Memory initialisation verification and debugging
+ *
+ * Copyright 2008 IBM Corporation, 2008
+ * Author Mel Gorman <mel@csn.ul.ie>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/module.h>
+#include "internal.h"
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+int mminit_loglevel;
+
+#ifndef SECTIONS_SHIFT
+#define SECTIONS_SHIFT 0
+#endif
+
+/* The zonelists are simply reported, validation is manual. */
+void mminit_verify_zonelist(void)
+{
+ int nid;
+
+ if (mminit_loglevel < MMINIT_VERIFY)
+ return;
+
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct zone *zone;
+ struct zoneref *z;
+ struct zonelist *zonelist;
+ int i, listid, zoneid;
+
+ BUG_ON(MAX_ZONELISTS > 2);
+ for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
+
+ /* Identify the zone and nodelist */
+ zoneid = i % MAX_NR_ZONES;
+ listid = i / MAX_NR_ZONES;
+ zonelist = &pgdat->node_zonelists[listid];
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ /* Print information about the zonelist */
+ printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
+ listid > 0 ? "thisnode" : "general", nid,
+ zone->name);
+
+ /* Iterate the zonelist */
+ for_each_zone_zonelist(zone, z, zonelist, zoneid) {
+#ifdef CONFIG_NUMA
+ printk(KERN_CONT "%d:%s ",
+ zone->node, zone->name);
+#else
+ printk(KERN_CONT "0:%s ", zone->name);
+#endif /* CONFIG_NUMA */
+ }
+ printk(KERN_CONT "\n");
+ }
+ }
+}
+
+void __init mminit_verify_pageflags_layout(void)
+{
+ int shift, width;
+ unsigned long or_mask, add_mask;
+
+ shift = 8 * sizeof(unsigned long);
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
+ "Section %d Node %d Zone %d Flags %d\n",
+ SECTIONS_WIDTH,
+ NODES_WIDTH,
+ ZONES_WIDTH,
+ NR_PAGEFLAGS);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
+ "Section %d Node %d Zone %d\n",
+ SECTIONS_SHIFT,
+ NODES_SHIFT,
+ ZONES_SHIFT);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
+ "Section %lu Node %lu Zone %lu\n",
+ (unsigned long)SECTIONS_PGSHIFT,
+ (unsigned long)NODES_PGSHIFT,
+ (unsigned long)ZONES_PGSHIFT);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
+ "Zone ID: %lu -> %lu\n",
+ (unsigned long)ZONEID_PGOFF,
+ (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
+ "location: %d -> %d unused %d -> %d flags %d -> %d\n",
+ shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
+ "Node not in page flags");
+#endif
+
+ if (SECTIONS_WIDTH) {
+ shift -= SECTIONS_WIDTH;
+ BUG_ON(shift != SECTIONS_PGSHIFT);
+ }
+ if (NODES_WIDTH) {
+ shift -= NODES_WIDTH;
+ BUG_ON(shift != NODES_PGSHIFT);
+ }
+ if (ZONES_WIDTH) {
+ shift -= ZONES_WIDTH;
+ BUG_ON(shift != ZONES_PGSHIFT);
+ }
+
+ /* Check for bitmask overlaps */
+ or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
+ (NODES_MASK << NODES_PGSHIFT) |
+ (SECTIONS_MASK << SECTIONS_PGSHIFT);
+ add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
+ (NODES_MASK << NODES_PGSHIFT) +
+ (SECTIONS_MASK << SECTIONS_PGSHIFT);
+ BUG_ON(or_mask != add_mask);
+}
+
+void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
+ unsigned long nid, unsigned long pfn)
+{
+ BUG_ON(page_to_nid(page) != nid);
+ BUG_ON(page_zonenum(page) != zone);
+ BUG_ON(page_to_pfn(page) != pfn);
+}
+
+static __init int set_mminit_loglevel(char *str)
+{
+ get_option(&str, &mminit_loglevel);
+ return 0;
+}
+early_param("mminit_loglevel", set_mminit_loglevel);
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+struct kobject *mm_kobj;
+EXPORT_SYMBOL_GPL(mm_kobj);
+
+static int __init mm_sysfs_init(void)
+{
+ mm_kobj = kobject_create_and_add("mm", kernel_kobj);
+ if (!mm_kobj)
+ return -ENOMEM;
+
+ return 0;
+}
+
+__initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index 3354fdd83d4..e7a5a68a9c2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,12 +26,15 @@
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
+#include "internal.h"
+
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags) (0)
#endif
@@ -72,8 +75,9 @@ pgprot_t protection_map[16] = {
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- return protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+ return __pgprot(pgprot_val(protection_map[vm_flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
+ pgprot_val(arch_vm_get_page_prot(vm_flags)));
}
EXPORT_SYMBOL(vm_get_page_prot);
@@ -366,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
if (vma_tmp->vm_end > addr) {
vma = vma_tmp;
if (vma_tmp->vm_start <= addr)
- return vma;
+ break;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
@@ -1026,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
+ /*
+ * Ignore pgoff.
+ */
+ pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
@@ -1107,6 +1115,9 @@ munmap_back:
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
+ if (flags & MAP_NORESERVE)
+ vm_flags |= VM_NORESERVE;
+
if (accountable && (!(flags & MAP_NORESERVE) ||
sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
if (vm_flags & VM_SHARED) {
@@ -1762,7 +1773,7 @@ static void unmap_region(struct mm_struct *mm,
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+ free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
}
@@ -1806,7 +1817,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
struct mempolicy *pol;
struct vm_area_struct *new;
- if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
+ if (is_vm_hugetlb_page(vma) && (addr &
+ ~(huge_page_mask(hstate_vma(vma)))))
return -EINVAL;
if (mm->map_count >= sysctl_max_map_count)
@@ -2054,6 +2066,7 @@ void exit_mmap(struct mm_struct *mm)
/* mm's last user has gone, and its about to be pulled down */
arch_exit_mmap(mm);
+ mmu_notifier_release(mm);
lru_add_drain();
flush_cache_mm(mm);
@@ -2062,7 +2075,7 @@ void exit_mmap(struct mm_struct *mm)
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
/*
@@ -2261,3 +2274,167 @@ int install_special_mapping(struct mm_struct *mm,
return 0;
}
+
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
+{
+ if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ /*
+ * The LSB of head.next can't change from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+ /*
+ * We can safely modify head.next after taking the
+ * anon_vma->lock. If some other vma in this mm shares
+ * the same anon_vma we won't take it again.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us thanks to the
+ * anon_vma->lock.
+ */
+ if (__test_and_set_bit(0, (unsigned long *)
+ &anon_vma->head.next))
+ BUG();
+ }
+}
+
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+{
+ if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change from under us because
+ * we hold the mm_all_locks_mutex.
+ *
+ * Operations on ->flags have to be atomic because
+ * even if AS_MM_ALL_LOCKS is stable thanks to the
+ * mm_all_locks_mutex, there may be other cpus
+ * changing other bitflags in parallel to us.
+ */
+ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+ BUG();
+ spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+ }
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to avoid new
+ * anon_vmas to be associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid to take the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We can take all the locks in random order because the VM code
+ * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * takes more than one of them in a row. Secondly we're protected
+ * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks are expensive operations
+ * that may have to take thousand of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int ret = -EINTR;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+
+ mutex_lock(&mm_all_locks_mutex);
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ vm_lock_anon_vma(mm, vma->anon_vma);
+ }
+
+ ret = 0;
+
+out_unlock:
+ if (ret)
+ mm_drop_all_locks(mm);
+
+ return ret;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+ if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ /*
+ * The LSB of head.next can't change to 0 from under
+ * us because we hold the mm_all_locks_mutex.
+ *
+ * We must however clear the bitflag before unlocking
+ * the vma so the users using the anon_vma->head will
+ * never see our bitflag.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us until we release the
+ * anon_vma->lock.
+ */
+ if (!__test_and_clear_bit(0, (unsigned long *)
+ &anon_vma->head.next))
+ BUG();
+ spin_unlock(&anon_vma->lock);
+ }
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+ if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change to 0 from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ spin_unlock(&mapping->i_mmap_lock);
+ if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+ &mapping->flags))
+ BUG();
+ }
+}
+
+/*
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->anon_vma)
+ vm_unlock_anon_vma(vma->anon_vma);
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_unlock_mapping(vma->vm_file->f_mapping);
+ }
+
+ mutex_unlock(&mm_all_locks_mutex);
+}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 00000000000..5f4ef0250be
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
+/*
+ * linux/mm/mmu_notifier.c
+ *
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright (C) 2008 SGI
+ * Christoph Lameter <clameter@sgi.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+/*
+ * This function can't run concurrently against mmu_notifier_register
+ * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
+ * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
+ * in parallel despite there being no task using this mm any more,
+ * through the vmas outside of the exit_mmap context, such as with
+ * vmtruncate. This serializes against mmu_notifier_unregister with
+ * the mmu_notifier_mm->lock in addition to RCU and it serializes
+ * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * can't go away from under us as exit_mmap holds an mm_count pin
+ * itself.
+ */
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
+ mn = hlist_entry(mm->mmu_notifier_mm->list.first,
+ struct mmu_notifier,
+ hlist);
+ /*
+ * We arrived before mmu_notifier_unregister so
+ * mmu_notifier_unregister will do nothing other than
+ * to wait ->release to finish and
+ * mmu_notifier_unregister to return.
+ */
+ hlist_del_init_rcu(&mn->hlist);
+ /*
+ * RCU here will block mmu_notifier_unregister until
+ * ->release returns.
+ */
+ rcu_read_lock();
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ /*
+ * if ->release runs before mmu_notifier_unregister it
+ * must be handled as it's the only way for the driver
+ * to flush all existing sptes and stop the driver
+ * from establishing any more sptes before all the
+ * pages in the mm are freed.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ rcu_read_unlock();
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ }
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ /*
+ * synchronize_rcu here prevents mmu_notifier_release to
+ * return to exit_mmap (which would proceed freeing all pages
+ * in the mm) until the ->release method returns, if it was
+ * invoked by mmu_notifier_unregister.
+ *
+ * The mmu_notifier_mm can't go away from under us because one
+ * mm_count is hold by exit_mmap.
+ */
+ synchronize_rcu();
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+ int young = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->clear_flush_young)
+ young |= mn->ops->clear_flush_young(mn, mm, address);
+ }
+ rcu_read_unlock();
+
+ return young;
+}
+
+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_page)
+ mn->ops->invalidate_page(mn, mm, address);
+ }
+ rcu_read_unlock();
+}
+
+void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_range_start)
+ mn->ops->invalidate_range_start(mn, mm, start, end);
+ }
+ rcu_read_unlock();
+}
+
+void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_range_end)
+ mn->ops->invalidate_range_end(mn, mm, start, end);
+ }
+ rcu_read_unlock();
+}
+
+static int do_mmu_notifier_register(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ int take_mmap_sem)
+{
+ struct mmu_notifier_mm *mmu_notifier_mm;
+ int ret;
+
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+
+ ret = -ENOMEM;
+ mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+ if (unlikely(!mmu_notifier_mm))
+ goto out;
+
+ if (take_mmap_sem)
+ down_write(&mm->mmap_sem);
+ ret = mm_take_all_locks(mm);
+ if (unlikely(ret))
+ goto out_cleanup;
+
+ if (!mm_has_notifiers(mm)) {
+ INIT_HLIST_HEAD(&mmu_notifier_mm->list);
+ spin_lock_init(&mmu_notifier_mm->lock);
+ mm->mmu_notifier_mm = mmu_notifier_mm;
+ mmu_notifier_mm = NULL;
+ }
+ atomic_inc(&mm->mm_count);
+
+ /*
+ * Serialize the update against mmu_notifier_unregister. A
+ * side note: mmu_notifier_release can't run concurrently with
+ * us because we hold the mm_users pin (either implicitly as
+ * current->mm or explicitly with get_task_mm() or similar).
+ * We can't race against any other mmu notifier method either
+ * thanks to mm_take_all_locks().
+ */
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ mm_drop_all_locks(mm);
+out_cleanup:
+ if (take_mmap_sem)
+ up_write(&mm->mmap_sem);
+ /* kfree() does nothing if mmu_notifier_mm is NULL */
+ kfree(mmu_notifier_mm);
+out:
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ return ret;
+}
+
+/*
+ * Must not hold mmap_sem nor any other VM related lock when calling
+ * this registration function. Must also ensure mm_users can't go down
+ * to zero while this runs to avoid races with mmu_notifier_release,
+ * so mm has to be current->mm or the mm should be pinned safely such
+ * as with get_task_mm(). If the mm is not current->mm, the mm_users
+ * pin should be released by calling mmput after mmu_notifier_register
+ * returns. mmu_notifier_unregister must be always called to
+ * unregister the notifier. mm_count is automatically pinned to allow
+ * mmu_notifier_unregister to safely run at any time later, before or
+ * after exit_mmap. ->release will always be called before exit_mmap
+ * frees the pages.
+ */
+int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ return do_mmu_notifier_register(mn, mm, 1);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+/*
+ * Same as mmu_notifier_register but here the caller must hold the
+ * mmap_sem in write mode.
+ */
+int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ return do_mmu_notifier_register(mn, mm, 0);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);
+
+/* this is called after the last mmu_notifier_unregister() returned */
+void __mmu_notifier_mm_destroy(struct mm_struct *mm)
+{
+ BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
+ kfree(mm->mmu_notifier_mm);
+ mm->mmu_notifier_mm = LIST_POISON1; /* debug */
+}
+
+/*
+ * This releases the mm_count pin automatically and frees the mm
+ * structure if it was the last user of it. It serializes against
+ * running mmu notifiers with RCU and against mmu_notifier_unregister
+ * with the unregister lock + RCU. All sptes must be dropped before
+ * calling mmu_notifier_unregister. ->release or any other notifier
+ * method may be invoked concurrently with mmu_notifier_unregister,
+ * and only after mmu_notifier_unregister returned we're guaranteed
+ * that ->release or any other method can't run anymore.
+ */
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ if (!hlist_unhashed(&mn->hlist)) {
+ hlist_del_rcu(&mn->hlist);
+
+ /*
+ * RCU here will force exit_mmap to wait ->release to finish
+ * before freeing the pages.
+ */
+ rcu_read_lock();
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ /*
+ * exit_mmap will block in mmu_notifier_release to
+ * guarantee ->release is called before freeing the
+ * pages.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ rcu_read_unlock();
+ } else
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ /*
+ * Wait any running method to finish, of course including
+ * ->release if it was run by mmu_notifier_relase instead of us.
+ */
+ synchronize_rcu();
+
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+
+ mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6..16ce8b955dc 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
(z->zone && !zref_in_nodemask(z, nodes)))
z++;
- *zone = zonelist_zone(z++);
+ *zone = zonelist_zone(z);
return z;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a5bf31c2737..fded06f923f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -47,19 +48,17 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (pte_present(oldpte)) {
pte_t ptent;
- /* Avoid an SMP race with hardware updated dirty/clean
- * bits by wiping the pte and then setting the new pte
- * into place.
- */
- ptent = ptep_get_and_clear(mm, addr, pte);
+ ptent = ptep_modify_prot_start(mm, addr, pte);
ptent = pte_modify(ptent, newprot);
+
/*
* Avoid taking write faults for pages we know to be
* dirty.
*/
if (dirty_accountable && pte_dirty(ptent))
ptent = pte_mkwrite(ptent);
- set_pte_at(mm, addr, pte, ptent);
+
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
#ifdef CONFIG_MIGRATION
} else if (!pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -155,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
* make it unwritable again.
- *
- * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
- * a MAP_NORESERVE private mapping to writable will now reserve.
*/
if (newflags & VM_WRITE) {
- if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
+ if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
+ VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
if (security_vm_enough_memory(charged))
return -ENOMEM;
@@ -207,10 +204,12 @@ success:
dirty_accountable = 1;
}
+ mmu_notifier_invalidate_range_start(mm, start, end);
if (is_vm_hugetlb_page(vma))
hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
else
change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+ mmu_notifier_invalidate_range_end(mm, start, end);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
return 0;
@@ -239,7 +238,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
end = start + len;
if (end <= start)
return -ENOMEM;
- if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
+ if (!arch_validate_prot(prot))
return -EINVAL;
reqprot = prot;
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd1..1a7743923c8 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
+ unsigned long old_start;
+ old_start = old_addr;
+ mmu_notifier_invalidate_range_start(vma->vm_mm,
+ old_start, old_end);
if (vma->vm_file) {
/*
* Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb..ed75bc962fb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/ptrace.h>
+#include <linux/tracehook.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
@@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vmalloc_node);
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+/**
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ * @size: allocation size
+ *
+ * Kernel-internal function to allocate enough pages to cover @size
+ * the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+
+void *vmalloc_exec(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+}
+
/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
* @size: allocation size
@@ -745,7 +766,7 @@ static unsigned long determine_vm_flags(struct file *file,
* it's being traced - otherwise breakpoints set in it may interfere
* with another untraced process
*/
- if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
+ if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
vm_flags &= ~VM_MAYSHARE;
return vm_flags;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee626..64e5b4bcd96 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/security.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* Superuser processes are usually more important, so we make it
* less likely that we kill those.
*/
- if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
+ if (has_capability(p, CAP_SYS_ADMIN) ||
+ has_capability(p, CAP_SYS_RESOURCE))
points /= 4;
/*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* tend to only have this flag set on applications they think
* of as important.
*/
- if (__capable(p, CAP_SYS_RAWIO))
+ if (has_capability(p, CAP_SYS_RAWIO))
points /= 4;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6adbef3..24de8b65fdb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages);
static struct prop_descriptor vm_completions;
static struct prop_descriptor vm_dirties;
-static unsigned long determine_dirtyable_memory(void);
-
/*
* couple the period to the dirty_ratio:
*
@@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
#endif
}
-static unsigned long determine_dirtyable_memory(void)
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the numebr of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+unsigned long determine_dirtyable_memory(void)
{
unsigned long x;
@@ -956,6 +960,9 @@ retry:
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+
+ if (wbc->range_cont)
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);
@@ -1081,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
if (!mapping)
return 1;
- write_lock_irq(&mapping->tree_lock);
+ spin_lock_irq(&mapping->tree_lock);
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
@@ -1095,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1251,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
- write_lock_irqsave(&mapping->tree_lock, flags);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
if (ret) {
radix_tree_tag_clear(&mapping->page_tree,
@@ -1262,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
__bdi_writeout_inc(bdi);
}
}
- write_unlock_irqrestore(&mapping->tree_lock, flags);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
}
@@ -1280,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
- write_lock_irqsave(&mapping->tree_lock, flags);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
if (!ret) {
radix_tree_tag_set(&mapping->page_tree,
@@ -1293,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
- write_unlock_irqrestore(&mapping->tree_lock, flags);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2f552955a02..27b8681139f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
- unsigned long __initdata required_kernelcore;
+ static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
- unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+ static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -264,17 +264,18 @@ static void free_compound_page(struct page *page)
__free_pages_ok(page, compound_order(page));
}
-static void prep_compound_page(struct page *page, unsigned long order)
+void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
+ struct page *p = page + 1;
set_compound_page_dtor(page, free_compound_page);
set_compound_order(page, order);
__SetPageHead(page);
- for (i = 1; i < nr_pages; i++) {
- struct page *p = page + i;
-
+ for (i = 1; i < nr_pages; i++, p++) {
+ if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
+ p = pfn_to_page(page_to_pfn(page) + i);
__SetPageTail(p);
p->first_page = page;
}
@@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
+ struct page *p = page + 1;
if (unlikely(compound_order(page) != order))
bad_page(page);
@@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
if (unlikely(!PageHead(page)))
bad_page(page);
__ClearPageHead(page);
- for (i = 1; i < nr_pages; i++) {
- struct page *p = page + i;
+ for (i = 1; i < nr_pages; i++, p++) {
+ if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
+ p = pfn_to_page(page_to_pfn(page) + i);
if (unlikely(!PageTail(p) |
(p->first_page != page)))
@@ -432,8 +435,9 @@ static inline void __free_one_page(struct page *page,
buddy = __page_find_buddy(page, page_idx, order);
if (!page_is_buddy(page, buddy, order))
- break; /* Move the buddy up one level. */
+ break;
+ /* Our buddy is free, merge with it and move up one order. */
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
@@ -532,7 +536,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
/*
* permit the bootmem allocator to evade page validation on high-order frees
*/
-void __free_pages_bootmem(struct page *page, unsigned int order)
+void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
if (order == 0) {
__ClearPageReserved(page);
@@ -673,9 +677,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
-int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
- int migratetype)
+static int move_freepages(struct zone *zone,
+ struct page *start_page, struct page *end_page,
+ int migratetype)
{
struct page *page;
unsigned long order;
@@ -693,6 +697,9 @@ int move_freepages(struct zone *zone,
#endif
for (page = start_page; page <= end_page;) {
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
@@ -714,7 +721,8 @@ int move_freepages(struct zone *zone,
return pages_moved;
}
-int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
+static int move_freepages_block(struct zone *zone, struct page *page,
+ int migratetype)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
@@ -918,7 +926,7 @@ void drain_local_pages(void *arg)
*/
void drain_all_pages(void)
{
- on_each_cpu(drain_local_pages, NULL, 0, 1);
+ on_each_cpu(drain_local_pages, NULL, 1);
}
#ifdef CONFIG_HIBERNATION
@@ -1429,7 +1437,7 @@ try_next_zone:
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-static struct page *
+struct page *
__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
@@ -1632,22 +1640,7 @@ nopage:
got_pg:
return page;
}
-
-struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist)
-{
- return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
-}
-
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
-{
- return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
-}
-
-EXPORT_SYMBOL(__alloc_pages);
+EXPORT_SYMBOL(__alloc_pages_internal);
/*
* Common helper functions.
@@ -1711,6 +1704,59 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
+/**
+ * alloc_pages_exact - allocate an exact number physically-contiguous pages.
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * This function is similar to alloc_pages(), except that it allocates the
+ * minimum number of pages to satisfy the request. alloc_pages() can only
+ * allocate memory in power-of-two pages.
+ *
+ * This function is also limited by MAX_ORDER.
+ *
+ * Memory allocated by this function must be released by free_pages_exact().
+ */
+void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
+{
+ unsigned int order = get_order(size);
+ unsigned long addr;
+
+ addr = __get_free_pages(gfp_mask, order);
+ if (addr) {
+ unsigned long alloc_end = addr + (PAGE_SIZE << order);
+ unsigned long used = addr + PAGE_ALIGN(size);
+
+ split_page(virt_to_page(addr), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
+
+ return (void *)addr;
+}
+EXPORT_SYMBOL(alloc_pages_exact);
+
+/**
+ * free_pages_exact - release memory allocated via alloc_pages_exact()
+ * @virt: the value returned by alloc_pages_exact.
+ * @size: size of allocation, same value as passed to alloc_pages_exact().
+ *
+ * Release the memory allocated by a previous call to alloc_pages_exact.
+ */
+void free_pages_exact(void *virt, size_t size)
+{
+ unsigned long addr = (unsigned long)virt;
+ unsigned long end = addr + PAGE_ALIGN(size);
+
+ while (addr < end) {
+ free_page(addr);
+ addr += PAGE_SIZE;
+ }
+}
+EXPORT_SYMBOL(free_pages_exact);
+
static unsigned int nr_free_zone_pages(int offset)
{
struct zoneref *z;
@@ -2328,12 +2374,11 @@ static void build_zonelists(pg_data_t *pgdat)
static void build_zonelist_cache(pg_data_t *pgdat)
{
pgdat->node_zonelists[0].zlcache_ptr = NULL;
- pgdat->node_zonelists[1].zlcache_ptr = NULL;
}
#endif /* CONFIG_NUMA */
-/* return values int ....just for stop_machine_run() */
+/* return values int ....just for stop_machine() */
static int __build_all_zonelists(void *dummy)
{
int nid;
@@ -2353,11 +2398,12 @@ void build_all_zonelists(void)
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
+ mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
} else {
/* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+ stop_machine(__build_all_zonelists, NULL, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -2476,6 +2522,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
continue;
page = pfn_to_page(pfn);
+ /* Watch out for overlapping nodes */
+ if (page_to_nid(page) != zone_to_nid(zone))
+ continue;
+
/* Blocks with reserved pages will never free, skip them. */
if (PageReserved(page))
continue;
@@ -2535,6 +2585,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
+ mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
SetPageReserved(page);
@@ -2612,7 +2663,7 @@ static int zone_batchsize(struct zone *zone)
return batch;
}
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
struct per_cpu_pages *pcp;
@@ -2837,6 +2888,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
zone->zone_start_pfn = zone_start_pfn;
+ mminit_dprintk(MMINIT_TRACE, "memmap_init",
+ "Initialising map node %d zone %lu pfns %lu -> %lu\n",
+ pgdat->node_id,
+ (unsigned long)zone_idx(zone),
+ zone_start_pfn, (zone_start_pfn + size));
+
zone_init_free_lists(zone);
return 0;
@@ -2930,6 +2987,18 @@ void __init free_bootmem_with_active_regions(int nid,
}
}
+void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
+{
+ int i;
+ int ret;
+
+ for_each_active_range_index_in_nid(i, nid) {
+ ret = work_fn(early_node_map[i].start_pfn,
+ early_node_map[i].end_pfn, data);
+ if (ret)
+ break;
+ }
+}
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -2964,7 +3033,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
void __init push_node_boundaries(unsigned int nid,
unsigned long start_pfn, unsigned long end_pfn)
{
- printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
+ mminit_dprintk(MMINIT_TRACE, "zoneboundary",
+ "Entering push_node_boundaries(%u, %lu, %lu)\n",
nid, start_pfn, end_pfn);
/* Initialise the boundary for this node if necessary */
@@ -2982,7 +3052,8 @@ void __init push_node_boundaries(unsigned int nid,
static void __meminit account_node_boundary(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
- printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
+ mminit_dprintk(MMINIT_TRACE, "zoneboundary",
+ "Entering account_node_boundary(%u, %lu, %lu)\n",
nid, *start_pfn, *end_pfn);
/* Return if boundary information has not been provided */
@@ -3039,7 +3110,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
* assumption is made that zones within a node are ordered in monotonic
* increasing memory addresses so that the "highest" populated zone is used
*/
-void __init find_usable_zone_for_movable(void)
+static void __init find_usable_zone_for_movable(void)
{
int zone_index;
for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3065,7 +3136,7 @@ void __init find_usable_zone_for_movable(void)
* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
* zones within a node are in order of monotonic increases memory addresses
*/
-void __meminit adjust_zone_range_for_zone_movable(int nid,
+static void __meminit adjust_zone_range_for_zone_movable(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -3126,7 +3197,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
*/
-unsigned long __meminit __absent_pages_in_range(int nid,
+static unsigned long __meminit __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
@@ -3357,8 +3428,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
if (realsize >= memmap_pages) {
realsize -= memmap_pages;
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
+ mminit_dprintk(MMINIT_TRACE, "memmap_init",
+ "%s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
@@ -3368,7 +3439,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
/* Account for reserved pages */
if (j == 0 && realsize > dma_reserve) {
realsize -= dma_reserve;
- printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
+ mminit_dprintk(MMINIT_TRACE, "memmap_init",
+ "%s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
@@ -3453,15 +3525,21 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
-void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long node_start_pfn,
- unsigned long *zholes_size)
+void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
+ unsigned long node_start_pfn, unsigned long *zholes_size)
{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
+ nid, (unsigned long)pgdat,
+ (unsigned long)pgdat->node_mem_map);
+#endif
free_area_init_core(pgdat, zones_size, zholes_size);
}
@@ -3504,10 +3582,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
{
int i;
- printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
- "%d entries of %d used\n",
- nid, start_pfn, end_pfn,
- nr_nodemap_entries, MAX_ACTIVE_REGIONS);
+ mminit_dprintk(MMINIT_TRACE, "memory_register",
+ "Entering add_active_range(%d, %#lx, %#lx) "
+ "%d entries of %d used\n",
+ nid, start_pfn, end_pfn,
+ nr_nodemap_entries, MAX_ACTIVE_REGIONS);
+
+ mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
/* Merge with existing active regions if possible */
for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3548,27 +3629,68 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
}
/**
- * shrink_active_range - Shrink an existing registered range of PFNs
+ * remove_active_range - Shrink an existing registered range of PFNs
* @nid: The node id the range is on that should be shrunk
- * @old_end_pfn: The old end PFN of the range
- * @new_end_pfn: The new PFN of the range
+ * @start_pfn: The new PFN of the range
+ * @end_pfn: The new PFN of the range
*
* i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept at the end physical page range that has already been
- * registered with add_active_range(). This function allows an arch to shrink
- * an existing registered range.
+ * The map is kept near the end physical page range that has already been
+ * registered. This function allows an arch to shrink an existing registered
+ * range.
*/
-void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
- unsigned long new_end_pfn)
+void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
{
- int i;
+ int i, j;
+ int removed = 0;
+
+ printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
+ nid, start_pfn, end_pfn);
/* Find the old active region end and shrink */
- for_each_active_range_index_in_nid(i, nid)
- if (early_node_map[i].end_pfn == old_end_pfn) {
- early_node_map[i].end_pfn = new_end_pfn;
- break;
+ for_each_active_range_index_in_nid(i, nid) {
+ if (early_node_map[i].start_pfn >= start_pfn &&
+ early_node_map[i].end_pfn <= end_pfn) {
+ /* clear it */
+ early_node_map[i].start_pfn = 0;
+ early_node_map[i].end_pfn = 0;
+ removed = 1;
+ continue;
+ }
+ if (early_node_map[i].start_pfn < start_pfn &&
+ early_node_map[i].end_pfn > start_pfn) {
+ unsigned long temp_end_pfn = early_node_map[i].end_pfn;
+ early_node_map[i].end_pfn = start_pfn;
+ if (temp_end_pfn > end_pfn)
+ add_active_range(nid, end_pfn, temp_end_pfn);
+ continue;
+ }
+ if (early_node_map[i].start_pfn >= start_pfn &&
+ early_node_map[i].end_pfn > end_pfn &&
+ early_node_map[i].start_pfn < end_pfn) {
+ early_node_map[i].start_pfn = end_pfn;
+ continue;
}
+ }
+
+ if (!removed)
+ return;
+
+ /* remove the blank ones */
+ for (i = nr_nodemap_entries - 1; i > 0; i--) {
+ if (early_node_map[i].nid != nid)
+ continue;
+ if (early_node_map[i].end_pfn)
+ continue;
+ /* we found it, get rid of it */
+ for (j = i; j < nr_nodemap_entries - 1; j++)
+ memcpy(&early_node_map[j], &early_node_map[j+1],
+ sizeof(early_node_map[j]));
+ j = nr_nodemap_entries - 1;
+ memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
+ nr_nodemap_entries--;
+ }
}
/**
@@ -3612,7 +3734,7 @@ static void __init sort_node_map(void)
}
/* Find the lowest pfn for a node */
-unsigned long __init find_min_pfn_for_node(unsigned long nid)
+static unsigned long __init find_min_pfn_for_node(int nid)
{
int i;
unsigned long min_pfn = ULONG_MAX;
@@ -3623,7 +3745,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid)
if (min_pfn == ULONG_MAX) {
printk(KERN_WARNING
- "Could not find start_pfn for node %lu\n", nid);
+ "Could not find start_pfn for node %d\n", nid);
return 0;
}
@@ -3641,23 +3763,6 @@ unsigned long __init find_min_pfn_with_active_regions(void)
return find_min_pfn_for_node(MAX_NUMNODES);
}
-/**
- * find_max_pfn_with_active_regions - Find the maximum PFN registered
- *
- * It returns the maximum PFN based on information provided via
- * add_active_range().
- */
-unsigned long __init find_max_pfn_with_active_regions(void)
-{
- int i;
- unsigned long max_pfn = 0;
-
- for (i = 0; i < nr_nodemap_entries; i++)
- max_pfn = max(max_pfn, early_node_map[i].end_pfn);
-
- return max_pfn;
-}
-
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
@@ -3684,7 +3789,7 @@ static unsigned long __init early_calculate_totalpages(void)
* memory. When they don't, some nodes will have more kernelcore than
* others
*/
-void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
{
int i, nid;
unsigned long usable_startpfn;
@@ -3879,7 +3984,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
- printk(" %-8s %8lu -> %8lu\n",
+ printk(" %-8s %0#10lx -> %0#10lx\n",
zone_names[i],
arch_zone_lowest_possible_pfn[i],
arch_zone_highest_possible_pfn[i]);
@@ -3895,15 +4000,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Print out the early_node_map[] */
printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
for (i = 0; i < nr_nodemap_entries; i++)
- printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
+ printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
early_node_map[i].start_pfn,
early_node_map[i].end_pfn);
/* Initialise every node */
+ mminit_verify_pageflags_layout();
setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
- free_area_init_node(nid, pgdat, NULL,
+ free_area_init_node(nid, NULL,
find_min_pfn_for_node(nid), NULL);
/* Any memory on that node */
@@ -3968,15 +4074,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
-static bootmem_data_t contig_bootmem_data;
-struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
-
+struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
EXPORT_SYMBOL(contig_page_data);
#endif
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_node(0, NODE_DATA(0), zones_size,
+ free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
@@ -4343,7 +4447,7 @@ void *__init alloc_large_system_hash(const char *tablename,
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = alloc_bootmem(size);
+ table = alloc_bootmem_nopanic(size);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58033c..b70a7fec1ff 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -2,7 +2,6 @@
* linux/mm/page_isolation.c
*/
-#include <stddef.h>
#include <linux/mm.h>
#include <linux/page-isolation.h>
#include <linux/pageblock-flags.h>
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long pfn;
+ unsigned long pfn, flags;
struct page *page;
+ struct zone *zone;
+ int ret;
pfn = start_pfn;
/*
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
if (pfn < end_pfn)
return -EBUSY;
/* Check all pages are free or Marked as ISOLATED */
- if (__test_page_isolated_in_pageblock(start_pfn, end_pfn))
- return 0;
- return -EBUSY;
+ zone = page_zone(pfn_to_page(pfn));
+ spin_lock_irqsave(&zone->lock, flags);
+ ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret ? 0 : -EBUSY;
}
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b97..0cbe0c60c6b 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
* Thread creation: For how long have there been zero
* available threads?
*/
- if (jiffies - last_empty_jifs > 1 * HZ) {
+ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
/* unlocked list_empty() test is OK here */
if (list_empty(&pdflush_list)) {
/* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
continue;
pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
- if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
+ if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
/* Limit exit rate */
pdf->when_i_went_to_sleep = jiffies;
break; /* exeunt */
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7cb39..8dbb6805ef3 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
static unsigned long max_pages(unsigned long min_pages)
{
unsigned long node_free_pages, max;
- struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
+ int node = numa_node_id();
+ struct zone *zones = NODE_DATA(node)->node_zones;
+ int num_cpus_on_node;
+ node_to_cpumask_ptr(cpumask_on_node, node);
node_free_pages =
#ifdef CONFIG_ZONE_DMA
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages)
zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
max = node_free_pages / FRACTION_OF_NODE_MEM;
+
+ num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
+ max /= num_cpus_on_node;
+
return max(max, min_pages);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f649..77e8ddf945e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
if (hit_readahead_marker) {
pgoff_t start;
- read_lock_irq(&mapping->tree_lock);
- start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
- read_unlock_irq(&mapping->tree_lock);
+ rcu_read_lock();
+ start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
+ rcu_read_unlock();
if (!start || start - offset > max)
return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8..0383acfcb06 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
anon_vma_free(anon_vma);
}
-static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
+static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
@@ -223,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
/*
* Check that @page is mapped at @address into @mm.
*
+ * If @sync is false, page_check_address may perform a racy check to avoid
+ * the page table lock when the pte is not present (helpful when reclaiming
+ * highly shared pages).
+ *
* On success returns with pte mapped and locked.
*/
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
- unsigned long address, spinlock_t **ptlp)
+ unsigned long address, spinlock_t **ptlp, int sync)
{
pgd_t *pgd;
pud_t *pud;
@@ -248,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
- if (!pte_present(*pte)) {
+ if (!sync && !pte_present(*pte)) {
pte_unmap(pte);
return NULL;
}
@@ -280,14 +285,14 @@ static int page_referenced_one(struct page *page,
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
if (vma->vm_flags & VM_LOCKED) {
referenced++;
*mapcount = 1; /* break early from loop */
- } else if (ptep_clear_flush_young(vma, address, pte))
+ } else if (ptep_clear_flush_young_notify(vma, address, pte))
referenced++;
/* Pretend the page is referenced if the task has the
@@ -421,7 +426,7 @@ int page_referenced(struct page *page, int is_locked,
referenced += page_referenced_anon(page, mem_cont);
else if (is_locked)
referenced += page_referenced_file(page, mem_cont);
- else if (TestSetPageLocked(page))
+ else if (!trylock_page(page))
referenced++;
else {
if (page->mapping)
@@ -449,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
goto out;
@@ -457,7 +462,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
pte_t entry;
flush_cache_page(vma, address, pte_pfn(*pte));
- entry = ptep_clear_flush(vma, address, pte);
+ entry = ptep_clear_flush_notify(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(mm, address, pte, entry);
@@ -576,14 +581,8 @@ void page_add_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
if (atomic_inc_and_test(&page->_mapcount))
__page_set_anon_rmap(page, vma, address);
- else {
+ else
__page_check_anon_rmap(page, vma, address);
- /*
- * We unconditionally charged during prepare, we uncharge here
- * This takes care of balancing the reference counts
- */
- mem_cgroup_uncharge_page(page);
- }
}
/**
@@ -614,12 +613,6 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount))
__inc_zone_page_state(page, NR_FILE_MAPPED);
- else
- /*
- * We unconditionally charged during prepare, we uncharge here
- * This takes care of balancing the reference counts
- */
- mem_cgroup_uncharge_page(page);
}
#ifdef CONFIG_DEBUG_VM
@@ -670,6 +663,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
}
/*
+ * Now that the last pte has gone, s390 must transfer dirty
+ * flag from storage key to struct page. We can usually skip
+ * this if the page is anon, so about to be freed; but perhaps
+ * not if it's in swapcache - there might be another pte slot
+ * containing the swap entry, but page not yet written to swap.
+ */
+ if ((!PageAnon(page) || PageSwapCache(page)) &&
+ page_test_dirty(page)) {
+ page_clear_dirty(page);
+ set_page_dirty(page);
+ }
+
+ mem_cgroup_uncharge_page(page);
+ __dec_zone_page_state(page,
+ PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+ /*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
* which increments mapcount after us but sets mapping
@@ -678,14 +687,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
- if (page_test_dirty(page)) {
- page_clear_dirty(page);
- set_page_dirty(page);
- }
- mem_cgroup_uncharge_page(page);
-
- __dec_zone_page_state(page,
- PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
}
}
@@ -707,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -717,14 +718,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* skipped over this mm) then we should reactivate it.
*/
if (!migration && ((vma->vm_flags & VM_LOCKED) ||
- (ptep_clear_flush_young(vma, address, pte)))) {
+ (ptep_clear_flush_young_notify(vma, address, pte)))) {
ret = SWAP_FAIL;
goto out_unmap;
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush(vma, address, pte);
+ pteval = ptep_clear_flush_notify(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -849,12 +850,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
page = vm_normal_page(vma, address, *pte);
BUG_ON(!page || PageAnon(page));
- if (ptep_clear_flush_young(vma, address, pte))
+ if (ptep_clear_flush_young_notify(vma, address, pte))
continue;
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
+ pteval = ptep_clear_flush_notify(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e..04fb4f1ab88 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,20 +922,26 @@ found:
error = 1;
if (!inode)
goto out;
- /* Precharge page while we can wait, compensate afterwards */
+ /* Precharge page using GFP_KERNEL while we can wait */
error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
if (error)
goto out;
error = radix_tree_preload(GFP_KERNEL);
- if (error)
- goto uncharge;
+ if (error) {
+ mem_cgroup_uncharge_cache_page(page);
+ goto out;
+ }
error = 1;
spin_lock(&info->lock);
ptr = shmem_swp_entry(info, idx, NULL);
- if (ptr && ptr->val == entry.val)
- error = add_to_page_cache(page, inode->i_mapping,
+ if (ptr && ptr->val == entry.val) {
+ error = add_to_page_cache_locked(page, inode->i_mapping,
idx, GFP_NOWAIT);
+ /* does mem_cgroup_uncharge_cache_page on error */
+ } else /* we must compensate for our precharge above */
+ mem_cgroup_uncharge_cache_page(page);
+
if (error == -EEXIST) {
struct page *filepage = find_get_page(inode->i_mapping, idx);
error = 1;
@@ -961,8 +967,6 @@ found:
shmem_swp_unmap(ptr);
spin_unlock(&info->lock);
radix_tree_preload_end();
-uncharge:
- mem_cgroup_uncharge_page(page);
out:
unlock_page(page);
page_cache_release(page);
@@ -1261,7 +1265,7 @@ repeat:
}
/* We have to do this with page locked to prevent races */
- if (TestSetPageLocked(swappage)) {
+ if (!trylock_page(swappage)) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
wait_on_page_locked(swappage);
@@ -1297,8 +1301,8 @@ repeat:
SetPageUptodate(filepage);
set_page_dirty(filepage);
swap_free(swap);
- } else if (!(error = add_to_page_cache(
- swappage, mapping, idx, GFP_NOWAIT))) {
+ } else if (!(error = add_to_page_cache_locked(swappage, mapping,
+ idx, GFP_NOWAIT))) {
info->flags |= SHMEM_PAGEIN;
shmem_swp_set(info, entry, 0);
shmem_swp_unmap(entry);
@@ -1311,24 +1315,21 @@ repeat:
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
unlock_page(swappage);
+ page_cache_release(swappage);
if (error == -ENOMEM) {
/* allow reclaim from this memory cgroup */
- error = mem_cgroup_cache_charge(swappage,
- current->mm, gfp & ~__GFP_HIGHMEM);
- if (error) {
- page_cache_release(swappage);
+ error = mem_cgroup_shrink_usage(current->mm,
+ gfp);
+ if (error)
goto failed;
- }
- mem_cgroup_uncharge_page(swappage);
}
- page_cache_release(swappage);
goto repeat;
}
} else if (sgp == SGP_READ && !filepage) {
shmem_swp_unmap(entry);
filepage = find_get_page(mapping, idx);
if (filepage &&
- (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
+ (!PageUptodate(filepage) || !trylock_page(filepage))) {
spin_unlock(&info->lock);
wait_on_page_locked(filepage);
page_cache_release(filepage);
@@ -1358,6 +1359,8 @@ repeat:
}
if (!filepage) {
+ int ret;
+
spin_unlock(&info->lock);
filepage = shmem_alloc_page(gfp, info, idx);
if (!filepage) {
@@ -1386,10 +1389,18 @@ repeat:
swap = *entry;
shmem_swp_unmap(entry);
}
- if (error || swap.val || 0 != add_to_page_cache_lru(
- filepage, mapping, idx, GFP_NOWAIT)) {
+ ret = error || swap.val;
+ if (ret)
+ mem_cgroup_uncharge_cache_page(filepage);
+ else
+ ret = add_to_page_cache_lru(filepage, mapping,
+ idx, GFP_NOWAIT);
+ /*
+ * At add_to_page_cache_lru() failure, uncharge will
+ * be done automatically.
+ */
+ if (ret) {
spin_unlock(&info->lock);
- mem_cgroup_uncharge_page(filepage);
page_cache_release(filepage);
shmem_unacct_blocks(info->flags, 1);
shmem_free_blocks(inode, 1);
@@ -1398,7 +1409,6 @@ repeat:
goto failed;
goto repeat;
}
- mem_cgroup_uncharge_page(filepage);
info->flags |= SHMEM_PAGEIN;
}
@@ -1503,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
inode->i_blocks = 0;
- inode->i_mapping->a_ops = &shmem_aops;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_generation = get_seconds();
@@ -1518,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
init_special_inode(inode, mode, dev);
break;
case S_IFREG:
+ inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_inode_operations;
inode->i_fop = &shmem_file_operations;
mpol_shared_policy_init(&info->policy,
@@ -1690,26 +1700,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
file_accessed(filp);
}
-static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+static ssize_t shmem_file_aio_read(struct kiocb *iocb,
+ const struct iovec *iov, unsigned long nr_segs, loff_t pos)
{
- read_descriptor_t desc;
+ struct file *filp = iocb->ki_filp;
+ ssize_t retval;
+ unsigned long seg;
+ size_t count;
+ loff_t *ppos = &iocb->ki_pos;
- if ((ssize_t) count < 0)
- return -EINVAL;
- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
- if (!count)
- return 0;
+ retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+ if (retval)
+ return retval;
- desc.written = 0;
- desc.count = count;
- desc.arg.buf = buf;
- desc.error = 0;
+ for (seg = 0; seg < nr_segs; seg++) {
+ read_descriptor_t desc;
- do_shmem_file_read(filp, ppos, &desc, file_read_actor);
- if (desc.written)
- return desc.written;
- return desc.error;
+ desc.written = 0;
+ desc.arg.buf = iov[seg].iov_base;
+ desc.count = iov[seg].iov_len;
+ if (desc.count == 0)
+ continue;
+ desc.error = 0;
+ do_shmem_file_read(filp, ppos, &desc, file_read_actor);
+ retval += desc.written;
+ if (desc.error) {
+ retval = retval ?: desc.error;
+ break;
+ }
+ if (desc.count > 0)
+ break;
+ }
+ return retval;
}
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1907,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
return error;
}
unlock_page(page);
+ inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, symname, len);
@@ -2330,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode)
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
@@ -2369,8 +2392,9 @@ static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = generic_file_llseek,
- .read = shmem_file_read,
+ .read = do_sync_read,
.write = do_sync_write,
+ .aio_read = shmem_file_aio_read,
.aio_write = generic_file_aio_write,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb..8e5aadd7dcd 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
* shmem_permission - permission() inode operation
*/
int
-shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
+shmem_permission(struct inode *inode, int mask)
{
return generic_permission(inode, mask, shmem_check_acl);
}
diff --git a/mm/slab.c b/mm/slab.c
index 06236e4ddc1..e76eee46688 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
unsigned int dflags; /* dynamic flags */
/* constructor func */
- void (*ctor)(struct kmem_cache *, void *);
+ void (*ctor)(void *obj);
/* 5) cache creation/removal */
const char *name;
@@ -1901,15 +1901,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
#endif
#if DEBUG
-/**
- * slab_destroy_objs - destroy a slab and its objects
- * @cachep: cache pointer being destroyed
- * @slabp: slab pointer being destroyed
- *
- * Call the registered destructor for each object in a slab that is being
- * destroyed.
- */
-static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
int i;
for (i = 0; i < cachep->num; i++) {
@@ -1938,7 +1930,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
}
}
#else
-static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
}
#endif
@@ -1956,7 +1948,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
{
void *addr = slabp->s_mem - slabp->colouroff;
- slab_destroy_objs(cachep, slabp);
+ slab_destroy_debugcheck(cachep, slabp);
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
struct slab_rcu *slab_rcu;
@@ -2145,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
*/
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
- unsigned long flags,
- void (*ctor)(struct kmem_cache *, void *))
+ unsigned long flags, void (*ctor)(void *))
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
@@ -2454,7 +2445,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
struct kmem_list3 *l3;
int node;
- on_each_cpu(do_drain, cachep, 1, 1);
+ on_each_cpu(do_drain, cachep, 1);
check_irq_on();
for_each_online_node(node) {
l3 = cachep->nodelists[node];
@@ -2661,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
* They must also be threaded.
*/
if (cachep->ctor && !(cachep->flags & SLAB_POISON))
- cachep->ctor(cachep, objp + obj_offset(cachep));
+ cachep->ctor(objp + obj_offset(cachep));
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2677,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
cachep->buffer_size / PAGE_SIZE, 0);
#else
if (cachep->ctor)
- cachep->ctor(cachep, objp);
+ cachep->ctor(objp);
#endif
slab_bufctl(slabp)[i] = i + 1;
}
@@ -3101,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
#endif
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
- cachep->ctor(cachep, objp);
+ cachep->ctor(objp);
#if ARCH_SLAB_MINALIGN
if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -3263,9 +3254,12 @@ retry:
if (cpuset_zone_allowed_hardwall(zone, flags) &&
cache->nodelists[nid] &&
- cache->nodelists[nid]->free_objects)
+ cache->nodelists[nid]->free_objects) {
obj = ____cache_alloc_node(cache,
flags | GFP_THISNODE, nid);
+ if (obj)
+ break;
+ }
}
if (!obj) {
@@ -3936,7 +3930,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
}
new->cachep = cachep;
- on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
+ on_each_cpu(do_ccupdate_local, (void *)new, 1);
check_irq_on();
cachep->batchcount = batchcount;
@@ -4478,4 +4472,3 @@ size_t ksize(const void *objp)
return obj_size(virt_to_cache(objp));
}
-EXPORT_SYMBOL(ksize);
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf..cb675d12679 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
*/
static inline int slob_page(struct slob_page *sp)
{
- return test_bit(PG_active, &sp->flags);
+ return PageSlobPage((struct page *)sp);
}
static inline void set_slob_page(struct slob_page *sp)
{
- __set_bit(PG_active, &sp->flags);
+ __SetPageSlobPage((struct page *)sp);
}
static inline void clear_slob_page(struct slob_page *sp)
{
- __clear_bit(PG_active, &sp->flags);
+ __ClearPageSlobPage((struct page *)sp);
}
/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
*/
static inline int slob_page_free(struct slob_page *sp)
{
- return test_bit(PG_private, &sp->flags);
+ return PageSlobFree((struct page *)sp);
}
static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
{
list_add(&sp->list, list);
- __set_bit(PG_private, &sp->flags);
+ __SetPageSlobFree((struct page *)sp);
}
static inline void clear_slob_page_free(struct slob_page *sp)
{
list_del(&sp->list);
- __clear_bit(PG_private, &sp->flags);
+ __ClearPageSlobFree((struct page *)sp);
}
#define SLOB_UNIT sizeof(slob_t)
@@ -514,23 +514,23 @@ size_t ksize(const void *block)
return 0;
sp = (struct slob_page *)virt_to_page(block);
- if (slob_page(sp))
- return ((slob_t *)block - 1)->units + SLOB_UNIT;
- else
+ if (slob_page(sp)) {
+ int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int *m = (unsigned int *)(block - align);
+ return SLOB_UNITS(*m) * SLOB_UNIT;
+ } else
return sp->page.private;
}
-EXPORT_SYMBOL(ksize);
struct kmem_cache {
unsigned int size, align;
unsigned long flags;
const char *name;
- void (*ctor)(struct kmem_cache *, void *);
+ void (*ctor)(void *);
};
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
- size_t align, unsigned long flags,
- void (*ctor)(struct kmem_cache *, void *))
+ size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *c;
@@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
b = slob_new_page(flags, get_order(c->size), node);
if (c->ctor)
- c->ctor(c, b);
+ c->ctor(b);
return b;
}
diff --git a/mm/slub.c b/mm/slub.c
index 0987d1cd943..0c83e6afe7b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5,7 +5,7 @@
* The allocator synchronizes using per slab locks and only
* uses a centralized lock to manage a pool of partial slabs.
*
- * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
+ * (C) 2007 SGI, Christoph Lameter
*/
#include <linux/mm.h>
@@ -102,44 +102,12 @@
* the fast path and disables lockless freelists.
*/
-#define FROZEN (1 << PG_active)
-
#ifdef CONFIG_SLUB_DEBUG
-#define SLABDEBUG (1 << PG_error)
+#define SLABDEBUG 1
#else
#define SLABDEBUG 0
#endif
-static inline int SlabFrozen(struct page *page)
-{
- return page->flags & FROZEN;
-}
-
-static inline void SetSlabFrozen(struct page *page)
-{
- page->flags |= FROZEN;
-}
-
-static inline void ClearSlabFrozen(struct page *page)
-{
- page->flags &= ~FROZEN;
-}
-
-static inline int SlabDebug(struct page *page)
-{
- return page->flags & SLABDEBUG;
-}
-
-static inline void SetSlabDebug(struct page *page)
-{
- page->flags |= SLABDEBUG;
-}
-
-static inline void ClearSlabDebug(struct page *page)
-{
- page->flags &= ~SLABDEBUG;
-}
-
/*
* Issues still to be resolved:
*
@@ -411,7 +379,7 @@ static void set_track(struct kmem_cache *s, void *object,
if (addr) {
p->addr = addr;
p->cpu = smp_processor_id();
- p->pid = current ? current->pid : -1;
+ p->pid = current->pid;
p->when = jiffies;
} else
memset(p, 0, sizeof(struct track));
@@ -431,9 +399,8 @@ static void print_track(const char *s, struct track *t)
if (!t->addr)
return;
- printk(KERN_ERR "INFO: %s in ", s);
- __print_symbol("%s", (unsigned long)t->addr);
- printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+ printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+ s, t->addr, jiffies - t->when, t->cpu, t->pid);
}
static void print_tracking(struct kmem_cache *s, void *object)
@@ -493,7 +460,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (p > addr + 16)
print_section("Bytes b4", p - 16, 16);
- print_section("Object", p, min(s->objsize, 128));
+ print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
print_section("Redzone", p + s->objsize,
@@ -972,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
}
/* Special debug activities for freeing objects */
- if (!SlabFrozen(page) && !page->freelist)
+ if (!PageSlubFrozen(page) && !page->freelist)
remove_full(s, page);
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
@@ -1045,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
static unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
- void (*ctor)(struct kmem_cache *, void *))
+ void (*ctor)(void *))
{
/*
* Enable debugging if selected on the kernel commandline.
@@ -1073,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
- void (*ctor)(struct kmem_cache *, void *))
+ void (*ctor)(void *))
{
return flags;
}
@@ -1136,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
{
setup_object_debug(s, page, object);
if (unlikely(s->ctor))
- s->ctor(s, object);
+ s->ctor(object);
}
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1158,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
page->flags |= 1 << PG_slab;
if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
SLAB_STORE_USER | SLAB_TRACE))
- SetSlabDebug(page);
+ __SetPageSlubDebug(page);
start = page_address(page);
@@ -1185,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (unlikely(SlabDebug(page))) {
+ if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
void *p;
slab_pad_check(s, page);
for_each_object(p, s, page_address(page),
page->objects)
check_object(s, page, p, 0);
- ClearSlabDebug(page);
+ __ClearPageSlubDebug(page);
}
mod_zone_page_state(page_zone(page),
@@ -1289,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
if (slab_trylock(page)) {
list_del(&page->lru);
n->nr_partial--;
- SetSlabFrozen(page);
+ __SetPageSlubFrozen(page);
return 1;
}
return 0;
@@ -1362,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
n = get_node(s, zone_to_nid(zone));
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
- n->nr_partial > MIN_PARTIAL) {
+ n->nr_partial > n->min_partial) {
page = get_partial_node(n);
if (page)
return page;
@@ -1399,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
- ClearSlabFrozen(page);
+ __ClearPageSlubFrozen(page);
if (page->inuse) {
if (page->freelist) {
@@ -1407,13 +1374,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
} else {
stat(c, DEACTIVATE_FULL);
- if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
+ if (SLABDEBUG && PageSlubDebug(page) &&
+ (s->flags & SLAB_STORE_USER))
add_full(n, page);
}
slab_unlock(page);
} else {
stat(c, DEACTIVATE_EMPTY);
- if (n->nr_partial < MIN_PARTIAL) {
+ if (n->nr_partial < n->min_partial) {
/*
* Adding an empty slab to the partial slabs in order
* to avoid page allocator overhead. This slab needs
@@ -1496,15 +1464,7 @@ static void flush_cpu_slab(void *d)
static void flush_all(struct kmem_cache *s)
{
-#ifdef CONFIG_SMP
- on_each_cpu(flush_cpu_slab, s, 1, 1);
-#else
- unsigned long flags;
-
- local_irq_save(flags);
- flush_cpu_slab(s);
- local_irq_restore(flags);
-#endif
+ on_each_cpu(flush_cpu_slab, s, 1);
}
/*
@@ -1560,7 +1520,7 @@ load_freelist:
object = c->page->freelist;
if (unlikely(!object))
goto another_slab;
- if (unlikely(SlabDebug(c->page)))
+ if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
goto debug;
c->freelist = object[c->offset];
@@ -1597,7 +1557,7 @@ new_slab:
if (c->page)
flush_slab(s, c);
slab_lock(new);
- SetSlabFrozen(new);
+ __SetPageSlubFrozen(new);
c->page = new;
goto load_freelist;
}
@@ -1628,9 +1588,11 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
void **object;
struct kmem_cache_cpu *c;
unsigned long flags;
+ unsigned int objsize;
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
+ objsize = c->objsize;
if (unlikely(!c->freelist || !node_match(c, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
@@ -1643,7 +1605,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
local_irq_restore(flags);
if (unlikely((gfpflags & __GFP_ZERO) && object))
- memset(object, 0, c->objsize);
+ memset(object, 0, objsize);
return object;
}
@@ -1681,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(c, FREE_SLOWPATH);
slab_lock(page);
- if (unlikely(SlabDebug(page)))
+ if (unlikely(SLABDEBUG && PageSlubDebug(page)))
goto debug;
checks_ok:
@@ -1689,7 +1651,7 @@ checks_ok:
page->freelist = object;
page->inuse--;
- if (unlikely(SlabFrozen(page))) {
+ if (unlikely(PageSlubFrozen(page))) {
stat(c, FREE_FROZEN);
goto out_unlock;
}
@@ -1951,13 +1913,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
#endif
}
-static void init_kmem_cache_node(struct kmem_cache_node *n)
+static void
+init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
{
n->nr_partial = 0;
+
+ /*
+ * The larger the object size is, the more pages we want on the partial
+ * list to avoid pounding the page allocator excessively.
+ */
+ n->min_partial = ilog2(s->size);
+ if (n->min_partial < MIN_PARTIAL)
+ n->min_partial = MIN_PARTIAL;
+ else if (n->min_partial > MAX_PARTIAL)
+ n->min_partial = MAX_PARTIAL;
+
spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
+ atomic_long_set(&n->total_objects, 0);
INIT_LIST_HEAD(&n->full);
#endif
}
@@ -2125,7 +2100,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
init_object(kmalloc_caches, n, 1);
init_tracking(kmalloc_caches, n);
#endif
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, kmalloc_caches);
inc_slabs_node(kmalloc_caches, node, page->objects);
/*
@@ -2182,7 +2157,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
}
s->node[node] = n;
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, s);
}
return 1;
}
@@ -2193,7 +2168,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
{
- init_kmem_cache_node(&s->local_node);
+ init_kmem_cache_node(&s->local_node, s);
return 1;
}
#endif
@@ -2324,7 +2299,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
const char *name, size_t size,
size_t align, unsigned long flags,
- void (*ctor)(struct kmem_cache *, void *))
+ void (*ctor)(void *))
{
memset(s, 0, kmem_size);
s->name = name;
@@ -2338,7 +2313,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
s->refcount = 1;
#ifdef CONFIG_NUMA
- s->remote_node_defrag_ratio = 100;
+ s->remote_node_defrag_ratio = 1000;
#endif
if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
goto error;
@@ -2753,7 +2728,6 @@ size_t ksize(const void *object)
*/
return s->size;
}
-EXPORT_SYMBOL(ksize);
void kfree(const void *x)
{
@@ -2765,6 +2739,7 @@ void kfree(const void *x)
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
put_page(page);
return;
}
@@ -2927,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg)
ret = -ENOMEM;
goto out;
}
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, s);
s->node[nid] = n;
}
out:
@@ -2995,8 +2970,6 @@ void __init kmem_cache_init(void)
create_kmalloc_cache(&kmalloc_caches[1],
"kmalloc-96", 96, GFP_KERNEL);
caches++;
- }
- if (KMALLOC_MIN_SIZE <= 128) {
create_kmalloc_cache(&kmalloc_caches[2],
"kmalloc-192", 192, GFP_KERNEL);
caches++;
@@ -3026,6 +2999,16 @@ void __init kmem_cache_init(void)
for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
+ if (KMALLOC_MIN_SIZE == 128) {
+ /*
+ * The 192 byte sized cache is not used if the alignment
+ * is 128 byte. Redirect kmalloc to use the 256 byte cache
+ * instead.
+ */
+ for (i = 128 + 8; i <= 192; i += 8)
+ size_index[(i - 1) / 8] = 8;
+ }
+
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
@@ -3071,7 +3054,7 @@ static int slab_unmergeable(struct kmem_cache *s)
static struct kmem_cache *find_mergeable(size_t size,
size_t align, unsigned long flags, const char *name,
- void (*ctor)(struct kmem_cache *, void *))
+ void (*ctor)(void *))
{
struct kmem_cache *s;
@@ -3111,8 +3094,7 @@ static struct kmem_cache *find_mergeable(size_t size,
}
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
- size_t align, unsigned long flags,
- void (*ctor)(struct kmem_cache *, void *))
+ size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
@@ -3315,12 +3297,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
s->name, page);
if (s->flags & DEBUG_DEFAULT_FLAGS) {
- if (!SlabDebug(page))
- printk(KERN_ERR "SLUB %s: SlabDebug not set "
+ if (!PageSlubDebug(page))
+ printk(KERN_ERR "SLUB %s: SlubDebug not set "
"on slab 0x%p\n", s->name, page);
} else {
- if (SlabDebug(page))
- printk(KERN_ERR "SLUB %s: SlabDebug set on "
+ if (PageSlubDebug(page))
+ printk(KERN_ERR "SLUB %s: SlubDebug set on "
"slab 0x%p\n", s->name, page);
}
}
@@ -4077,7 +4059,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
if (err)
return err;
- if (ratio < 100)
+ if (ratio <= 100)
s->remote_node_defrag_ratio = ratio * 10;
return length;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 99c4f36eb8a..a91b5f8fcaf 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -1,7 +1,7 @@
/*
* Virtual Memory Map support
*
- * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
+ * (C) 2007 sgi. Christoph Lameter.
*
* Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
* virt_to_page, page_address() to be implemented as a base offset
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2..39db301b920 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section)
return (section->section_mem_map >> SECTION_NID_SHIFT);
}
-/* Record a memory area against a node. */
-void __init memory_present(int nid, unsigned long start, unsigned long end)
+/* Validate the physical addressing limitations of the model */
+void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
{
- unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
- unsigned long pfn;
+ unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
/*
* Sanity checks - do not allow an architecture to pass
* in larger pfns than the maximum scope of sparsemem:
*/
- if (start >= max_arch_pfn)
- return;
- if (end >= max_arch_pfn)
- end = max_arch_pfn;
+ if (*start_pfn > max_sparsemem_pfn) {
+ mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
+ "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
+ *start_pfn, *end_pfn, max_sparsemem_pfn);
+ WARN_ON_ONCE(1);
+ *start_pfn = max_sparsemem_pfn;
+ *end_pfn = max_sparsemem_pfn;
+ }
+
+ if (*end_pfn > max_sparsemem_pfn) {
+ mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
+ "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
+ *start_pfn, *end_pfn, max_sparsemem_pfn);
+ WARN_ON_ONCE(1);
+ *end_pfn = max_sparsemem_pfn;
+ }
+}
+
+/* Record a memory area against a node. */
+void __init memory_present(int nid, unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
start &= PAGE_SECTION_MASK;
+ mminit_validate_memmodel_limits(&start, &end);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
unsigned long section = pfn_to_section_nr(pfn);
struct mem_section *ms;
@@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
unsigned long pfn;
unsigned long nr_pages = 0;
+ mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
if (nid != early_pfn_to_nid(pfn))
continue;
@@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void)
}
#endif /* CONFIG_MEMORY_HOTPLUG */
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static unsigned long * __init
+sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+{
+ unsigned long section_nr;
+
+ /*
+ * A page may contain usemaps for other sections preventing the
+ * page being freed and making a section unremovable while
+ * other sections referencing the usemap retmain active. Similarly,
+ * a pgdat can prevent a section being removed. If section A
+ * contains a pgdat and section B contains the usemap, both
+ * sections become inter-dependent. This allocates usemaps
+ * from the same section as the pgdat where possible to avoid
+ * this problem.
+ */
+ section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ return alloc_bootmem_section(usemap_size(), section_nr);
+}
+
+static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+{
+ unsigned long usemap_snr, pgdat_snr;
+ static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
+ static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ int usemap_nid;
+
+ usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ if (usemap_snr == pgdat_snr)
+ return;
+
+ if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
+ /* skip redundant message */
+ return;
+
+ old_usemap_snr = usemap_snr;
+ old_pgdat_snr = pgdat_snr;
+
+ usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
+ if (usemap_nid != nid) {
+ printk(KERN_INFO
+ "node %d must be removed before remove section %ld\n",
+ nid, usemap_snr);
+ return;
+ }
+ /*
+ * There is a circular dependency.
+ * Some platforms allow un-removable section because they will just
+ * gather other removable sections for dynamic partitioning.
+ * Just notify un-removable section's number here.
+ */
+ printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
+ pgdat_snr, nid);
+ printk(KERN_CONT
+ " have a circular dependency on usemap and pgdat allocations\n");
+}
+#else
+static unsigned long * __init
+sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+{
+ return NULL;
+}
+
+static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+{
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
{
unsigned long *usemap;
struct mem_section *ms = __nr_to_section(pnum);
int nid = sparse_early_nid(ms);
- usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
+ usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
if (usemap)
return usemap;
+ usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
+ if (usemap) {
+ check_usemap_section_nr(nid, usemap);
+ return usemap;
+ }
+
/* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
nid = 0;
@@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
+static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3..9e0cb311807 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,9 +34,9 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
+static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -278,9 +278,10 @@ int lru_add_drain_all(void)
* Avoid taking zone->lru_lock if possible, but if it is taken, retain it
* for the remainder of the operation.
*
- * The locking in this function is against shrink_cache(): we recheck the
- * page count inside the lock to see whether shrink_cache grabbed the page
- * via the LRU. If it did, give up: shrink_cache will free it.
+ * The locking in this function is against shrink_inactive_list(): we recheck
+ * the page count inside the lock to see whether shrink_inactive_list()
+ * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
+ * will free it.
*/
void release_pages(struct page **pages, int nr, int cold)
{
@@ -443,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec)
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- if (PagePrivate(page) && !TestSetPageLocked(page)) {
+ if (PagePrivate(page) && trylock_page(page)) {
if (PagePrivate(page))
try_to_release_page(page, 0);
unlock_page(page);
@@ -493,7 +494,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
*/
#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
-static DEFINE_PER_CPU(long, committed_space) = 0;
+static DEFINE_PER_CPU(long, committed_space);
void vm_acct_memory(long pages)
{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0b..797c3831cbe 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
struct address_space swapper_space = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
- .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
+ .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
.a_ops = &swap_aops,
.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
.backing_dev_info = &swap_backing_dev_info,
@@ -56,15 +56,16 @@ static struct {
void show_swap_cache_info(void)
{
- printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
+ printk("%lu pages in swap cache\n", total_swapcache_pages);
+ printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
swap_cache_info.add_total, swap_cache_info.del_total,
swap_cache_info.find_success, swap_cache_info.find_total);
- printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+ printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
/*
- * add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
BUG_ON(PagePrivate(page));
error = radix_tree_preload(gfp_mask);
if (!error) {
- write_lock_irq(&swapper_space.tree_lock);
+ page_cache_get(page);
+ SetPageSwapCache(page);
+ set_page_private(page, entry.val);
+
+ spin_lock_irq(&swapper_space.tree_lock);
error = radix_tree_insert(&swapper_space.page_tree,
entry.val, page);
- if (!error) {
- page_cache_get(page);
- SetPageSwapCache(page);
- set_page_private(page, entry.val);
+ if (likely(!error)) {
total_swapcache_pages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(add_total);
}
- write_unlock_irq(&swapper_space.tree_lock);
+ spin_unlock_irq(&swapper_space.tree_lock);
radix_tree_preload_end();
+
+ if (unlikely(error)) {
+ set_page_private(page, 0UL);
+ ClearPageSwapCache(page);
+ page_cache_release(page);
+ }
}
return error;
}
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
entry.val = page_private(page);
- write_lock_irq(&swapper_space.tree_lock);
+ spin_lock_irq(&swapper_space.tree_lock);
__delete_from_swap_cache(page);
- write_unlock_irq(&swapper_space.tree_lock);
+ spin_unlock_irq(&swapper_space.tree_lock);
swap_free(entry);
page_cache_release(page);
@@ -193,7 +201,7 @@ void delete_from_swap_cache(struct page *page)
*/
static inline void free_swap_cache(struct page *page)
{
- if (PageSwapCache(page) && !TestSetPageLocked(page)) {
+ if (PageSwapCache(page) && trylock_page(page)) {
remove_exclusive_swap_page(page);
unlock_page(page);
}
@@ -294,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* re-using the just freed swap entry for an existing page.
* May fail (-ENOMEM) if radix-tree node allocation failed.
*/
- SetPageLocked(new_page);
+ set_page_locked(new_page);
err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
- if (!err) {
+ if (likely(!err)) {
/*
* Initiate read into locked page and return.
*/
@@ -304,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
swap_readpage(NULL, new_page);
return new_page;
}
- ClearPageLocked(new_page);
+ clear_page_locked(new_page);
swap_free(entry);
} while (err != -ENOMEM);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb592030..1e330f2998f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,17 +33,18 @@
#include <asm/tlbflush.h>
#include <linux/swapops.h>
-DEFINE_SPINLOCK(swap_lock);
-unsigned int nr_swapfiles;
+static DEFINE_SPINLOCK(swap_lock);
+static unsigned int nr_swapfiles;
long total_swap_pages;
static int swap_overflow;
+static int least_priority;
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-struct swap_list_t swap_list = {-1, -1};
+static struct swap_list_t swap_list = {-1, -1};
static struct swap_info_struct swap_info[MAX_SWAPFILES];
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
retval = 0;
if (p->swap_map[swp_offset(entry)] == 1) {
/* Recheck the page count with the swapcache lock held.. */
- write_lock_irq(&swapper_space.tree_lock);
+ spin_lock_irq(&swapper_space.tree_lock);
if ((page_count(page) == 2) && !PageWriteback(page)) {
__delete_from_swap_cache(page);
SetPageDirty(page);
retval = 1;
}
- write_unlock_irq(&swapper_space.tree_lock);
+ spin_unlock_irq(&swapper_space.tree_lock);
}
spin_unlock(&swap_lock);
@@ -402,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry)
if (p) {
if (swap_entry_free(p, swp_offset(entry)) == 1) {
page = find_get_page(&swapper_space, entry.val);
- if (page && unlikely(TestSetPageLocked(page))) {
+ if (page && unlikely(!trylock_page(page))) {
page_cache_release(page);
page = NULL;
}
@@ -655,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm,
if (!down_read_trylock(&mm->mmap_sem)) {
/*
- * Activate page so shrink_cache is unlikely to unmap its
- * ptes while lock is dropped, so swapoff can make progress.
+ * Activate page so shrink_inactive_list is unlikely to unmap
+ * its ptes while lock is dropped, so swapoff can make progress.
*/
activate_page(page);
unlock_page(page);
@@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
/* just pick something that's safe... */
swap_list.next = swap_list.head;
}
+ if (p->prio < 0) {
+ for (i = p->next; i >= 0; i = swap_info[i].next)
+ swap_info[i].prio = p->prio--;
+ least_priority++;
+ }
nr_swap_pages -= p->pages;
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
if (err) {
/* re-insert swap space back into swap_list */
spin_lock(&swap_lock);
- for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
+ if (p->prio < 0)
+ p->prio = --least_priority;
+ prev = -1;
+ for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
if (p->prio >= swap_info[i].prio)
break;
+ prev = i;
+ }
p->next = i;
if (prev < 0)
swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
unsigned int type;
int i, prev;
int error;
- static int least_priority;
union swap_header *swap_header = NULL;
int swap_header_version;
unsigned int nr_good_pages = 0;
@@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
sector_t span;
unsigned long maxpages = 1;
int swapfilesize;
- unsigned short *swap_map;
+ unsigned short *swap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
int did_down = 0;
@@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
}
if (type >= nr_swapfiles)
nr_swapfiles = type+1;
+ memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->extent_list);
p->flags = SWP_USED;
- p->swap_file = NULL;
- p->old_block_size = 0;
- p->swap_map = NULL;
- p->lowest_bit = 0;
- p->highest_bit = 0;
- p->cluster_nr = 0;
- p->inuse_pages = 0;
p->next = -1;
- if (swap_flags & SWAP_FLAG_PREFER) {
- p->prio =
- (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
- } else {
- p->prio = --least_priority;
- }
spin_unlock(&swap_lock);
name = getname(specialfile);
error = PTR_ERR(name);
@@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
/* OK, set up the swap map and apply the bad block list */
- if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
+ swap_map = vmalloc(maxpages * sizeof(short));
+ if (!swap_map) {
error = -ENOMEM;
goto bad_swap;
}
error = 0;
- memset(p->swap_map, 0, maxpages * sizeof(short));
+ memset(swap_map, 0, maxpages * sizeof(short));
for (i = 0; i < swap_header->info.nr_badpages; i++) {
int page_nr = swap_header->info.badpages[i];
if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
error = -EINVAL;
else
- p->swap_map[page_nr] = SWAP_MAP_BAD;
+ swap_map[page_nr] = SWAP_MAP_BAD;
}
nr_good_pages = swap_header->info.last_page -
swap_header->info.nr_badpages -
@@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
}
if (nr_good_pages) {
- p->swap_map[0] = SWAP_MAP_BAD;
+ swap_map[0] = SWAP_MAP_BAD;
p->max = maxpages;
p->pages = nr_good_pages;
nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
+ if (swap_flags & SWAP_FLAG_PREFER)
+ p->prio =
+ (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+ else
+ p->prio = --least_priority;
+ p->swap_map = swap_map;
p->flags = SWP_ACTIVE;
nr_swap_pages += nr_good_pages;
total_swap_pages += nr_good_pages;
@@ -1707,12 +1712,8 @@ bad_swap:
destroy_swap_extents(p);
bad_swap_2:
spin_lock(&swap_lock);
- swap_map = p->swap_map;
p->swap_file = NULL;
- p->swap_map = NULL;
p->flags = 0;
- if (!(swap_flags & SWAP_FLAG_PREFER))
- ++least_priority;
spin_unlock(&swap_lock);
vfree(swap_map);
if (swap_file)
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f50194..8d7a27a6335 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,31 +65,31 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
if (!dentry)
goto put_memory;
+ error = -ENFILE;
+ file = get_empty_filp();
+ if (!file)
+ goto put_dentry;
+
error = -ENOSPC;
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
if (!inode)
- goto put_dentry;
+ goto close_file;
d_instantiate(dentry, inode);
- error = -ENFILE;
- file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
- &ramfs_file_operations);
- if (!file)
- goto put_dentry;
-
+ inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
+ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ &ramfs_file_operations);
- /* notify everyone as to the change of file size */
- error = do_truncate(dentry, size, 0, file);
- if (error < 0)
+#ifndef CONFIG_MMU
+ error = ramfs_nommu_expand_for_mapping(inode, size);
+ if (error)
goto close_file;
-
+#endif
return file;
close_file:
put_filp(file);
- return ERR_PTR(error);
-
put_dentry:
dput(dentry);
put_memory:
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb6341..6650c1d878b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
cancel_dirty_page(page, PAGE_CACHE_SIZE);
remove_from_page_cache(page);
- ClearPageUptodate(page);
ClearPageMappedToDisk(page);
page_cache_release(page); /* pagecache ref */
}
@@ -188,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
if (page_index > next)
next = page_index;
next++;
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
continue;
if (PageWriteback(page)) {
unlock_page(page);
@@ -281,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
pgoff_t index;
int lock_failed;
- lock_failed = TestSetPageLocked(page);
+ lock_failed = !trylock_page(page);
/*
* We really shouldn't be looking at the ->index of an
@@ -349,18 +348,17 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- write_lock_irq(&mapping->tree_lock);
+ spin_lock_irq(&mapping->tree_lock);
if (PageDirty(page))
goto failed;
BUG_ON(PagePrivate(page));
__remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- ClearPageUptodate(page);
+ spin_unlock_irq(&mapping->tree_lock);
page_cache_release(page); /* pagecache ref */
return 1;
failed:
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
return 0;
}
@@ -382,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EIO if any pages could not be invalidated.
+ * Returns -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -442,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
- ret2 = -EIO;
+ ret2 = -EBUSY;
}
if (ret2 < 0)
ret = ret2;
diff --git a/mm/util.c b/mm/util.c
index 8f18683825b..cb00b748ce4 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
+#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/err.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
- * krealloc - reallocate memory. The contents will remain unchanged.
+ * __krealloc - like krealloc() but don't free @p.
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
+ * This function is like krealloc() except it never frees the originally
+ * allocated buffer. Use this if you don't want to free the buffer immediately
+ * like, for example, with RCU.
*/
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
+void *__krealloc(const void *p, size_t new_size, gfp_t flags)
{
void *ret;
size_t ks = 0;
- if (unlikely(!new_size)) {
- kfree(p);
+ if (unlikely(!new_size))
return ZERO_SIZE_PTR;
- }
if (p)
ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
return (void *)p;
ret = kmalloc_track_caller(new_size, flags);
- if (ret && p) {
+ if (ret && p)
memcpy(ret, p, ks);
+
+ return ret;
+}
+EXPORT_SYMBOL(__krealloc);
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes. If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc(). If @size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+ void *ret;
+
+ if (unlikely(!new_size)) {
kfree(p);
+ return ZERO_SIZE_PTR;
}
+
+ ret = __krealloc(p, new_size, flags);
+ if (ret && p != ret)
+ kfree(p);
+
return ret;
}
EXPORT_SYMBOL(krealloc);
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n)
return p;
}
EXPORT_SYMBOL(strndup_user);
+
+#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+ mm->mmap_base = TASK_UNMAPPED_BASE;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->unmap_area = arch_unmap_area;
+}
+#endif
+
+int __attribute__((weak)) get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start, nr_pages,
+ write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 830a5580c5d..bba06c41fc5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -388,16 +388,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
return;
if ((PAGE_SIZE-1) & (unsigned long)addr) {
- printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
- WARN_ON(1);
+ WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
return;
}
area = remove_vm_area(addr);
if (unlikely(!area)) {
- printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+ WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
- WARN_ON(1);
return;
}
@@ -938,6 +936,25 @@ static void s_stop(struct seq_file *m, void *p)
read_unlock(&vmlist_lock);
}
+static void show_numa_info(struct seq_file *m, struct vm_struct *v)
+{
+ if (NUMA_BUILD) {
+ unsigned int nr, *counters = m->private;
+
+ if (!counters)
+ return;
+
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+
+ for (nr = 0; nr < v->nr_pages; nr++)
+ counters[page_to_nid(v->pages[nr])]++;
+
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
+ }
+}
+
static int s_show(struct seq_file *m, void *p)
{
struct vm_struct *v = p;
@@ -974,6 +991,7 @@ static int s_show(struct seq_file *m, void *p)
if (v->flags & VM_VPAGES)
seq_printf(m, " vpages");
+ show_numa_info(m, v);
seq_putc(m, '\n');
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92..1ff1a58e7c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/delayacct.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -390,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
/*
- * Attempt to detach a locked page from its ->mapping. If it is dirty or if
- * someone else has a ref on the page, abort and return 0. If it was
- * successfully detached, return 1. Assumes the caller has a single ref on
- * this page.
+ * Same as remove_mapping, but if the page is removed from the mapping, it
+ * gets returned with a refcount of 0.
*/
-int remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page)
{
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- write_lock_irq(&mapping->tree_lock);
+ spin_lock_irq(&mapping->tree_lock);
/*
* The non racy check for a busy page.
*
@@ -426,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
- if (unlikely(page_count(page) != 2))
+ if (!page_freeze_refs(page, 2))
goto cannot_free;
- smp_rmb();
- if (unlikely(PageDirty(page)))
+ /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+ if (unlikely(PageDirty(page))) {
+ page_unfreeze_refs(page, 2);
goto cannot_free;
+ }
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
swap_free(swap);
- __put_page(page); /* The pagecache ref */
- return 1;
+ } else {
+ __remove_from_page_cache(page);
+ spin_unlock_irq(&mapping->tree_lock);
}
- __remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- __put_page(page);
return 1;
cannot_free:
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
+/*
+ * Attempt to detach a locked page from its ->mapping. If it is dirty or if
+ * someone else has a ref on the page, abort and return 0. If it was
+ * successfully detached, return 1. Assumes the caller has a single ref on
+ * this page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+ if (__remove_mapping(mapping, page)) {
+ /*
+ * Unfreezing the refcount with 1 rather than 2 effectively
+ * drops the pagecache ref for us without requiring another
+ * atomic operation.
+ */
+ page_unfreeze_refs(page, 1);
+ return 1;
+ }
return 0;
}
@@ -477,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
page = lru_to_page(page_list);
list_del(&page->lru);
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto keep;
VM_BUG_ON(PageActive(page));
@@ -563,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
@@ -597,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PagePrivate(page)) {
if (!try_to_release_page(page, sc->gfp_mask))
goto activate_locked;
- if (!mapping && page_count(page) == 1)
- goto free_it;
+ if (!mapping && page_count(page) == 1) {
+ unlock_page(page);
+ if (put_page_testzero(page))
+ goto free_it;
+ else {
+ /*
+ * rare race with speculative reference.
+ * the speculative reference will free
+ * this page shortly, so we may
+ * increment nr_reclaimed here (and
+ * leave it off the LRU).
+ */
+ nr_reclaimed++;
+ continue;
+ }
+ }
}
- if (!mapping || !remove_mapping(mapping, page))
+ if (!mapping || !__remove_mapping(mapping, page))
goto keep_locked;
-free_it:
unlock_page(page);
+free_it:
nr_reclaimed++;
- if (!pagevec_add(&freed_pvec, page))
- __pagevec_release_nonlru(&freed_pvec);
+ if (!pagevec_add(&freed_pvec, page)) {
+ __pagevec_free(&freed_pvec);
+ pagevec_reinit(&freed_pvec);
+ }
continue;
activate_locked:
@@ -622,7 +657,7 @@ keep:
}
list_splice(&ret_pages, page_list);
if (pagevec_count(&freed_pvec))
- __pagevec_release_nonlru(&freed_pvec);
+ __pagevec_free(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -1316,6 +1351,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
+ delayacct_freepages_start();
+
if (scan_global_lru(sc))
count_vm_event(ALLOCSTALL);
/*
@@ -1371,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
congestion_wait(WRITE, HZ/10);
}
- /* top priority shrink_caches still had more to do? don't OOM, then */
+ /* top priority shrink_zones still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
ret = nr_reclaimed;
out:
@@ -1396,6 +1433,8 @@ out:
} else
mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+ delayacct_freepages_end();
+
return ret;
}
@@ -1940,7 +1979,7 @@ module_init(kswapd_init)
int zone_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
-#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db9eabb2c5b..d7826af2fb0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,6 +13,7 @@
#include <linux/err.h>
#include <linux/module.h>
#include <linux/cpu.h>
+#include <linux/vmstat.h>
#include <linux/sched.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
- for_each_cpu_mask(cpu, *cpumask) {
+ for_each_cpu_mask_nr(cpu, *cpumask) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -515,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
continue;
page = pfn_to_page(pfn);
+#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
+ /*
+ * Ordinarily, memory holes in flatmem still have a valid
+ * memmap for the PFN range. However, an architecture for
+ * embedded systems (e.g. ARM) can free up the memmap backing
+ * holes to save memory on the assumption the memmap is
+ * never used. The page_zone linkages are then broken even
+ * though pfn_valid() returns true. Skip the page if the
+ * linkages are broken. Even if this test passed, the impact
+ * is that the counters for the movable type are off but
+ * fragmentation monitoring is likely meaningless on small
+ * systems.
+ */
+ if (page_zone(page) != zone)
+ continue;
+#endif
mtype = get_pageblock_migratetype(page);
- count[mtype]++;
+ if (mtype < MIGRATE_TYPES)
+ count[mtype]++;
}
/* Print counts */