summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/bootmem.c13
-rw-r--r--mm/fremap.c7
-rw-r--r--mm/hugetlb.c24
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/memblock.c837
-rw-r--r--mm/memcontrol.c10
-rw-r--r--mm/memory-failure.c12
-rw-r--r--mm/memory.c10
-rw-r--r--mm/mmap.c1
-rw-r--r--mm/oom_kill.c49
-rw-r--r--mm/page_alloc.c90
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/rmap.c23
-rw-r--r--mm/sparse-vmemmap.c11
-rw-r--r--mm/vmalloc.c9
-rw-r--r--mm/vmscan.c43
17 files changed, 764 insertions, 385 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c2bf86f470e..65d420499a6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
@@ -243,6 +244,7 @@ static int __init default_bdi_init(void)
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
+ err = bdi_init(&noop_backing_dev_info);
return err;
}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 142c84a5499..13b0caa9793 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
+#include <linux/memblock.h>
#include <asm/bug.h>
#include <asm/io.h>
@@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
#ifdef CONFIG_NO_BOOTMEM
- free_early(physaddr, physaddr + size);
+ kmemleak_free_part(__va(physaddr), size);
+ memblock_x86_free_range(physaddr, physaddr + size);
#else
unsigned long start, end;
@@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
void __init free_bootmem(unsigned long addr, unsigned long size)
{
#ifdef CONFIG_NO_BOOTMEM
- free_early(addr, addr + size);
+ kmemleak_free_part(__va(addr), size);
+ memblock_x86_free_range(addr, addr + size);
#else
unsigned long start, end;
@@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
}
#ifndef CONFIG_NO_BOOTMEM
+int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
+{
+ return reserve_bootmem(phys, len, flags);
+}
+
static unsigned long __init align_idx(struct bootmem_data *bdata,
unsigned long idx, unsigned long step)
{
diff --git a/mm/fremap.c b/mm/fremap.c
index 46f5dacf90a..ec520c7b28d 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
{
struct mm_struct *mm = current->mm;
struct address_space *mapping;
- unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
int has_write_lock = 0;
@@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (start + size <= start)
return err;
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return err;
+
/* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (!(vma->vm_flags & VM_CAN_NONLINEAR))
goto out;
- if (end <= start || start < vma->vm_start || end > vma->vm_end)
+ if (start < vma->vm_start || start + size > vma->vm_end)
goto out;
/* Must set VM_NONLINEAR before any pages are populated. */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cc5be788a39..c0327380718 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2324,11 +2324,8 @@ retry_avoidcopy:
* and just make the page writable */
avoidcopy = (page_mapcount(old_page) == 1);
if (avoidcopy) {
- if (!trylock_page(old_page)) {
- if (PageAnon(old_page))
- page_move_anon_rmap(old_page, vma, address);
- } else
- unlock_page(old_page);
+ if (PageAnon(old_page))
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2404,7 +2401,7 @@ retry_avoidcopy:
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page);
- hugepage_add_anon_rmap(new_page, vma, address);
+ hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
mmu_notifier_invalidate_range_end(mm,
@@ -2631,10 +2628,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}
- if (!pagecache_page) {
- page = pte_page(entry);
+ /*
+ * hugetlb_cow() requires page locks of pte_page(entry) and
+ * pagecache_page, so here we need take the former one
+ * when page != pagecache_page or !pagecache_page.
+ * Note that locking order is always pagecache_page -> page,
+ * so no worry about deadlock.
+ */
+ page = pte_page(entry);
+ if (page != pagecache_page)
lock_page(page);
- }
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
@@ -2661,9 +2664,8 @@ out_page_table_lock:
if (pagecache_page) {
unlock_page(pagecache_page);
put_page(pagecache_page);
- } else {
- unlock_page(page);
}
+ unlock_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
diff --git a/mm/ksm.c b/mm/ksm.c
index b1873cf03ed..65ab5c7067d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -712,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (!ptep)
goto out;
- if (pte_write(*ptep)) {
+ if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
swapped = PageSwapCache(page);
@@ -735,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at(mm, addr, ptep, entry);
goto out_unlock;
}
- entry = pte_wrprotect(entry);
+ if (pte_dirty(entry))
+ set_page_dirty(page);
+ entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, addr, ptep, entry);
}
*orig_pte = *ptep;
diff --git a/mm/memblock.c b/mm/memblock.c
index 43840b305ec..400dc62697d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -11,237 +11,423 @@
*/
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/pfn.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/memblock.h>
-#define MEMBLOCK_ALLOC_ANYWHERE 0
+struct memblock memblock __initdata_memblock;
-struct memblock memblock;
+int memblock_debug __initdata_memblock;
+int memblock_can_resize __initdata_memblock;
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
-static int memblock_debug;
+/* inline so we don't get a warning when pr_debug is compiled out */
+static inline const char *memblock_type_name(struct memblock_type *type)
+{
+ if (type == &memblock.memory)
+ return "memory";
+ else if (type == &memblock.reserved)
+ return "reserved";
+ else
+ return "unknown";
+}
-static int __init early_memblock(char *p)
+/*
+ * Address comparison utilities
+ */
+
+static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
{
- if (p && strstr(p, "debug"))
- memblock_debug = 1;
+ return addr & ~(size - 1);
+}
+
+static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
+{
+ return (addr + (size - 1)) & ~(size - 1);
+}
+
+static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+}
+
+static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ if (base2 == base1 + size1)
+ return 1;
+ else if (base1 == base2 + size2)
+ return -1;
+
return 0;
}
-early_param("memblock", early_memblock);
-static void memblock_dump(struct memblock_region *region, char *name)
+static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
+ unsigned long r1, unsigned long r2)
{
- unsigned long long base, size;
- int i;
+ phys_addr_t base1 = type->regions[r1].base;
+ phys_addr_t size1 = type->regions[r1].size;
+ phys_addr_t base2 = type->regions[r2].base;
+ phys_addr_t size2 = type->regions[r2].size;
- pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+ return memblock_addrs_adjacent(base1, size1, base2, size2);
+}
- for (i = 0; i < region->cnt; i++) {
- base = region->region[i].base;
- size = region->region[i].size;
+long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+{
+ unsigned long i;
- pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n",
- name, i, base, base + size - 1, size);
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
+ if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ break;
}
+
+ return (i < type->cnt) ? i : -1;
}
-void memblock_dump_all(void)
+/*
+ * Find, allocate, deallocate or reserve unreserved regions. All allocations
+ * are top-down.
+ */
+
+static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align)
{
- if (!memblock_debug)
- return;
+ phys_addr_t base, res_base;
+ long j;
- pr_info("MEMBLOCK configuration:\n");
- pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size);
- pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size);
+ /* In case, huge size is requested */
+ if (end < size)
+ return MEMBLOCK_ERROR;
- memblock_dump(&memblock.memory, "memory");
- memblock_dump(&memblock.reserved, "reserved");
+ base = memblock_align_down((end - size), align);
+
+ /* Prevent allocations returning 0 as it's also used to
+ * indicate an allocation failure
+ */
+ if (start == 0)
+ start = PAGE_SIZE;
+
+ while (start <= base) {
+ j = memblock_overlaps_region(&memblock.reserved, base, size);
+ if (j < 0)
+ return base;
+ res_base = memblock.reserved.regions[j].base;
+ if (res_base < size)
+ break;
+ base = memblock_align_down(res_base - size, align);
+ }
+
+ return MEMBLOCK_ERROR;
}
-static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2,
- u64 size2)
+static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start, phys_addr_t end)
{
- return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+ long i;
+
+ BUG_ON(0 == size);
+
+ size = memblock_align_up(size, align);
+
+ /* Pump up max_addr */
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+ end = memblock.current_limit;
+
+ /* We do a top-down search, this tends to limit memory
+ * fragmentation by keeping early boot allocs near the
+ * top of memory
+ */
+ for (i = memblock.memory.cnt - 1; i >= 0; i--) {
+ phys_addr_t memblockbase = memblock.memory.regions[i].base;
+ phys_addr_t memblocksize = memblock.memory.regions[i].size;
+ phys_addr_t bottom, top, found;
+
+ if (memblocksize < size)
+ continue;
+ if ((memblockbase + memblocksize) <= start)
+ break;
+ bottom = max(memblockbase, start);
+ top = min(memblockbase + memblocksize, end);
+ if (bottom >= top)
+ continue;
+ found = memblock_find_region(bottom, top, size, align);
+ if (found != MEMBLOCK_ERROR)
+ return found;
+ }
+ return MEMBLOCK_ERROR;
}
-static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2)
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
{
- if (base2 == base1 + size1)
- return 1;
- else if (base1 == base2 + size2)
- return -1;
+ return memblock_find_base(size, align, start, end);
+}
- return 0;
+/*
+ * Free memblock.reserved.regions
+ */
+int __init_memblock memblock_free_reserved_regions(void)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
+
+ return memblock_free(__pa(memblock.reserved.regions),
+ sizeof(struct memblock_region) * memblock.reserved.max);
}
-static long memblock_regions_adjacent(struct memblock_region *rgn,
- unsigned long r1, unsigned long r2)
+/*
+ * Reserve memblock.reserved.regions
+ */
+int __init_memblock memblock_reserve_reserved_regions(void)
{
- u64 base1 = rgn->region[r1].base;
- u64 size1 = rgn->region[r1].size;
- u64 base2 = rgn->region[r2].base;
- u64 size2 = rgn->region[r2].size;
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
- return memblock_addrs_adjacent(base1, size1, base2, size2);
+ return memblock_reserve(__pa(memblock.reserved.regions),
+ sizeof(struct memblock_region) * memblock.reserved.max);
}
-static void memblock_remove_region(struct memblock_region *rgn, unsigned long r)
+static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
unsigned long i;
- for (i = r; i < rgn->cnt - 1; i++) {
- rgn->region[i].base = rgn->region[i + 1].base;
- rgn->region[i].size = rgn->region[i + 1].size;
+ for (i = r; i < type->cnt - 1; i++) {
+ type->regions[i].base = type->regions[i + 1].base;
+ type->regions[i].size = type->regions[i + 1].size;
}
- rgn->cnt--;
+ type->cnt--;
}
/* Assumption: base addr of region 1 < base addr of region 2 */
-static void memblock_coalesce_regions(struct memblock_region *rgn,
+static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
unsigned long r1, unsigned long r2)
{
- rgn->region[r1].size += rgn->region[r2].size;
- memblock_remove_region(rgn, r2);
+ type->regions[r1].size += type->regions[r2].size;
+ memblock_remove_region(type, r2);
}
-void __init memblock_init(void)
+/* Defined below but needed now */
+static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
+
+static int __init_memblock memblock_double_array(struct memblock_type *type)
{
- /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
- * This simplifies the memblock_add() code below...
+ struct memblock_region *new_array, *old_array;
+ phys_addr_t old_size, new_size, addr;
+ int use_slab = slab_is_available();
+
+ /* We don't allow resizing until we know about the reserved regions
+ * of memory that aren't suitable for allocation
*/
- memblock.memory.region[0].base = 0;
- memblock.memory.region[0].size = 0;
- memblock.memory.cnt = 1;
+ if (!memblock_can_resize)
+ return -1;
- /* Ditto. */
- memblock.reserved.region[0].base = 0;
- memblock.reserved.region[0].size = 0;
- memblock.reserved.cnt = 1;
-}
+ /* Calculate new doubled size */
+ old_size = type->max * sizeof(struct memblock_region);
+ new_size = old_size << 1;
+
+ /* Try to find some space for it.
+ *
+ * WARNING: We assume that either slab_is_available() and we use it or
+ * we use MEMBLOCK for allocations. That means that this is unsafe to use
+ * when bootmem is currently active (unless bootmem itself is implemented
+ * on top of MEMBLOCK which isn't the case yet)
+ *
+ * This should however not be an issue for now, as we currently only
+ * call into MEMBLOCK while it's still active, or much later when slab is
+ * active for memory hotplug operations
+ */
+ if (use_slab) {
+ new_array = kmalloc(new_size, GFP_KERNEL);
+ addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
+ } else
+ addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (addr == MEMBLOCK_ERROR) {
+ pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
+ memblock_type_name(type), type->max, type->max * 2);
+ return -1;
+ }
+ new_array = __va(addr);
-void __init memblock_analyze(void)
-{
- int i;
+ memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
+ memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
- memblock.memory.size = 0;
+ /* Found space, we now need to move the array over before
+ * we add the reserved region since it may be our reserved
+ * array itself that is full.
+ */
+ memcpy(new_array, type->regions, old_size);
+ memset(new_array + type->max, 0, old_size);
+ old_array = type->regions;
+ type->regions = new_array;
+ type->max <<= 1;
+
+ /* If we use SLAB that's it, we are done */
+ if (use_slab)
+ return 0;
- for (i = 0; i < memblock.memory.cnt; i++)
- memblock.memory.size += memblock.memory.region[i].size;
+ /* Add the new reserved region now. Should not fail ! */
+ BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
+
+ /* If the array wasn't our static init one, then free it. We only do
+ * that before SLAB is available as later on, we don't know whether
+ * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
+ * anyways
+ */
+ if (old_array != memblock_memory_init_regions &&
+ old_array != memblock_reserved_init_regions)
+ memblock_free(__pa(old_array), old_size);
+
+ return 0;
}
-static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size)
+extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
+ phys_addr_t addr2, phys_addr_t size2)
+{
+ return 1;
+}
+
+static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
{
unsigned long coalesced = 0;
long adjacent, i;
- if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) {
- rgn->region[0].base = base;
- rgn->region[0].size = size;
+ if ((type->cnt == 1) && (type->regions[0].size == 0)) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
return 0;
}
/* First try and coalesce this MEMBLOCK with another. */
- for (i = 0; i < rgn->cnt; i++) {
- u64 rgnbase = rgn->region[i].base;
- u64 rgnsize = rgn->region[i].size;
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
if ((rgnbase == base) && (rgnsize == size))
/* Already have this region, so we're done */
return 0;
adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
+ /* Check if arch allows coalescing */
+ if (adjacent != 0 && type == &memblock.memory &&
+ !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
+ break;
if (adjacent > 0) {
- rgn->region[i].base -= size;
- rgn->region[i].size += size;
+ type->regions[i].base -= size;
+ type->regions[i].size += size;
coalesced++;
break;
} else if (adjacent < 0) {
- rgn->region[i].size += size;
+ type->regions[i].size += size;
coalesced++;
break;
}
}
- if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) {
- memblock_coalesce_regions(rgn, i, i+1);
+ /* If we plugged a hole, we may want to also coalesce with the
+ * next region
+ */
+ if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
+ ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
+ type->regions[i].size,
+ type->regions[i+1].base,
+ type->regions[i+1].size)))) {
+ memblock_coalesce_regions(type, i, i+1);
coalesced++;
}
if (coalesced)
return coalesced;
- if (rgn->cnt >= MAX_MEMBLOCK_REGIONS)
+
+ /* If we are out of space, we fail. It's too late to resize the array
+ * but then this shouldn't have happened in the first place.
+ */
+ if (WARN_ON(type->cnt >= type->max))
return -1;
/* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
- for (i = rgn->cnt - 1; i >= 0; i--) {
- if (base < rgn->region[i].base) {
- rgn->region[i+1].base = rgn->region[i].base;
- rgn->region[i+1].size = rgn->region[i].size;
+ for (i = type->cnt - 1; i >= 0; i--) {
+ if (base < type->regions[i].base) {
+ type->regions[i+1].base = type->regions[i].base;
+ type->regions[i+1].size = type->regions[i].size;
} else {
- rgn->region[i+1].base = base;
- rgn->region[i+1].size = size;
+ type->regions[i+1].base = base;
+ type->regions[i+1].size = size;
break;
}
}
- if (base < rgn->region[0].base) {
- rgn->region[0].base = base;
- rgn->region[0].size = size;
+ if (base < type->regions[0].base) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ }
+ type->cnt++;
+
+ /* The array is full ? Try to resize it. If that fails, we undo
+ * our allocation and return an error
+ */
+ if (type->cnt == type->max && memblock_double_array(type)) {
+ type->cnt--;
+ return -1;
}
- rgn->cnt++;
return 0;
}
-long memblock_add(u64 base, u64 size)
+long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- struct memblock_region *_rgn = &memblock.memory;
-
- /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
- if (base == 0)
- memblock.rmo_size = size;
-
- return memblock_add_region(_rgn, base, size);
+ return memblock_add_region(&memblock.memory, base, size);
}
-static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
+static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
{
- u64 rgnbegin, rgnend;
- u64 end = base + size;
+ phys_addr_t rgnbegin, rgnend;
+ phys_addr_t end = base + size;
int i;
rgnbegin = rgnend = 0; /* supress gcc warnings */
/* Find the region where (base, size) belongs to */
- for (i=0; i < rgn->cnt; i++) {
- rgnbegin = rgn->region[i].base;
- rgnend = rgnbegin + rgn->region[i].size;
+ for (i=0; i < type->cnt; i++) {
+ rgnbegin = type->regions[i].base;
+ rgnend = rgnbegin + type->regions[i].size;
if ((rgnbegin <= base) && (end <= rgnend))
break;
}
/* Didn't find the region */
- if (i == rgn->cnt)
+ if (i == type->cnt)
return -1;
/* Check to see if we are removing entire region */
if ((rgnbegin == base) && (rgnend == end)) {
- memblock_remove_region(rgn, i);
+ memblock_remove_region(type, i);
return 0;
}
/* Check to see if region is matching at the front */
if (rgnbegin == base) {
- rgn->region[i].base = end;
- rgn->region[i].size -= size;
+ type->regions[i].base = end;
+ type->regions[i].size -= size;
return 0;
}
/* Check to see if the region is matching at the end */
if (rgnend == end) {
- rgn->region[i].size -= size;
+ type->regions[i].size -= size;
return 0;
}
@@ -249,208 +435,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
* We need to split the entry - adjust the current one to the
* beginging of the hole and add the region after hole.
*/
- rgn->region[i].size = base - rgn->region[i].base;
- return memblock_add_region(rgn, end, rgnend - end);
+ type->regions[i].size = base - type->regions[i].base;
+ return memblock_add_region(type, end, rgnend - end);
}
-long memblock_remove(u64 base, u64 size)
+long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
return __memblock_remove(&memblock.memory, base, size);
}
-long __init memblock_free(u64 base, u64 size)
+long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
return __memblock_remove(&memblock.reserved, base, size);
}
-long __init memblock_reserve(u64 base, u64 size)
+long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
- struct memblock_region *_rgn = &memblock.reserved;
+ struct memblock_type *_rgn = &memblock.reserved;
BUG_ON(0 == size);
return memblock_add_region(_rgn, base, size);
}
-long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size)
+phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- unsigned long i;
+ phys_addr_t found;
- for (i = 0; i < rgn->cnt; i++) {
- u64 rgnbase = rgn->region[i].base;
- u64 rgnsize = rgn->region[i].size;
- if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
- break;
- }
+ /* We align the size to limit fragmentation. Without this, a lot of
+ * small allocs quickly eat up the whole reserve array on sparc
+ */
+ size = memblock_align_up(size, align);
- return (i < rgn->cnt) ? i : -1;
+ found = memblock_find_base(size, align, 0, max_addr);
+ if (found != MEMBLOCK_ERROR &&
+ memblock_add_region(&memblock.reserved, found, size) >= 0)
+ return found;
+
+ return 0;
}
-static u64 memblock_align_down(u64 addr, u64 size)
+phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return addr & ~(size - 1);
+ phys_addr_t alloc;
+
+ alloc = __memblock_alloc_base(size, align, max_addr);
+
+ if (alloc == 0)
+ panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+ (unsigned long long) size, (unsigned long long) max_addr);
+
+ return alloc;
}
-static u64 memblock_align_up(u64 addr, u64 size)
+phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
{
- return (addr + (size - 1)) & ~(size - 1);
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
-static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end,
- u64 size, u64 align)
+
+/*
+ * Additional node-local allocators. Search for node memory is bottom up
+ * and walks memblock regions within that node bottom-up as well, but allocation
+ * within an memblock region is top-down. XXX I plan to fix that at some stage
+ *
+ * WARNING: Only available after early_node_map[] has been populated,
+ * on some architectures, that is after all the calls to add_active_range()
+ * have been done to populate it.
+ */
+
+phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
{
- u64 base, res_base;
- long j;
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ /*
+ * This code originates from sparc which really wants use to walk by addresses
+ * and returns the nid. This is not very convenient for early_pfn_map[] users
+ * as the map isn't sorted yet, and it really wants to be walked by nid.
+ *
+ * For now, I implement the inefficient method below which walks the early
+ * map multiple times. Eventually we may want to use an ARCH config option
+ * to implement a completely different method for both case.
+ */
+ unsigned long start_pfn, end_pfn;
+ int i;
- base = memblock_align_down((end - size), align);
- while (start <= base) {
- j = memblock_overlaps_region(&memblock.reserved, base, size);
- if (j < 0) {
- /* this area isn't reserved, take it */
- if (memblock_add_region(&memblock.reserved, base, size) < 0)
- base = ~(u64)0;
- return base;
- }
- res_base = memblock.reserved.region[j].base;
- if (res_base < size)
- break;
- base = memblock_align_down(res_base - size, align);
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
+ if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
+ continue;
+ *nid = i;
+ return min(end, PFN_PHYS(end_pfn));
}
+#endif
+ *nid = 0;
- return ~(u64)0;
+ return end;
}
-static u64 __init memblock_alloc_nid_region(struct memblock_property *mp,
- u64 (*nid_range)(u64, u64, int *),
- u64 size, u64 align, int nid)
+static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
+ phys_addr_t size,
+ phys_addr_t align, int nid)
{
- u64 start, end;
+ phys_addr_t start, end;
start = mp->base;
end = start + mp->size;
start = memblock_align_up(start, align);
while (start < end) {
- u64 this_end;
+ phys_addr_t this_end;
int this_nid;
- this_end = nid_range(start, end, &this_nid);
+ this_end = memblock_nid_range(start, end, &this_nid);
if (this_nid == nid) {
- u64 ret = memblock_alloc_nid_unreserved(start, this_end,
- size, align);
- if (ret != ~(u64)0)
+ phys_addr_t ret = memblock_find_region(start, this_end, size, align);
+ if (ret != MEMBLOCK_ERROR &&
+ memblock_add_region(&memblock.reserved, ret, size) >= 0)
return ret;
}
start = this_end;
}
- return ~(u64)0;
+ return MEMBLOCK_ERROR;
}
-u64 __init memblock_alloc_nid(u64 size, u64 align, int nid,
- u64 (*nid_range)(u64 start, u64 end, int *nid))
+phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- struct memblock_region *mem = &memblock.memory;
+ struct memblock_type *mem = &memblock.memory;
int i;
BUG_ON(0 == size);
+ /* We align the size to limit fragmentation. Without this, a lot of
+ * small allocs quickly eat up the whole reserve array on sparc
+ */
size = memblock_align_up(size, align);
+ /* We do a bottom-up search for a region with the right
+ * nid since that's easier considering how memblock_nid_range()
+ * works
+ */
for (i = 0; i < mem->cnt; i++) {
- u64 ret = memblock_alloc_nid_region(&mem->region[i],
- nid_range,
+ phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
size, align, nid);
- if (ret != ~(u64)0)
+ if (ret != MEMBLOCK_ERROR)
return ret;
}
- return memblock_alloc(size, align);
-}
-
-u64 __init memblock_alloc(u64 size, u64 align)
-{
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
+ return 0;
}
-u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr)
+phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- u64 alloc;
-
- alloc = __memblock_alloc_base(size, align, max_addr);
+ phys_addr_t res = memblock_alloc_nid(size, align, nid);
- if (alloc == 0)
- panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
- (unsigned long long) size, (unsigned long long) max_addr);
-
- return alloc;
+ if (res)
+ return res;
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
}
-u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
-{
- long i, j;
- u64 base = 0;
- u64 res_base;
-
- BUG_ON(0 == size);
- size = memblock_align_up(size, align);
-
- /* On some platforms, make sure we allocate lowmem */
- /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
- if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
- max_addr = MEMBLOCK_REAL_LIMIT;
-
- for (i = memblock.memory.cnt - 1; i >= 0; i--) {
- u64 memblockbase = memblock.memory.region[i].base;
- u64 memblocksize = memblock.memory.region[i].size;
-
- if (memblocksize < size)
- continue;
- if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
- base = memblock_align_down(memblockbase + memblocksize - size, align);
- else if (memblockbase < max_addr) {
- base = min(memblockbase + memblocksize, max_addr);
- base = memblock_align_down(base - size, align);
- } else
- continue;
-
- while (base && memblockbase <= base) {
- j = memblock_overlaps_region(&memblock.reserved, base, size);
- if (j < 0) {
- /* this area isn't reserved, take it */
- if (memblock_add_region(&memblock.reserved, base, size) < 0)
- return 0;
- return base;
- }
- res_base = memblock.reserved.region[j].base;
- if (res_base < size)
- break;
- base = memblock_align_down(res_base - size, align);
- }
- }
- return 0;
-}
+/*
+ * Remaining API functions
+ */
/* You must call memblock_analyze() before this. */
-u64 __init memblock_phys_mem_size(void)
+phys_addr_t __init memblock_phys_mem_size(void)
{
- return memblock.memory.size;
+ return memblock.memory_size;
}
-u64 memblock_end_of_DRAM(void)
+phys_addr_t __init_memblock memblock_end_of_DRAM(void)
{
int idx = memblock.memory.cnt - 1;
- return (memblock.memory.region[idx].base + memblock.memory.region[idx].size);
+ return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
}
/* You must call memblock_analyze() after this. */
-void __init memblock_enforce_memory_limit(u64 memory_limit)
+void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
{
unsigned long i;
- u64 limit;
- struct memblock_property *p;
+ phys_addr_t limit;
+ struct memblock_region *p;
if (!memory_limit)
return;
@@ -458,24 +625,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
/* Truncate the memblock regions to satisfy the memory limit. */
limit = memory_limit;
for (i = 0; i < memblock.memory.cnt; i++) {
- if (limit > memblock.memory.region[i].size) {
- limit -= memblock.memory.region[i].size;
+ if (limit > memblock.memory.regions[i].size) {
+ limit -= memblock.memory.regions[i].size;
continue;
}
- memblock.memory.region[i].size = limit;
+ memblock.memory.regions[i].size = limit;
memblock.memory.cnt = i + 1;
break;
}
- if (memblock.memory.region[0].size < memblock.rmo_size)
- memblock.rmo_size = memblock.memory.region[0].size;
-
memory_limit = memblock_end_of_DRAM();
/* And truncate any reserves above the limit also. */
for (i = 0; i < memblock.reserved.cnt; i++) {
- p = &memblock.reserved.region[i];
+ p = &memblock.reserved.regions[i];
if (p->base > memory_limit)
p->size = 0;
@@ -489,53 +653,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
}
}
-int __init memblock_is_reserved(u64 addr)
+static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+{
+ unsigned int left = 0, right = type->cnt;
+
+ do {
+ unsigned int mid = (right + left) / 2;
+
+ if (addr < type->regions[mid].base)
+ right = mid;
+ else if (addr >= (type->regions[mid].base +
+ type->regions[mid].size))
+ left = mid + 1;
+ else
+ return mid;
+ } while (left < right);
+ return -1;
+}
+
+int __init memblock_is_reserved(phys_addr_t addr)
+{
+ return memblock_search(&memblock.reserved, addr) != -1;
+}
+
+int __init_memblock memblock_is_memory(phys_addr_t addr)
+{
+ return memblock_search(&memblock.memory, addr) != -1;
+}
+
+int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
+{
+ int idx = memblock_search(&memblock.reserved, base);
+
+ if (idx == -1)
+ return 0;
+ return memblock.reserved.regions[idx].base <= base &&
+ (memblock.reserved.regions[idx].base +
+ memblock.reserved.regions[idx].size) >= (base + size);
+}
+
+int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+}
+
+
+void __init_memblock memblock_set_current_limit(phys_addr_t limit)
{
+ memblock.current_limit = limit;
+}
+
+static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
+{
+ unsigned long long base, size;
int i;
- for (i = 0; i < memblock.reserved.cnt; i++) {
- u64 upper = memblock.reserved.region[i].base +
- memblock.reserved.region[i].size - 1;
- if ((addr >= memblock.reserved.region[i].base) && (addr <= upper))
- return 1;
+ pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+
+ for (i = 0; i < region->cnt; i++) {
+ base = region->regions[i].base;
+ size = region->regions[i].size;
+
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
+ name, i, base, base + size - 1, size);
}
- return 0;
}
-int memblock_is_region_reserved(u64 base, u64 size)
+void __init_memblock memblock_dump_all(void)
{
- return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+ if (!memblock_debug)
+ return;
+
+ pr_info("MEMBLOCK configuration:\n");
+ pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
+
+ memblock_dump(&memblock.memory, "memory");
+ memblock_dump(&memblock.reserved, "reserved");
}
-/*
- * Given a <base, len>, find which memory regions belong to this range.
- * Adjust the request and return a contiguous chunk.
- */
-int memblock_find(struct memblock_property *res)
+void __init memblock_analyze(void)
{
int i;
- u64 rstart, rend;
- rstart = res->base;
- rend = rstart + res->size - 1;
+ /* Check marker in the unused last array entry */
+ WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
+ != (phys_addr_t)RED_INACTIVE);
+ WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
+ != (phys_addr_t)RED_INACTIVE);
+
+ memblock.memory_size = 0;
+
+ for (i = 0; i < memblock.memory.cnt; i++)
+ memblock.memory_size += memblock.memory.regions[i].size;
+
+ /* We allow resizing from there */
+ memblock_can_resize = 1;
+}
+
+void __init memblock_init(void)
+{
+ static int init_done __initdata = 0;
+
+ if (init_done)
+ return;
+ init_done = 1;
+
+ /* Hookup the initial arrays */
+ memblock.memory.regions = memblock_memory_init_regions;
+ memblock.memory.max = INIT_MEMBLOCK_REGIONS;
+ memblock.reserved.regions = memblock_reserved_init_regions;
+ memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
+
+ /* Write a marker in the unused last array entry */
+ memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+ memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+
+ /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
+ * This simplifies the memblock_add() code below...
+ */
+ memblock.memory.regions[0].base = 0;
+ memblock.memory.regions[0].size = 0;
+ memblock.memory.cnt = 1;
+
+ /* Ditto. */
+ memblock.reserved.regions[0].base = 0;
+ memblock.reserved.regions[0].size = 0;
+ memblock.reserved.cnt = 1;
+
+ memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
+}
+
+static int __init early_memblock(char *p)
+{
+ if (p && strstr(p, "debug"))
+ memblock_debug = 1;
+ return 0;
+}
+early_param("memblock", early_memblock);
+
+#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
+
+static int memblock_debug_show(struct seq_file *m, void *private)
+{
+ struct memblock_type *type = m->private;
+ struct memblock_region *reg;
+ int i;
+
+ for (i = 0; i < type->cnt; i++) {
+ reg = &type->regions[i];
+ seq_printf(m, "%4d: ", i);
+ if (sizeof(phys_addr_t) == 4)
+ seq_printf(m, "0x%08lx..0x%08lx\n",
+ (unsigned long)reg->base,
+ (unsigned long)(reg->base + reg->size - 1));
+ else
+ seq_printf(m, "0x%016llx..0x%016llx\n",
+ (unsigned long long)reg->base,
+ (unsigned long long)(reg->base + reg->size - 1));
- for (i = 0; i < memblock.memory.cnt; i++) {
- u64 start = memblock.memory.region[i].base;
- u64 end = start + memblock.memory.region[i].size - 1;
-
- if (start > rend)
- return -1;
-
- if ((end >= rstart) && (start < rend)) {
- /* adjust the request */
- if (rstart < start)
- rstart = start;
- if (rend > end)
- rend = end;
- res->base = rstart;
- res->size = rend - rstart + 1;
- return 0;
- }
}
- return -1;
+ return 0;
+}
+
+static int memblock_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, memblock_debug_show, inode->i_private);
}
+
+static const struct file_operations memblock_debug_fops = {
+ .open = memblock_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init memblock_init_debugfs(void)
+{
+ struct dentry *root = debugfs_create_dir("memblock", NULL);
+ if (!root)
+ return -ENXIO;
+ debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
+ debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
+
+ return 0;
+}
+__initcall(memblock_init_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3eed583895a..9be3cf8a5da 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3587,9 +3587,13 @@ unlock:
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
- __mem_cgroup_threshold(memcg, false);
- if (do_swap_account)
- __mem_cgroup_threshold(memcg, true);
+ while (memcg) {
+ __mem_cgroup_threshold(memcg, false);
+ if (do_swap_account)
+ __mem_cgroup_threshold(memcg, true);
+
+ memcg = parent_mem_cgroup(memcg);
+ }
}
static int compare_thresholds(const void *a, const void *b)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9c26eeca134..757f6b0accf 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter);
* signal.
*/
static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn)
+ unsigned long pfn, struct page *page)
{
struct siginfo si;
int ret;
@@ -198,7 +198,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = PAGE_SHIFT;
+ si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
/*
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
@@ -235,7 +235,7 @@ void shake_page(struct page *p, int access)
int nr;
do {
nr = shrink_slab(1000, GFP_KERNEL, 1000);
- if (page_count(p) == 0)
+ if (page_count(p) == 1)
break;
} while (nr > 10);
}
@@ -327,7 +327,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* wrong earlier.
*/
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, unsigned long pfn)
+ int fail, struct page *page, unsigned long pfn)
{
struct to_kill *tk, *next;
@@ -352,7 +352,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
* process anyways.
*/
else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn) < 0)
+ pfn, page) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
@@ -928,7 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* any accesses to the poisoned memory.
*/
kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
- ret != SWAP_SUCCESS, pfn);
+ ret != SWAP_SUCCESS, p, pfn);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 71b161b73bb..98b58fecede 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2680,10 +2680,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
/*
- * Make sure try_to_free_swap didn't release the swapcache
- * from under us. The page pin isn't enough to prevent that.
+ * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+ * release the swapcache from under us. The page pin, and pte_same
+ * test below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not changed.
*/
- if (unlikely(!PageSwapCache(page)))
+ if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
if (ksm_might_need_to_copy(page, vma, address)) {
@@ -3183,7 +3185,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
- flush_tlb_page(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index 6128dc8e5ed..00161a48a45 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2009,6 +2009,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
removed_exe_file_vma(mm);
fput(new->vm_file);
}
+ unlink_anon_vmas(new);
out_free_mpol:
mpol_put(pol);
out_free_vma:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fc81cb22869..4029583a102 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -121,8 +121,8 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
}
/* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
- const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p,
+ const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
if (is_global_init(p))
return true;
@@ -208,8 +208,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
*/
points += p->signal->oom_score_adj;
- if (points < 0)
- return 0;
+ /*
+ * Never return 0 for an eligible task that may be killed since it's
+ * possible that no single user task uses more than 0.1% of memory and
+ * no single admin tasks uses more than 3.0%.
+ */
+ if (points <= 0)
+ return 1;
return (points < 1000) ? points : 1000;
}
@@ -339,26 +344,24 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
* @mem: current's memory controller, if constrained
+ * @nodemask: nodemask passed to page allocator for mempolicy ooms
*
- * Dumps the current memory state of all system tasks, excluding kernel threads.
+ * Dumps the current memory state of all eligible tasks. Tasks not in the same
+ * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
+ * are not shown.
* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
* value, oom_score_adj value, and name.
*
- * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
- * shown.
- *
* Call with tasklist_lock read-locked.
*/
-static void dump_tasks(const struct mem_cgroup *mem)
+static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *task;
pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
for_each_process(p) {
- if (p->flags & PF_KTHREAD)
- continue;
- if (mem && !task_in_mem_cgroup(p, mem))
+ if (oom_unkillable_task(p, mem, nodemask))
continue;
task = find_lock_task_mm(p);
@@ -381,7 +384,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
}
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *mem)
+ struct mem_cgroup *mem, const nodemask_t *nodemask)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -394,7 +397,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
mem_cgroup_print_oom_info(mem, p);
show_mem();
if (sysctl_oom_dump_tasks)
- dump_tasks(mem);
+ dump_tasks(mem, nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -436,7 +439,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int victim_points = 0;
if (printk_ratelimit())
- dump_header(p, gfp_mask, order, mem);
+ dump_header(p, gfp_mask, order, mem, nodemask);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -482,7 +485,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order)
+ int order, const nodemask_t *nodemask)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -496,7 +499,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
return;
}
read_lock(&tasklist_lock);
- dump_header(NULL, gfp_mask, order, NULL);
+ dump_header(NULL, gfp_mask, order, NULL, nodemask);
read_unlock(&tasklist_lock);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@@ -509,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
unsigned int points = 0;
struct task_struct *p;
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
retry:
@@ -641,6 +644,7 @@ static void clear_system_oom(void)
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask)
{
+ const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
@@ -670,7 +674,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
&totalpages);
- check_panic_on_oom(constraint, gfp_mask, order);
+ mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
@@ -688,15 +693,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
}
retry:
- p = select_bad_process(&points, totalpages, NULL,
- constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
- NULL);
+ p = select_bad_process(&points, totalpages, NULL, mpol_mask);
if (PTR_ERR(p) == -1UL)
goto out;
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
- dump_header(NULL, gfp_mask, order, NULL);
+ dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a8cfa9cc6e8..2a362c52fdf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
@@ -3636,6 +3637,41 @@ void __init free_bootmem_with_active_regions(int nid,
}
}
+#ifdef CONFIG_HAVE_MEMBLOCK
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ int i;
+
+ /* Need to go over early_node_map to find out good range for node */
+ for_each_active_range_index_in_nid(i, nid) {
+ u64 addr;
+ u64 ei_start, ei_last;
+ u64 final_start, final_end;
+
+ ei_last = early_node_map[i].end_pfn;
+ ei_last <<= PAGE_SHIFT;
+ ei_start = early_node_map[i].start_pfn;
+ ei_start <<= PAGE_SHIFT;
+
+ final_start = max(ei_start, goal);
+ final_end = min(ei_last, limit);
+
+ if (final_start >= final_end)
+ continue;
+
+ addr = memblock_find_in_range(final_start, final_end, size, align);
+
+ if (addr == MEMBLOCK_ERROR)
+ continue;
+
+ return addr;
+ }
+
+ return MEMBLOCK_ERROR;
+}
+#endif
+
int __init add_from_early_node_map(struct range *range, int az,
int nr_range, int nid)
{
@@ -3655,46 +3691,26 @@ int __init add_from_early_node_map(struct range *range, int az,
void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
u64 goal, u64 limit)
{
- int i;
void *ptr;
+ u64 addr;
- if (limit > get_max_mapped())
- limit = get_max_mapped();
-
- /* need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid(i, nid) {
- u64 addr;
- u64 ei_start, ei_last;
-
- ei_last = early_node_map[i].end_pfn;
- ei_last <<= PAGE_SHIFT;
- ei_start = early_node_map[i].start_pfn;
- ei_start <<= PAGE_SHIFT;
- addr = find_early_area(ei_start, ei_last,
- goal, limit, size, align);
-
- if (addr == -1ULL)
- continue;
+ if (limit > memblock.current_limit)
+ limit = memblock.current_limit;
-#if 0
- printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
- nid,
- ei_start, ei_last, goal, limit, size,
- align, addr);
-#endif
+ addr = find_memory_core_early(nid, size, align, goal, limit);
- ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
- reserve_early_without_check(addr, addr + size, "BOOTMEM");
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks.
- */
- kmemleak_alloc(ptr, size, 0, 0);
- return ptr;
- }
+ if (addr == MEMBLOCK_ERROR)
+ return NULL;
- return NULL;
+ ptr = phys_to_virt(addr);
+ memset(ptr, 0, size);
+ memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+ return ptr;
}
#endif
@@ -5182,9 +5198,9 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
+ printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
tablename,
- (1U << log2qty),
+ (1UL << log2qty),
ilog2(size) - PAGE_SHIFT,
size);
diff --git a/mm/percpu.c b/mm/percpu.c
index 58c572b18b0..c76ef3891e0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1401,9 +1401,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (pcpu_first_unit_cpu == NR_CPUS)
pcpu_first_unit_cpu = cpu;
+ pcpu_last_unit_cpu = cpu;
}
}
- pcpu_last_unit_cpu = cpu;
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
diff --git a/mm/rmap.c b/mm/rmap.c
index f6f0d2dda2e..92e6757f196 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -381,7 +381,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
if (PageAnon(page)) {
- if (vma->anon_vma->root != page_anon_vma(page)->root)
+ struct anon_vma *page__anon_vma = page_anon_vma(page);
+ /*
+ * Note: swapoff's unuse_vma() is more efficient with this
+ * check, and needs it to match anon_vma when KSM is active.
+ */
+ if (!vma->anon_vma || !page__anon_vma ||
+ vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
@@ -1564,13 +1570,14 @@ static void __hugepage_set_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, int exclusive)
{
struct anon_vma *anon_vma = vma->anon_vma;
+
BUG_ON(!anon_vma);
- if (!exclusive) {
- struct anon_vma_chain *avc;
- avc = list_entry(vma->anon_vma_chain.prev,
- struct anon_vma_chain, same_vma);
- anon_vma = avc->anon_vma;
- }
+
+ if (PageAnon(page))
+ return;
+ if (!exclusive)
+ anon_vma = anon_vma->root;
+
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
@@ -1581,6 +1588,8 @@ void hugepage_add_anon_rmap(struct page *page,
{
struct anon_vma *anon_vma = vma->anon_vma;
int first;
+
+ BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
first = atomic_inc_and_test(&page->_mapcount);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index aa33fd67fa4..29d6cbffb28 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (vmemmap_buf_start) {
/* need to free left buf */
-#ifdef CONFIG_NO_BOOTMEM
- free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
- if (vmemmap_buf_start < vmemmap_buf) {
- char name[15];
-
- snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
- reserve_early_without_check(__pa(vmemmap_buf_start),
- __pa(vmemmap_buf), name);
- }
-#else
free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
-#endif
vmemmap_buf = NULL;
vmemmap_buf_end = NULL;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a..d8087f0db50 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
static void purge_fragmented_blocks_allcpus(void);
/*
+ * called before a call to iounmap() if the caller wants vm_area_struct's
+ * immediately freed.
+ */
+void set_iounmap_nonlazy(void)
+{
+ atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
+/*
* Purges all lazily-freed vmap areas.
*
* If sync is 0 then don't purge if there is already a purge in progress.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c391c320dba..c5dfabf25f1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1804,12 +1804,11 @@ static void shrink_zone(int priority, struct zone *zone,
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static bool shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1827,8 +1826,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
}
shrink_zone(priority, zone, sc);
- all_unreclaimable = false;
}
+}
+
+static bool zone_reclaimable(struct zone *zone)
+{
+ return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
+/*
+ * As hibernation is going on, kswapd is freezed so that it can't mark
+ * the zone into all_unreclaimable. It can't handle OOM during hibernation.
+ * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
+ */
+static bool all_unreclaimable(struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ bool all_unreclaimable = true;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
+ if (!populated_zone(zone))
+ continue;
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+ if (zone_reclaimable(zone)) {
+ all_unreclaimable = false;
+ break;
+ }
+ }
+
return all_unreclaimable;
}
@@ -1852,7 +1881,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
- bool all_unreclaimable;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
@@ -1869,7 +1897,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- all_unreclaimable = shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1931,7 +1959,7 @@ out:
return sc->nr_reclaimed;
/* top priority shrink_zones still had more to do? don't OOM, then */
- if (scanning_global_lru(sc) && !all_unreclaimable)
+ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
return 0;
@@ -2197,8 +2225,7 @@ loop_again:
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 &&
- zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
+ if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and