diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 8 | ||||
-rw-r--r-- | mm/Makefile | 7 | ||||
-rw-r--r-- | mm/backing-dev.c | 2 | ||||
-rw-r--r-- | mm/bootmem.c | 13 | ||||
-rw-r--r-- | mm/fremap.c | 7 | ||||
-rw-r--r-- | mm/hugetlb.c | 24 | ||||
-rw-r--r-- | mm/ksm.c | 6 | ||||
-rw-r--r-- | mm/memblock.c | 837 | ||||
-rw-r--r-- | mm/memcontrol.c | 10 | ||||
-rw-r--r-- | mm/memory-failure.c | 12 | ||||
-rw-r--r-- | mm/memory.c | 10 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 1 | ||||
-rw-r--r-- | mm/oom_kill.c | 49 | ||||
-rw-r--r-- | mm/page_alloc.c | 90 | ||||
-rw-r--r-- | mm/percpu-km.c | 8 | ||||
-rw-r--r-- | mm/percpu.c | 405 | ||||
-rw-r--r-- | mm/percpu_up.c | 30 | ||||
-rw-r--r-- | mm/rmap.c | 27 | ||||
-rw-r--r-- | mm/slob.c | 4 | ||||
-rw-r--r-- | mm/slub.c | 788 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 11 | ||||
-rw-r--r-- | mm/swapfile.c | 6 | ||||
-rw-r--r-- | mm/util.c | 13 | ||||
-rw-r--r-- | mm/vmalloc.c | 11 | ||||
-rw-r--r-- | mm/vmscan.c | 47 |
26 files changed, 1454 insertions, 974 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index f0fb9124e41..c2c8a4a1189 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -301,3 +301,11 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. + +# +# UP and nommu archs use km based percpu allocator +# +config NEED_PER_CPU_KM + depends on !SMP + bool + default y diff --git a/mm/Makefile b/mm/Makefile index 34b2546a9e3..f73f75a29f8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o mmu_context.o \ + page_isolation.o mm_init.o mmu_context.o percpu.o \ $(mmu-y) obj-y += init-mm.o @@ -36,11 +36,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_SMP -obj-y += percpu.o -else -obj-y += percpu_up.o -endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c2bf86f470e..65d420499a6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info); struct backing_dev_info noop_backing_dev_info = { .name = "noop", + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; EXPORT_SYMBOL_GPL(noop_backing_dev_info); @@ -243,6 +244,7 @@ static int __init default_bdi_init(void) err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); + err = bdi_init(&noop_backing_dev_info); return err; } diff --git a/mm/bootmem.c b/mm/bootmem.c index 142c84a5499..13b0caa9793 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/kmemleak.h> #include <linux/range.h> +#include <linux/memblock.h> #include <asm/bug.h> #include <asm/io.h> @@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { #ifdef CONFIG_NO_BOOTMEM - free_early(physaddr, physaddr + size); + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); #else unsigned long start, end; @@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, void __init free_bootmem(unsigned long addr, unsigned long size) { #ifdef CONFIG_NO_BOOTMEM - free_early(addr, addr + size); + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); #else unsigned long start, end; @@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, } #ifndef CONFIG_NO_BOOTMEM +int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} + static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { diff --git a/mm/fremap.c b/mm/fremap.c index 46f5dacf90a..ec520c7b28d 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, { struct mm_struct *mm = current->mm; struct address_space *mapping; - unsigned long end = start + size; struct vm_area_struct *vma; int err = -EINVAL; int has_write_lock = 0; @@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (start + size <= start) return err; + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return err; + /* Can we represent this offset inside this architecture's pte's? */ #if PTE_FILE_MAX_BITS < BITS_PER_LONG if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) @@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (!(vma->vm_flags & VM_CAN_NONLINEAR)) goto out; - if (end <= start || start < vma->vm_start || end > vma->vm_end) + if (start < vma->vm_start || start + size > vma->vm_end) goto out; /* Must set VM_NONLINEAR before any pages are populated. */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cc5be788a39..c0327380718 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2324,11 +2324,8 @@ retry_avoidcopy: * and just make the page writable */ avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { - if (!trylock_page(old_page)) { - if (PageAnon(old_page)) - page_move_anon_rmap(old_page, vma, address); - } else - unlock_page(old_page); + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2404,7 +2401,7 @@ retry_avoidcopy: set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page); - hugepage_add_anon_rmap(new_page, vma, address); + hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; mmu_notifier_invalidate_range_end(mm, @@ -2631,10 +2628,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } - if (!pagecache_page) { - page = pte_page(entry); + /* + * hugetlb_cow() requires page locks of pte_page(entry) and + * pagecache_page, so here we need take the former one + * when page != pagecache_page or !pagecache_page. + * Note that locking order is always pagecache_page -> page, + * so no worry about deadlock. + */ + page = pte_page(entry); + if (page != pagecache_page) lock_page(page); - } spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ @@ -2661,9 +2664,8 @@ out_page_table_lock: if (pagecache_page) { unlock_page(pagecache_page); put_page(pagecache_page); - } else { - unlock_page(page); } + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); @@ -712,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (!ptep) goto out; - if (pte_write(*ptep)) { + if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; swapped = PageSwapCache(page); @@ -735,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at(mm, addr, ptep, entry); goto out_unlock; } - entry = pte_wrprotect(entry); + if (pte_dirty(entry)) + set_page_dirty(page); + entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, addr, ptep, entry); } *orig_pte = *ptep; diff --git a/mm/memblock.c b/mm/memblock.c index 43840b305ec..400dc62697d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -11,237 +11,423 @@ */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/bitops.h> +#include <linux/poison.h> +#include <linux/pfn.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/memblock.h> -#define MEMBLOCK_ALLOC_ANYWHERE 0 +struct memblock memblock __initdata_memblock; -struct memblock memblock; +int memblock_debug __initdata_memblock; +int memblock_can_resize __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; -static int memblock_debug; +/* inline so we don't get a warning when pr_debug is compiled out */ +static inline const char *memblock_type_name(struct memblock_type *type) +{ + if (type == &memblock.memory) + return "memory"; + else if (type == &memblock.reserved) + return "reserved"; + else + return "unknown"; +} -static int __init early_memblock(char *p) +/* + * Address comparison utilities + */ + +static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size) { - if (p && strstr(p, "debug")) - memblock_debug = 1; + return addr & ~(size - 1); +} + +static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size) +{ + return (addr + (size - 1)) & ~(size - 1); +} + +static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); +} + +static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + if (base2 == base1 + size1) + return 1; + else if (base1 == base2 + size2) + return -1; + return 0; } -early_param("memblock", early_memblock); -static void memblock_dump(struct memblock_region *region, char *name) +static long __init_memblock memblock_regions_adjacent(struct memblock_type *type, + unsigned long r1, unsigned long r2) { - unsigned long long base, size; - int i; + phys_addr_t base1 = type->regions[r1].base; + phys_addr_t size1 = type->regions[r1].size; + phys_addr_t base2 = type->regions[r2].base; + phys_addr_t size2 = type->regions[r2].size; - pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + return memblock_addrs_adjacent(base1, size1, base2, size2); +} - for (i = 0; i < region->cnt; i++) { - base = region->region[i].base; - size = region->region[i].size; +long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +{ + unsigned long i; - pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", - name, i, base, base + size - 1, size); + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; + if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + break; } + + return (i < type->cnt) ? i : -1; } -void memblock_dump_all(void) +/* + * Find, allocate, deallocate or reserve unreserved regions. All allocations + * are top-down. + */ + +static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align) { - if (!memblock_debug) - return; + phys_addr_t base, res_base; + long j; - pr_info("MEMBLOCK configuration:\n"); - pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); - pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); + /* In case, huge size is requested */ + if (end < size) + return MEMBLOCK_ERROR; - memblock_dump(&memblock.memory, "memory"); - memblock_dump(&memblock.reserved, "reserved"); + base = memblock_align_down((end - size), align); + + /* Prevent allocations returning 0 as it's also used to + * indicate an allocation failure + */ + if (start == 0) + start = PAGE_SIZE; + + while (start <= base) { + j = memblock_overlaps_region(&memblock.reserved, base, size); + if (j < 0) + return base; + res_base = memblock.reserved.regions[j].base; + if (res_base < size) + break; + base = memblock_align_down(res_base - size, align); + } + + return MEMBLOCK_ERROR; } -static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, - u64 size2) +static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, + phys_addr_t align, phys_addr_t start, phys_addr_t end) { - return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); + long i; + + BUG_ON(0 == size); + + size = memblock_align_up(size, align); + + /* Pump up max_addr */ + if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + end = memblock.current_limit; + + /* We do a top-down search, this tends to limit memory + * fragmentation by keeping early boot allocs near the + * top of memory + */ + for (i = memblock.memory.cnt - 1; i >= 0; i--) { + phys_addr_t memblockbase = memblock.memory.regions[i].base; + phys_addr_t memblocksize = memblock.memory.regions[i].size; + phys_addr_t bottom, top, found; + + if (memblocksize < size) + continue; + if ((memblockbase + memblocksize) <= start) + break; + bottom = max(memblockbase, start); + top = min(memblockbase + memblocksize, end); + if (bottom >= top) + continue; + found = memblock_find_region(bottom, top, size, align); + if (found != MEMBLOCK_ERROR) + return found; + } + return MEMBLOCK_ERROR; } -static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align) { - if (base2 == base1 + size1) - return 1; - else if (base1 == base2 + size2) - return -1; + return memblock_find_base(size, align, start, end); +} - return 0; +/* + * Free memblock.reserved.regions + */ +int __init_memblock memblock_free_reserved_regions(void) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + return memblock_free(__pa(memblock.reserved.regions), + sizeof(struct memblock_region) * memblock.reserved.max); } -static long memblock_regions_adjacent(struct memblock_region *rgn, - unsigned long r1, unsigned long r2) +/* + * Reserve memblock.reserved.regions + */ +int __init_memblock memblock_reserve_reserved_regions(void) { - u64 base1 = rgn->region[r1].base; - u64 size1 = rgn->region[r1].size; - u64 base2 = rgn->region[r2].base; - u64 size2 = rgn->region[r2].size; + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; - return memblock_addrs_adjacent(base1, size1, base2, size2); + return memblock_reserve(__pa(memblock.reserved.regions), + sizeof(struct memblock_region) * memblock.reserved.max); } -static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) +static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { unsigned long i; - for (i = r; i < rgn->cnt - 1; i++) { - rgn->region[i].base = rgn->region[i + 1].base; - rgn->region[i].size = rgn->region[i + 1].size; + for (i = r; i < type->cnt - 1; i++) { + type->regions[i].base = type->regions[i + 1].base; + type->regions[i].size = type->regions[i + 1].size; } - rgn->cnt--; + type->cnt--; } /* Assumption: base addr of region 1 < base addr of region 2 */ -static void memblock_coalesce_regions(struct memblock_region *rgn, +static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, unsigned long r1, unsigned long r2) { - rgn->region[r1].size += rgn->region[r2].size; - memblock_remove_region(rgn, r2); + type->regions[r1].size += type->regions[r2].size; + memblock_remove_region(type, r2); } -void __init memblock_init(void) +/* Defined below but needed now */ +static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); + +static int __init_memblock memblock_double_array(struct memblock_type *type) { - /* Create a dummy zero size MEMBLOCK which will get coalesced away later. - * This simplifies the memblock_add() code below... + struct memblock_region *new_array, *old_array; + phys_addr_t old_size, new_size, addr; + int use_slab = slab_is_available(); + + /* We don't allow resizing until we know about the reserved regions + * of memory that aren't suitable for allocation */ - memblock.memory.region[0].base = 0; - memblock.memory.region[0].size = 0; - memblock.memory.cnt = 1; + if (!memblock_can_resize) + return -1; - /* Ditto. */ - memblock.reserved.region[0].base = 0; - memblock.reserved.region[0].size = 0; - memblock.reserved.cnt = 1; -} + /* Calculate new doubled size */ + old_size = type->max * sizeof(struct memblock_region); + new_size = old_size << 1; + + /* Try to find some space for it. + * + * WARNING: We assume that either slab_is_available() and we use it or + * we use MEMBLOCK for allocations. That means that this is unsafe to use + * when bootmem is currently active (unless bootmem itself is implemented + * on top of MEMBLOCK which isn't the case yet) + * + * This should however not be an issue for now, as we currently only + * call into MEMBLOCK while it's still active, or much later when slab is + * active for memory hotplug operations + */ + if (use_slab) { + new_array = kmalloc(new_size, GFP_KERNEL); + addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array); + } else + addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr == MEMBLOCK_ERROR) { + pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", + memblock_type_name(type), type->max, type->max * 2); + return -1; + } + new_array = __va(addr); -void __init memblock_analyze(void) -{ - int i; + memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", + memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); - memblock.memory.size = 0; + /* Found space, we now need to move the array over before + * we add the reserved region since it may be our reserved + * array itself that is full. + */ + memcpy(new_array, type->regions, old_size); + memset(new_array + type->max, 0, old_size); + old_array = type->regions; + type->regions = new_array; + type->max <<= 1; + + /* If we use SLAB that's it, we are done */ + if (use_slab) + return 0; - for (i = 0; i < memblock.memory.cnt; i++) - memblock.memory.size += memblock.memory.region[i].size; + /* Add the new reserved region now. Should not fail ! */ + BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); + + /* If the array wasn't our static init one, then free it. We only do + * that before SLAB is available as later on, we don't know whether + * to use kfree or free_bootmem_pages(). Shouldn't be a big deal + * anyways + */ + if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) + memblock_free(__pa(old_array), old_size); + + return 0; } -static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) +extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, + phys_addr_t addr2, phys_addr_t size2) +{ + return 1; +} + +static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long coalesced = 0; long adjacent, i; - if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { - rgn->region[0].base = base; - rgn->region[0].size = size; + if ((type->cnt == 1) && (type->regions[0].size == 0)) { + type->regions[0].base = base; + type->regions[0].size = size; return 0; } /* First try and coalesce this MEMBLOCK with another. */ - for (i = 0; i < rgn->cnt; i++) { - u64 rgnbase = rgn->region[i].base; - u64 rgnsize = rgn->region[i].size; + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; if ((rgnbase == base) && (rgnsize == size)) /* Already have this region, so we're done */ return 0; adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); + /* Check if arch allows coalescing */ + if (adjacent != 0 && type == &memblock.memory && + !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) + break; if (adjacent > 0) { - rgn->region[i].base -= size; - rgn->region[i].size += size; + type->regions[i].base -= size; + type->regions[i].size += size; coalesced++; break; } else if (adjacent < 0) { - rgn->region[i].size += size; + type->regions[i].size += size; coalesced++; break; } } - if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { - memblock_coalesce_regions(rgn, i, i+1); + /* If we plugged a hole, we may want to also coalesce with the + * next region + */ + if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && + ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, + type->regions[i].size, + type->regions[i+1].base, + type->regions[i+1].size)))) { + memblock_coalesce_regions(type, i, i+1); coalesced++; } if (coalesced) return coalesced; - if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) + + /* If we are out of space, we fail. It's too late to resize the array + * but then this shouldn't have happened in the first place. + */ + if (WARN_ON(type->cnt >= type->max)) return -1; /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ - for (i = rgn->cnt - 1; i >= 0; i--) { - if (base < rgn->region[i].base) { - rgn->region[i+1].base = rgn->region[i].base; - rgn->region[i+1].size = rgn->region[i].size; + for (i = type->cnt - 1; i >= 0; i--) { + if (base < type->regions[i].base) { + type->regions[i+1].base = type->regions[i].base; + type->regions[i+1].size = type->regions[i].size; } else { - rgn->region[i+1].base = base; - rgn->region[i+1].size = size; + type->regions[i+1].base = base; + type->regions[i+1].size = size; break; } } - if (base < rgn->region[0].base) { - rgn->region[0].base = base; - rgn->region[0].size = size; + if (base < type->regions[0].base) { + type->regions[0].base = base; + type->regions[0].size = size; + } + type->cnt++; + + /* The array is full ? Try to resize it. If that fails, we undo + * our allocation and return an error + */ + if (type->cnt == type->max && memblock_double_array(type)) { + type->cnt--; + return -1; } - rgn->cnt++; return 0; } -long memblock_add(u64 base, u64 size) +long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - struct memblock_region *_rgn = &memblock.memory; - - /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */ - if (base == 0) - memblock.rmo_size = size; - - return memblock_add_region(_rgn, base, size); + return memblock_add_region(&memblock.memory, base, size); } -static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) +static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { - u64 rgnbegin, rgnend; - u64 end = base + size; + phys_addr_t rgnbegin, rgnend; + phys_addr_t end = base + size; int i; rgnbegin = rgnend = 0; /* supress gcc warnings */ /* Find the region where (base, size) belongs to */ - for (i=0; i < rgn->cnt; i++) { - rgnbegin = rgn->region[i].base; - rgnend = rgnbegin + rgn->region[i].size; + for (i=0; i < type->cnt; i++) { + rgnbegin = type->regions[i].base; + rgnend = rgnbegin + type->regions[i].size; if ((rgnbegin <= base) && (end <= rgnend)) break; } /* Didn't find the region */ - if (i == rgn->cnt) + if (i == type->cnt) return -1; /* Check to see if we are removing entire region */ if ((rgnbegin == base) && (rgnend == end)) { - memblock_remove_region(rgn, i); + memblock_remove_region(type, i); return 0; } /* Check to see if region is matching at the front */ if (rgnbegin == base) { - rgn->region[i].base = end; - rgn->region[i].size -= size; + type->regions[i].base = end; + type->regions[i].size -= size; return 0; } /* Check to see if the region is matching at the end */ if (rgnend == end) { - rgn->region[i].size -= size; + type->regions[i].size -= size; return 0; } @@ -249,208 +435,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) * We need to split the entry - adjust the current one to the * beginging of the hole and add the region after hole. */ - rgn->region[i].size = base - rgn->region[i].base; - return memblock_add_region(rgn, end, rgnend - end); + type->regions[i].size = base - type->regions[i].base; + return memblock_add_region(type, end, rgnend - end); } -long memblock_remove(u64 base, u64 size) +long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.memory, base, size); } -long __init memblock_free(u64 base, u64 size) +long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.reserved, base, size); } -long __init memblock_reserve(u64 base, u64 size) +long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { - struct memblock_region *_rgn = &memblock.reserved; + struct memblock_type *_rgn = &memblock.reserved; BUG_ON(0 == size); return memblock_add_region(_rgn, base, size); } -long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) +phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - unsigned long i; + phys_addr_t found; - for (i = 0; i < rgn->cnt; i++) { - u64 rgnbase = rgn->region[i].base; - u64 rgnsize = rgn->region[i].size; - if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) - break; - } + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ + size = memblock_align_up(size, align); - return (i < rgn->cnt) ? i : -1; + found = memblock_find_base(size, align, 0, max_addr); + if (found != MEMBLOCK_ERROR && + memblock_add_region(&memblock.reserved, found, size) >= 0) + return found; + + return 0; } -static u64 memblock_align_down(u64 addr, u64 size) +phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return addr & ~(size - 1); + phys_addr_t alloc; + + alloc = __memblock_alloc_base(size, align, max_addr); + + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); + + return alloc; } -static u64 memblock_align_up(u64 addr, u64 size) +phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) { - return (addr + (size - 1)) & ~(size - 1); + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, - u64 size, u64 align) + +/* + * Additional node-local allocators. Search for node memory is bottom up + * and walks memblock regions within that node bottom-up as well, but allocation + * within an memblock region is top-down. XXX I plan to fix that at some stage + * + * WARNING: Only available after early_node_map[] has been populated, + * on some architectures, that is after all the calls to add_active_range() + * have been done to populate it. + */ + +phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid) { - u64 base, res_base; - long j; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * This code originates from sparc which really wants use to walk by addresses + * and returns the nid. This is not very convenient for early_pfn_map[] users + * as the map isn't sorted yet, and it really wants to be walked by nid. + * + * For now, I implement the inefficient method below which walks the early + * map multiple times. Eventually we may want to use an ARCH config option + * to implement a completely different method for both case. + */ + unsigned long start_pfn, end_pfn; + int i; - base = memblock_align_down((end - size), align); - while (start <= base) { - j = memblock_overlaps_region(&memblock.reserved, base, size); - if (j < 0) { - /* this area isn't reserved, take it */ - if (memblock_add_region(&memblock.reserved, base, size) < 0) - base = ~(u64)0; - return base; - } - res_base = memblock.reserved.region[j].base; - if (res_base < size) - break; - base = memblock_align_down(res_base - size, align); + for (i = 0; i < MAX_NUMNODES; i++) { + get_pfn_range_for_nid(i, &start_pfn, &end_pfn); + if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn)) + continue; + *nid = i; + return min(end, PFN_PHYS(end_pfn)); } +#endif + *nid = 0; - return ~(u64)0; + return end; } -static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, - u64 (*nid_range)(u64, u64, int *), - u64 size, u64 align, int nid) +static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, + phys_addr_t size, + phys_addr_t align, int nid) { - u64 start, end; + phys_addr_t start, end; start = mp->base; end = start + mp->size; start = memblock_align_up(start, align); while (start < end) { - u64 this_end; + phys_addr_t this_end; int this_nid; - this_end = nid_range(start, end, &this_nid); + this_end = memblock_nid_range(start, end, &this_nid); if (this_nid == nid) { - u64 ret = memblock_alloc_nid_unreserved(start, this_end, - size, align); - if (ret != ~(u64)0) + phys_addr_t ret = memblock_find_region(start, this_end, size, align); + if (ret != MEMBLOCK_ERROR && + memblock_add_region(&memblock.reserved, ret, size) >= 0) return ret; } start = this_end; } - return ~(u64)0; + return MEMBLOCK_ERROR; } -u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, - u64 (*nid_range)(u64 start, u64 end, int *nid)) +phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - struct memblock_region *mem = &memblock.memory; + struct memblock_type *mem = &memblock.memory; int i; BUG_ON(0 == size); + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ size = memblock_align_up(size, align); + /* We do a bottom-up search for a region with the right + * nid since that's easier considering how memblock_nid_range() + * works + */ for (i = 0; i < mem->cnt; i++) { - u64 ret = memblock_alloc_nid_region(&mem->region[i], - nid_range, + phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i], size, align, nid); - if (ret != ~(u64)0) + if (ret != MEMBLOCK_ERROR) return ret; } - return memblock_alloc(size, align); -} - -u64 __init memblock_alloc(u64 size, u64 align) -{ - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); + return 0; } -u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) +phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { - u64 alloc; - - alloc = __memblock_alloc_base(size, align, max_addr); + phys_addr_t res = memblock_alloc_nid(size, align, nid); - if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); - - return alloc; + if (res) + return res; + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); } -u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr) -{ - long i, j; - u64 base = 0; - u64 res_base; - - BUG_ON(0 == size); - size = memblock_align_up(size, align); - - /* On some platforms, make sure we allocate lowmem */ - /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */ - if (max_addr == MEMBLOCK_ALLOC_ANYWHERE) - max_addr = MEMBLOCK_REAL_LIMIT; - - for (i = memblock.memory.cnt - 1; i >= 0; i--) { - u64 memblockbase = memblock.memory.region[i].base; - u64 memblocksize = memblock.memory.region[i].size; - - if (memblocksize < size) - continue; - if (max_addr == MEMBLOCK_ALLOC_ANYWHERE) - base = memblock_align_down(memblockbase + memblocksize - size, align); - else if (memblockbase < max_addr) { - base = min(memblockbase + memblocksize, max_addr); - base = memblock_align_down(base - size, align); - } else - continue; - - while (base && memblockbase <= base) { - j = memblock_overlaps_region(&memblock.reserved, base, size); - if (j < 0) { - /* this area isn't reserved, take it */ - if (memblock_add_region(&memblock.reserved, base, size) < 0) - return 0; - return base; - } - res_base = memblock.reserved.region[j].base; - if (res_base < size) - break; - base = memblock_align_down(res_base - size, align); - } - } - return 0; -} +/* + * Remaining API functions + */ /* You must call memblock_analyze() before this. */ -u64 __init memblock_phys_mem_size(void) +phys_addr_t __init memblock_phys_mem_size(void) { - return memblock.memory.size; + return memblock.memory_size; } -u64 memblock_end_of_DRAM(void) +phys_addr_t __init_memblock memblock_end_of_DRAM(void) { int idx = memblock.memory.cnt - 1; - return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } /* You must call memblock_analyze() after this. */ -void __init memblock_enforce_memory_limit(u64 memory_limit) +void __init memblock_enforce_memory_limit(phys_addr_t memory_limit) { unsigned long i; - u64 limit; - struct memblock_property *p; + phys_addr_t limit; + struct memblock_region *p; if (!memory_limit) return; @@ -458,24 +625,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit) /* Truncate the memblock regions to satisfy the memory limit. */ limit = memory_limit; for (i = 0; i < memblock.memory.cnt; i++) { - if (limit > memblock.memory.region[i].size) { - limit -= memblock.memory.region[i].size; + if (limit > memblock.memory.regions[i].size) { + limit -= memblock.memory.regions[i].size; continue; } - memblock.memory.region[i].size = limit; + memblock.memory.regions[i].size = limit; memblock.memory.cnt = i + 1; break; } - if (memblock.memory.region[0].size < memblock.rmo_size) - memblock.rmo_size = memblock.memory.region[0].size; - memory_limit = memblock_end_of_DRAM(); /* And truncate any reserves above the limit also. */ for (i = 0; i < memblock.reserved.cnt; i++) { - p = &memblock.reserved.region[i]; + p = &memblock.reserved.regions[i]; if (p->base > memory_limit) p->size = 0; @@ -489,53 +653,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit) } } -int __init memblock_is_reserved(u64 addr) +static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) +{ + unsigned int left = 0, right = type->cnt; + + do { + unsigned int mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else + return mid; + } while (left < right); + return -1; +} + +int __init memblock_is_reserved(phys_addr_t addr) +{ + return memblock_search(&memblock.reserved, addr) != -1; +} + +int __init_memblock memblock_is_memory(phys_addr_t addr) +{ + return memblock_search(&memblock.memory, addr) != -1; +} + +int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) +{ + int idx = memblock_search(&memblock.reserved, base); + + if (idx == -1) + return 0; + return memblock.reserved.regions[idx].base <= base && + (memblock.reserved.regions[idx].base + + memblock.reserved.regions[idx].size) >= (base + size); +} + +int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) +{ + return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; +} + + +void __init_memblock memblock_set_current_limit(phys_addr_t limit) { + memblock.current_limit = limit; +} + +static void __init_memblock memblock_dump(struct memblock_type *region, char *name) +{ + unsigned long long base, size; int i; - for (i = 0; i < memblock.reserved.cnt; i++) { - u64 upper = memblock.reserved.region[i].base + - memblock.reserved.region[i].size - 1; - if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) - return 1; + pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + + for (i = 0; i < region->cnt; i++) { + base = region->regions[i].base; + size = region->regions[i].size; + + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n", + name, i, base, base + size - 1, size); } - return 0; } -int memblock_is_region_reserved(u64 base, u64 size) +void __init_memblock memblock_dump_all(void) { - return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; + if (!memblock_debug) + return; + + pr_info("MEMBLOCK configuration:\n"); + pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size); + + memblock_dump(&memblock.memory, "memory"); + memblock_dump(&memblock.reserved, "reserved"); } -/* - * Given a <base, len>, find which memory regions belong to this range. - * Adjust the request and return a contiguous chunk. - */ -int memblock_find(struct memblock_property *res) +void __init memblock_analyze(void) { int i; - u64 rstart, rend; - rstart = res->base; - rend = rstart + res->size - 1; + /* Check marker in the unused last array entry */ + WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base + != (phys_addr_t)RED_INACTIVE); + WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base + != (phys_addr_t)RED_INACTIVE); + + memblock.memory_size = 0; + + for (i = 0; i < memblock.memory.cnt; i++) + memblock.memory_size += memblock.memory.regions[i].size; + + /* We allow resizing from there */ + memblock_can_resize = 1; +} + +void __init memblock_init(void) +{ + static int init_done __initdata = 0; + + if (init_done) + return; + init_done = 1; + + /* Hookup the initial arrays */ + memblock.memory.regions = memblock_memory_init_regions; + memblock.memory.max = INIT_MEMBLOCK_REGIONS; + memblock.reserved.regions = memblock_reserved_init_regions; + memblock.reserved.max = INIT_MEMBLOCK_REGIONS; + + /* Write a marker in the unused last array entry */ + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + + /* Create a dummy zero size MEMBLOCK which will get coalesced away later. + * This simplifies the memblock_add() code below... + */ + memblock.memory.regions[0].base = 0; + memblock.memory.regions[0].size = 0; + memblock.memory.cnt = 1; + + /* Ditto. */ + memblock.reserved.regions[0].base = 0; + memblock.reserved.regions[0].size = 0; + memblock.reserved.cnt = 1; + + memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; +} + +static int __init early_memblock(char *p) +{ + if (p && strstr(p, "debug")) + memblock_debug = 1; + return 0; +} +early_param("memblock", early_memblock); + +#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK) + +static int memblock_debug_show(struct seq_file *m, void *private) +{ + struct memblock_type *type = m->private; + struct memblock_region *reg; + int i; + + for (i = 0; i < type->cnt; i++) { + reg = &type->regions[i]; + seq_printf(m, "%4d: ", i); + if (sizeof(phys_addr_t) == 4) + seq_printf(m, "0x%08lx..0x%08lx\n", + (unsigned long)reg->base, + (unsigned long)(reg->base + reg->size - 1)); + else + seq_printf(m, "0x%016llx..0x%016llx\n", + (unsigned long long)reg->base, + (unsigned long long)(reg->base + reg->size - 1)); - for (i = 0; i < memblock.memory.cnt; i++) { - u64 start = memblock.memory.region[i].base; - u64 end = start + memblock.memory.region[i].size - 1; - - if (start > rend) - return -1; - - if ((end >= rstart) && (start < rend)) { - /* adjust the request */ - if (rstart < start) - rstart = start; - if (rend > end) - rend = end; - res->base = rstart; - res->size = rend - rstart + 1; - return 0; - } } - return -1; + return 0; +} + +static int memblock_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, memblock_debug_show, inode->i_private); } + +static const struct file_operations memblock_debug_fops = { + .open = memblock_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init memblock_init_debugfs(void) +{ + struct dentry *root = debugfs_create_dir("memblock", NULL); + if (!root) + return -ENXIO; + debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); + debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); + + return 0; +} +__initcall(memblock_init_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3eed583895a..9be3cf8a5da 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3587,9 +3587,13 @@ unlock: static void mem_cgroup_threshold(struct mem_cgroup *memcg) { - __mem_cgroup_threshold(memcg, false); - if (do_swap_account) - __mem_cgroup_threshold(memcg, true); + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_swap_account) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } } static int compare_thresholds(const void *a, const void *b) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9c26eeca134..757f6b0accf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter); * signal. */ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, - unsigned long pfn) + unsigned long pfn, struct page *page) { struct siginfo si; int ret; @@ -198,7 +198,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -235,7 +235,7 @@ void shake_page(struct page *p, int access) int nr; do { nr = shrink_slab(1000, GFP_KERNEL, 1000); - if (page_count(p) == 0) + if (page_count(p) == 1) break; } while (nr > 10); } @@ -327,7 +327,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, - int fail, unsigned long pfn) + int fail, struct page *page, unsigned long pfn) { struct to_kill *tk, *next; @@ -352,7 +352,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, * process anyways. */ else if (kill_proc_ao(tk->tsk, tk->addr, trapno, - pfn) < 0) + pfn, page) < 0) printk(KERN_ERR "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); @@ -928,7 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, - ret != SWAP_SUCCESS, pfn); + ret != SWAP_SUCCESS, p, pfn); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 71b161b73bb..98b58fecede 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2680,10 +2680,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_clear_flag(DELAYACCT_PF_SWAPIN); /* - * Make sure try_to_free_swap didn't release the swapcache - * from under us. The page pin isn't enough to prevent that. + * Make sure try_to_free_swap or reuse_swap_page or swapoff did not + * release the swapcache from under us. The page pin, and pte_same + * test below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not changed. */ - if (unlikely(!PageSwapCache(page))) + if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; if (ksm_might_need_to_copy(page, vma, address)) { @@ -3183,7 +3185,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * with threads. */ if (flags & FAULT_FLAG_WRITE) - flush_tlb_page(vma, address); + flush_tlb_fix_spurious_fault(vma, address); } unlock: pte_unmap_unlock(pte, ptl); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dd186c1a5d5..d4e940a2694 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -840,7 +840,6 @@ repeat: ret = 0; if (drain) { lru_add_drain_all(); - flush_scheduled_work(); cond_resched(); drain_all_pages(); } @@ -862,7 +861,6 @@ repeat: } /* drain all zone's lru pagevec, this is asyncronous... */ lru_add_drain_all(); - flush_scheduled_work(); yield(); /* drain pcp pages , this is synchrouns. */ drain_all_pages(); diff --git a/mm/mmap.c b/mm/mmap.c index 6128dc8e5ed..00161a48a45 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2009,6 +2009,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, removed_exe_file_vma(mm); fput(new->vm_file); } + unlink_anon_vmas(new); out_free_mpol: mpol_put(pol); out_free_vma: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fc81cb22869..4029583a102 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -121,8 +121,8 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) } /* return true if the task is not adequate as candidate victim task. */ -static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem, - const nodemask_t *nodemask) +static bool oom_unkillable_task(struct task_struct *p, + const struct mem_cgroup *mem, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -208,8 +208,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, */ points += p->signal->oom_score_adj; - if (points < 0) - return 0; + /* + * Never return 0 for an eligible task that may be killed since it's + * possible that no single user task uses more than 0.1% of memory and + * no single admin tasks uses more than 3.0%. + */ + if (points <= 0) + return 1; return (points < 1000) ? points : 1000; } @@ -339,26 +344,24 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, /** * dump_tasks - dump current memory state of all system tasks * @mem: current's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms * - * Dumps the current memory state of all system tasks, excluding kernel threads. + * Dumps the current memory state of all eligible tasks. Tasks not in the same + * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes + * are not shown. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj * value, oom_score_adj value, and name. * - * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are - * shown. - * * Call with tasklist_lock read-locked. */ -static void dump_tasks(const struct mem_cgroup *mem) +static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); for_each_process(p) { - if (p->flags & PF_KTHREAD) - continue; - if (mem && !task_in_mem_cgroup(p, mem)) + if (oom_unkillable_task(p, mem, nodemask)) continue; task = find_lock_task_mm(p); @@ -381,7 +384,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *mem) + struct mem_cgroup *mem, const nodemask_t *nodemask) { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " @@ -394,7 +397,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, mem_cgroup_print_oom_info(mem, p); show_mem(); if (sysctl_oom_dump_tasks) - dump_tasks(mem); + dump_tasks(mem, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -436,7 +439,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int victim_points = 0; if (printk_ratelimit()) - dump_header(p, gfp_mask, order, mem); + dump_header(p, gfp_mask, order, mem, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -482,7 +485,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order) + int order, const nodemask_t *nodemask) { if (likely(!sysctl_panic_on_oom)) return; @@ -496,7 +499,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, return; } read_lock(&tasklist_lock); - dump_header(NULL, gfp_mask, order, NULL); + dump_header(NULL, gfp_mask, order, NULL, nodemask); read_unlock(&tasklist_lock); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); @@ -509,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) unsigned int points = 0; struct task_struct *p; - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: @@ -641,6 +644,7 @@ static void clear_system_oom(void) void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask) { + const nodemask_t *mpol_mask; struct task_struct *p; unsigned long totalpages; unsigned long freed = 0; @@ -670,7 +674,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ constraint = constrained_alloc(zonelist, gfp_mask, nodemask, &totalpages); - check_panic_on_oom(constraint, gfp_mask, order); + mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && @@ -688,15 +693,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, } retry: - p = select_bad_process(&points, totalpages, NULL, - constraint == CONSTRAINT_MEMORY_POLICY ? nodemask : - NULL); + p = select_bad_process(&points, totalpages, NULL, mpol_mask); if (PTR_ERR(p) == -1UL) goto out; /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { - dump_header(NULL, gfp_mask, order, NULL); + dump_header(NULL, gfp_mask, order, NULL, mpol_mask); read_unlock(&tasklist_lock); panic("Out of memory and no killable processes...\n"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8cfa9cc6e8..2a362c52fdf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -21,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kmemcheck.h> @@ -3636,6 +3637,41 @@ void __init free_bootmem_with_active_regions(int nid, } } +#ifdef CONFIG_HAVE_MEMBLOCK +u64 __init find_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + + /* Need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + u64 final_start, final_end; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + + final_start = max(ei_start, goal); + final_end = min(ei_last, limit); + + if (final_start >= final_end) + continue; + + addr = memblock_find_in_range(final_start, final_end, size, align); + + if (addr == MEMBLOCK_ERROR) + continue; + + return addr; + } + + return MEMBLOCK_ERROR; +} +#endif + int __init add_from_early_node_map(struct range *range, int az, int nr_range, int nid) { @@ -3655,46 +3691,26 @@ int __init add_from_early_node_map(struct range *range, int az, void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { - int i; void *ptr; + u64 addr; - if (limit > get_max_mapped()) - limit = get_max_mapped(); - - /* need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { - u64 addr; - u64 ei_start, ei_last; - - ei_last = early_node_map[i].end_pfn; - ei_last <<= PAGE_SHIFT; - ei_start = early_node_map[i].start_pfn; - ei_start <<= PAGE_SHIFT; - addr = find_early_area(ei_start, ei_last, - goal, limit, size, align); - - if (addr == -1ULL) - continue; + if (limit > memblock.current_limit) + limit = memblock.current_limit; -#if 0 - printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", - nid, - ei_start, ei_last, goal, limit, size, - align, addr); -#endif + addr = find_memory_core_early(nid, size, align, goal, limit); - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - reserve_early_without_check(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; - } + if (addr == MEMBLOCK_ERROR) + return NULL; - return NULL; + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; } #endif @@ -5182,9 +5198,9 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", + printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", tablename, - (1U << log2qty), + (1UL << log2qty), ilog2(size) - PAGE_SHIFT, size); diff --git a/mm/percpu-km.c b/mm/percpu-km.c index df680855540..89633fefc6a 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -27,7 +27,7 @@ * chunk size is not aligned. percpu-km code will whine about it. */ -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #error "contiguous percpu allocation is incompatible with paged first chunk" #endif @@ -35,7 +35,11 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - /* noop */ + unsigned int cpu; + + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + return 0; } diff --git a/mm/percpu.c b/mm/percpu.c index 58c572b18b0..efe816856a9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -31,7 +31,7 @@ * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be eqaul to or larger than the maximum contiguous + * guaranteed to be equal to or larger than the maximum contiguous * area in the chunk. This helps the allocator not to iterate the * chunk maps unnecessarily. * @@ -76,6 +76,7 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ @@ -89,6 +90,11 @@ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif +#else /* CONFIG_SMP */ +/* on UP, it's always identity mapped */ +#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) +#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) +#endif /* CONFIG_SMP */ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ @@ -820,8 +826,8 @@ fail_unlock_mutex: * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. + * Allocate zero-filled percpu area of @size bytes aligned at @align. + * Might sleep. Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -840,9 +846,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * - * Allocate percpu area of @size bytes aligned at @align from reserved - * percpu area if arch has set it up; otherwise, allocation is served - * from the same dynamic area. Might sleep. Might trigger writeouts. + * Allocate zero-filled percpu area of @size bytes aligned at @align + * from reserved percpu area if arch has set it up; otherwise, + * allocation is served from the same dynamic area. Might sleep. + * Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -949,6 +956,7 @@ EXPORT_SYMBOL_GPL(free_percpu); */ bool is_kernel_percpu_address(unsigned long addr) { +#ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); unsigned int cpu; @@ -959,6 +967,8 @@ bool is_kernel_percpu_address(unsigned long addr) if ((void *)addr >= start && (void *)addr < start + static_size) return true; } +#endif + /* on UP, can't distinguish from other static vars, always false */ return false; } @@ -1067,161 +1077,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) } /** - * pcpu_build_alloc_info - build alloc_info considering distances between CPUs - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: minimum free size for dynamic allocation in bytes - * @atom_size: allocation atom size - * @cpu_distance_fn: callback to determine distance between cpus, optional - * - * This function determines grouping of units, their mappings to cpus - * and other parameters considering needed percpu size, allocation - * atom size and distances between CPUs. - * - * Groups are always mutliples of atom size and CPUs which are of - * LOCAL_DISTANCE both ways are grouped together and share space for - * units in the same group. The returned configuration is guaranteed - * to have CPUs on different nodes on different groups and >=75% usage - * of allocated virtual address space. - * - * RETURNS: - * On success, pointer to the new allocation_info is returned. On - * failure, ERR_PTR value is returned. - */ -static struct pcpu_alloc_info * __init pcpu_build_alloc_info( - size_t reserved_size, size_t dyn_size, - size_t atom_size, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - static int group_map[NR_CPUS] __initdata; - static int group_cnt[NR_CPUS] __initdata; - const size_t static_size = __per_cpu_end - __per_cpu_start; - int nr_groups = 1, nr_units = 0; - size_t size_sum, min_unit_size, alloc_size; - int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs, group, unit; - unsigned int cpu, tcpu; - struct pcpu_alloc_info *ai; - unsigned int *cpu_map; - - /* this function may be called multiple times */ - memset(group_map, 0, sizeof(group_map)); - memset(group_cnt, 0, sizeof(group_cnt)); - - /* calculate size_sum and ensure dyn_size is enough for early alloc */ - size_sum = PFN_ALIGN(static_size + reserved_size + - max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); - dyn_size = size_sum - static_size - reserved_size; - - /* - * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of atom_size and is the smallest - * which can accomodate 4k aligned segments which are equal to - * or larger than min_unit_size. - */ - min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - - alloc_size = roundup(min_unit_size, atom_size); - upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - upa--; - max_upa = upa; - - /* group cpus according to their proximity */ - for_each_possible_cpu(cpu) { - group = 0; - next_group: - for_each_possible_cpu(tcpu) { - if (cpu == tcpu) - break; - if (group_map[tcpu] == group && cpu_distance_fn && - (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || - cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { - group++; - nr_groups = max(nr_groups, group + 1); - goto next_group; - } - } - group_map[cpu] = group; - group_cnt[group]++; - } - - /* - * Expand unit size until address space usage goes over 75% - * and then as much as possible without using more address - * space. - */ - last_allocs = INT_MAX; - for (upa = max_upa; upa; upa--) { - int allocs = 0, wasted = 0; - - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - continue; - - for (group = 0; group < nr_groups; group++) { - int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); - allocs += this_allocs; - wasted += this_allocs * upa - group_cnt[group]; - } - - /* - * Don't accept if wastage is over 1/3. The - * greater-than comparison ensures upa==1 always - * passes the following check. - */ - if (wasted > num_possible_cpus() / 3) - continue; - - /* and then don't consume more memory */ - if (allocs > last_allocs) - break; - last_allocs = allocs; - best_upa = upa; - } - upa = best_upa; - - /* allocate and fill alloc_info */ - for (group = 0; group < nr_groups; group++) - nr_units += roundup(group_cnt[group], upa); - - ai = pcpu_alloc_alloc_info(nr_groups, nr_units); - if (!ai) - return ERR_PTR(-ENOMEM); - cpu_map = ai->groups[0].cpu_map; - - for (group = 0; group < nr_groups; group++) { - ai->groups[group].cpu_map = cpu_map; - cpu_map += roundup(group_cnt[group], upa); - } - - ai->static_size = static_size; - ai->reserved_size = reserved_size; - ai->dyn_size = dyn_size; - ai->unit_size = alloc_size / upa; - ai->atom_size = atom_size; - ai->alloc_size = alloc_size; - - for (group = 0, unit = 0; group_cnt[group]; group++) { - struct pcpu_group_info *gi = &ai->groups[group]; - - /* - * Initialize base_offset as if all groups are located - * back-to-back. The caller should update this to - * reflect actual allocation. - */ - gi->base_offset = unit * ai->unit_size; - - for_each_possible_cpu(cpu) - if (group_map[cpu] == group) - gi->cpu_map[gi->nr_units++] = cpu; - gi->nr_units = roundup(gi->nr_units, upa); - unit += gi->nr_units; - } - BUG_ON(unit != nr_units); - - return ai; -} - -/** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump @@ -1363,7 +1218,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* sanity checks */ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); +#ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); +#endif PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); @@ -1401,9 +1258,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; + pcpu_last_unit_cpu = cpu; } } - pcpu_last_unit_cpu = cpu; pcpu_nr_units = unit; for_each_possible_cpu(cpu) @@ -1488,6 +1345,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, return 0; } +#ifdef CONFIG_SMP + const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", @@ -1515,8 +1374,180 @@ static int __init percpu_alloc_setup(char *str) } early_param("percpu_alloc", percpu_alloc_setup); +/* + * pcpu_embed_first_chunk() is used by the generic percpu setup. + * Build it if needed by the arch config or the generic setup is going + * to be used. + */ #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) +#define BUILD_EMBED_FIRST_CHUNK +#endif + +/* build pcpu_page_first_chunk() iff needed by the arch config */ +#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) +#define BUILD_PAGE_FIRST_CHUNK +#endif + +/* pcpu_build_alloc_info() is used by both embed and page first chunk */ +#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: minimum free size for dynamic allocation in bytes + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. + */ +static struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, size_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int nr_groups = 1, nr_units = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs, group, unit; + unsigned int cpu, tcpu; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; + + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_cnt)); + + /* calculate size_sum and ensure dyn_size is enough for early alloc */ + size_sum = PFN_ALIGN(static_size + reserved_size + + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); + dyn_size = size_sum - static_size - reserved_size; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of atom_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, atom_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && cpu_distance_fn && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + nr_groups = max(nr_groups, group + 1); + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group < nr_groups; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 1/3. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; + + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; +} +#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ + +#if defined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes @@ -1645,10 +1676,9 @@ out_free: free_bootmem(__pa(areas), areas_size); return rc; } -#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || - !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#endif /* BUILD_EMBED_FIRST_CHUNK */ -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +#ifdef BUILD_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes @@ -1756,10 +1786,11 @@ out_free_ar: pcpu_free_alloc_info(ai); return rc; } -#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ +#endif /* BUILD_PAGE_FIRST_CHUNK */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* - * Generic percpu area setup. + * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is @@ -1770,7 +1801,6 @@ out_free_ar: * on the physical linear memory mapping which uses large page * mappings on applicable archs. */ -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); @@ -1799,13 +1829,48 @@ void __init setup_per_cpu_areas(void) PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) - panic("Failed to initialized percpu areas."); + panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +#else /* CONFIG_SMP */ + +/* + * UP percpu area setup. + * + * UP always uses km-based percpu allocator with identity mapping. + * Static percpu variables are indistinguishable from the usual static + * variables and don't require any special preparation. + */ +void __init setup_per_cpu_areas(void) +{ + const size_t unit_size = + roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, + PERCPU_DYNAMIC_RESERVE)); + struct pcpu_alloc_info *ai; + void *fc; + + ai = pcpu_alloc_alloc_info(1, 1); + fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!ai || !fc) + panic("Failed to allocate memory for percpu areas."); + + ai->dyn_size = unit_size; + ai->unit_size = unit_size; + ai->atom_size = unit_size; + ai->alloc_size = unit_size; + ai->groups[0].nr_units = 1; + ai->groups[0].cpu_map[0] = 0; + + if (pcpu_setup_first_chunk(ai, fc) < 0) + panic("Failed to initialize percpu areas."); +} + +#endif /* CONFIG_SMP */ /* * First and reserved chunks are initialized with temporary allocation diff --git a/mm/percpu_up.c b/mm/percpu_up.c deleted file mode 100644 index db884fae572..00000000000 --- a/mm/percpu_up.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * mm/percpu_up.c - dummy percpu memory allocator implementation for UP - */ - -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/slab.h> - -void __percpu *__alloc_percpu(size_t size, size_t align) -{ - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - return (void __percpu __force *)kzalloc(size, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -void free_percpu(void __percpu *p) -{ - kfree(this_cpu_ptr(p)); -} -EXPORT_SYMBOL_GPL(free_percpu); - -phys_addr_t per_cpu_ptr_to_phys(void *addr) -{ - return __pa(addr); -} diff --git a/mm/rmap.c b/mm/rmap.c index f6f0d2dda2e..5f17fad1bee 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -381,7 +381,13 @@ vma_address(struct page *page, struct vm_area_struct *vma) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { if (PageAnon(page)) { - if (vma->anon_vma->root != page_anon_vma(page)->root) + struct anon_vma *page__anon_vma = page_anon_vma(page); + /* + * Note: swapoff's unuse_vma() is more efficient with this + * check, and needs it to match anon_vma when KSM is active. + */ + if (!vma->anon_vma || !page__anon_vma || + vma->anon_vma->root != page__anon_vma->root) return -EFAULT; } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { if (!vma->vm_file || @@ -739,7 +745,7 @@ int page_mkclean(struct page *page) if (mapping) { ret = page_mkclean_file(mapping, page); if (page_test_dirty(page)) { - page_clear_dirty(page); + page_clear_dirty(page, 1); ret = 1; } } @@ -936,7 +942,7 @@ void page_remove_rmap(struct page *page) * containing the swap entry, but page not yet written to swap. */ if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { - page_clear_dirty(page); + page_clear_dirty(page, 1); set_page_dirty(page); } /* @@ -1564,13 +1570,14 @@ static void __hugepage_set_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { struct anon_vma *anon_vma = vma->anon_vma; + BUG_ON(!anon_vma); - if (!exclusive) { - struct anon_vma_chain *avc; - avc = list_entry(vma->anon_vma_chain.prev, - struct anon_vma_chain, same_vma); - anon_vma = avc->anon_vma; - } + + if (PageAnon(page)) + return; + if (!exclusive) + anon_vma = anon_vma->root; + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; page->index = linear_page_index(vma, address); @@ -1581,6 +1588,8 @@ void hugepage_add_anon_rmap(struct page *page, { struct anon_vma *anon_vma = vma->anon_vma; int first; + + BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); BUG_ON(address < vma->vm_start || address >= vma->vm_end); first = atomic_inc_and_test(&page->_mapcount); diff --git a/mm/slob.c b/mm/slob.c index d582171c810..617b6d6c42c 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -500,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } else { unsigned int order = get_order(size); - ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); + if (likely(order)) + gfp |= __GFP_COMP; + ret = slob_new_pages(gfp, order, node); if (ret) { struct page *page; page = virt_to_page(ret); diff --git a/mm/slub.c b/mm/slub.c index 13fffe1f0f3..8fd5401bb07 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -168,7 +168,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ -#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */ static int kmem_size = sizeof(struct kmem_cache); @@ -178,7 +177,7 @@ static struct notifier_block slab_notifier; static enum { DOWN, /* No slab functionality available */ - PARTIAL, /* kmem_cache_open() works but kmalloc does not */ + PARTIAL, /* Kmem_cache_node works */ UP, /* Everything works but does not show up in sysfs */ SYSFS /* Sysfs up */ } slab_state = DOWN; @@ -199,7 +198,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); @@ -210,6 +209,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { + kfree(s->name); kfree(s); } @@ -233,11 +233,7 @@ int slab_is_available(void) static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { -#ifdef CONFIG_NUMA return s->node[node]; -#else - return &s->local_node; -#endif } /* Verify that a pointer has an address that is valid within a slab page */ @@ -494,7 +490,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) dump_stack(); } -static void init_object(struct kmem_cache *s, void *object, int active) +static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; @@ -504,9 +500,7 @@ static void init_object(struct kmem_cache *s, void *object, int active) } if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, - s->inuse - s->objsize); + memset(p + s->objsize, val, s->inuse - s->objsize); } static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) @@ -641,17 +635,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) } static int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) + void *object, u8 val) { u8 *p = object; u8 *endobject = object + s->objsize; if (s->flags & SLAB_RED_ZONE) { - unsigned int red = - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; - if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, red, s->inuse - s->objsize)) + endobject, val, s->inuse - s->objsize)) return 0; } else { if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { @@ -661,7 +652,7 @@ static int check_object(struct kmem_cache *s, struct page *page, } if (s->flags & SLAB_POISON) { - if (!active && (s->flags & __OBJECT_POISON) && + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, POISON_FREE, s->objsize - 1) || !check_bytes_and_report(s, page, p, "Poison", @@ -673,7 +664,7 @@ static int check_object(struct kmem_cache *s, struct page *page, check_pad_bytes(s, page, p); } - if (!s->offset && active) + if (!s->offset && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. @@ -792,6 +783,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } /* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->objsize, flags, s->flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) +{ + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); +} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} + +static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) +{ + kmemcheck_slab_free(s, object, s->objsize); + debug_check_no_locks_freed(object, s->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, s->objsize); +} + +/* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache_node *n, struct page *page) @@ -838,7 +862,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). */ - if (!NUMA_BUILD || n) { + if (n) { atomic_long_inc(&n->nr_slabs); atomic_long_add(objects, &n->total_objects); } @@ -858,11 +882,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) return; - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); init_tracking(s, object); } -static int alloc_debug_processing(struct kmem_cache *s, struct page *page, +static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) @@ -878,14 +902,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, goto bad; } - if (!check_object(s, page, object, 0)) + if (!check_object(s, page, object, SLUB_RED_INACTIVE)) goto bad; /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_ALLOC, addr); trace(s, page, object, 1); - init_object(s, object, 1); + init_object(s, object, SLUB_RED_ACTIVE); return 1; bad: @@ -902,8 +926,8 @@ bad: return 0; } -static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, unsigned long addr) +static noinline int free_debug_processing(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -918,7 +942,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, goto fail; } - if (!check_object(s, page, object, 1)) + if (!check_object(s, page, object, SLUB_RED_ACTIVE)) return 0; if (unlikely(s != page->slab)) { @@ -942,7 +966,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); return 1; fail: @@ -1046,7 +1070,7 @@ static inline int free_debug_processing(struct kmem_cache *s, static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) { return 1; } + void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, @@ -1066,7 +1090,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -#endif + +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) + { return 0; } + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + void *object) {} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) {} + +static inline void slab_free_hook_irq(struct kmem_cache *s, + void *object) {} + +#endif /* CONFIG_SLUB_DEBUG */ /* * Slab allocation and freeing @@ -1194,7 +1230,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) slab_pad_check(s, page); for_each_object(p, s, page_address(page), page->objects) - check_object(s, page, p, 0); + check_object(s, page, p, SLUB_RED_INACTIVE); } kmemcheck_free_shadow(page, compound_order(page)); @@ -1274,13 +1310,19 @@ static void add_partial(struct kmem_cache_node *n, spin_unlock(&n->list_lock); } +static inline void __remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + list_del(&page->lru); + n->nr_partial--; +} + static void remove_partial(struct kmem_cache *s, struct page *page) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); spin_lock(&n->list_lock); - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); spin_unlock(&n->list_lock); } @@ -1293,8 +1335,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); __SetPageSlubFrozen(page); return 1; } @@ -1405,6 +1446,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) * On exit the slab lock will have been dropped. */ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) + __releases(bitlock) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -1447,6 +1489,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) + __releases(bitlock) { struct page *page = c->page; int tail = 1; @@ -1647,6 +1690,7 @@ new_slab: goto load_freelist; } + gfpflags &= gfp_allowed_mask; if (gfpflags & __GFP_WAIT) local_irq_enable(); @@ -1674,7 +1718,7 @@ debug: c->page->inuse++; c->page->freelist = get_freepointer(s, object); - c->node = -1; + c->node = NUMA_NO_NODE; goto unlock_out; } @@ -1695,12 +1739,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; - gfpflags &= gfp_allowed_mask; - - lockdep_trace_alloc(gfpflags); - might_sleep_if(gfpflags & __GFP_WAIT); - - if (should_failslab(s->objsize, gfpflags, s->flags)) + if (slab_pre_alloc_hook(s, gfpflags)) return NULL; local_irq_save(flags); @@ -1719,8 +1758,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->objsize); - kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); - kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); + slab_post_alloc_hook(s, gfpflags, object); return object; } @@ -1754,7 +1792,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif #ifdef CONFIG_TRACING void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, @@ -1765,6 +1802,7 @@ void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, } EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); #endif +#endif /* * Slow patch handling. This may still be called frequently since objects @@ -1850,14 +1888,14 @@ static __always_inline void slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; - kmemleak_free_recursive(x, s->flags); + slab_free_hook(s, x); + local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); - if (likely(page == c->page && c->node >= 0)) { + + slab_free_hook_irq(s, x); + + if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); c->freelist = object; stat(s, FREE_FASTPATH); @@ -2062,26 +2100,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) #endif } -static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); - -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { - if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) - /* - * Boot time creation of the kmalloc array. Use static per cpu data - * since the per cpu allocator is not available yet. - */ - s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); - else - s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); + BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < + SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); - if (!s->cpu_slab) - return 0; + s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); - return 1; + return s->cpu_slab != NULL; } -#ifdef CONFIG_NUMA +static struct kmem_cache *kmem_cache_node; + /* * No kmalloc_node yet so do it by hand. We know that this is the first * slab on the node for this slabcache. There are no concurrent accesses @@ -2091,15 +2121,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ -static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) +static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; unsigned long flags; - BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); + BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags, node); + page = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!page); if (page_to_nid(page) != node) { @@ -2111,15 +2141,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) n = page->freelist; BUG_ON(!n); - page->freelist = get_freepointer(kmalloc_caches, n); + page->freelist = get_freepointer(kmem_cache_node, n); page->inuse++; - kmalloc_caches->node[node] = n; + kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG - init_object(kmalloc_caches, n, 1); - init_tracking(kmalloc_caches, n); + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n, kmalloc_caches); - inc_slabs_node(kmalloc_caches, node, page->objects); + init_kmem_cache_node(n, kmem_cache_node); + inc_slabs_node(kmem_cache_node, node, page->objects); /* * lockdep requires consistent irq usage for each lock @@ -2137,13 +2167,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; + if (n) - kmem_cache_free(kmalloc_caches, n); + kmem_cache_free(kmem_cache_node, n); + s->node[node] = NULL; } } -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; @@ -2151,11 +2183,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) struct kmem_cache_node *n; if (slab_state == DOWN) { - early_kmem_cache_node_alloc(gfpflags, node); + early_kmem_cache_node_alloc(node); continue; } - n = kmem_cache_alloc_node(kmalloc_caches, - gfpflags, node); + n = kmem_cache_alloc_node(kmem_cache_node, + GFP_KERNEL, node); if (!n) { free_kmem_cache_nodes(s); @@ -2167,17 +2199,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } return 1; } -#else -static void free_kmem_cache_nodes(struct kmem_cache *s) -{ -} - -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) -{ - init_kmem_cache_node(&s->local_node, s); - return 1; -} -#endif static void set_min_partial(struct kmem_cache *s, unsigned long min) { @@ -2312,7 +2333,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) } -static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, +static int kmem_cache_open(struct kmem_cache *s, const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) @@ -2348,10 +2369,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif - if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + if (!init_kmem_cache_nodes(s)) goto error; - if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) + if (alloc_kmem_cache_cpus(s)) return 1; free_kmem_cache_nodes(s); @@ -2414,9 +2435,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), - GFP_ATOMIC); - + unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * + sizeof(long), GFP_ATOMIC); if (!map) return; slab_err(s, page, "%s", text); @@ -2448,9 +2468,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - list_del(&page->lru); + __remove_partial(n, page); discard_slab(s, page); - n->nr_partial--; } else { list_slab_objects(s, page, "Objects remaining on kmem_cache_close()"); @@ -2507,9 +2526,15 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; +struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; EXPORT_SYMBOL(kmalloc_caches); +static struct kmem_cache *kmem_cache; + +#ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; +#endif + static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2546,116 +2571,29 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, - const char *name, int size, gfp_t gfp_flags) +static struct kmem_cache *__init create_kmalloc_cache(const char *name, + int size, unsigned int flags) { - unsigned int flags = 0; + struct kmem_cache *s; - if (gfp_flags & SLUB_DMA) - flags = SLAB_CACHE_DMA; + s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); /* * This function is called with IRQs disabled during early-boot on * single CPU so there's no need to take slub_lock here. */ - if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, + if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL)) goto panic; list_add(&s->list, &slab_caches); - - if (sysfs_slab_add(s)) - goto panic; return s; panic: panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); + return NULL; } -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; - -static void sysfs_add_func(struct work_struct *w) -{ - struct kmem_cache *s; - - down_write(&slub_lock); - list_for_each_entry(s, &slab_caches, list) { - if (s->flags & __SYSFS_ADD_DEFERRED) { - s->flags &= ~__SYSFS_ADD_DEFERRED; - sysfs_slab_add(s); - } - } - up_write(&slub_lock); -} - -static DECLARE_WORK(sysfs_add_work, sysfs_add_func); - -static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) -{ - struct kmem_cache *s; - char *text; - size_t realsize; - unsigned long slabflags; - int i; - - s = kmalloc_caches_dma[index]; - if (s) - return s; - - /* Dynamically create dma cache */ - if (flags & __GFP_WAIT) - down_write(&slub_lock); - else { - if (!down_write_trylock(&slub_lock)) - goto out; - } - - if (kmalloc_caches_dma[index]) - goto unlock_out; - - realsize = kmalloc_caches[index].objsize; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - - s = NULL; - for (i = 0; i < KMALLOC_CACHES; i++) - if (!kmalloc_caches[i].size) - break; - - BUG_ON(i >= KMALLOC_CACHES); - s = kmalloc_caches + i; - - /* - * Must defer sysfs creation to a workqueue because we don't know - * what context we are called from. Before sysfs comes up, we don't - * need to do anything because our sysfs initcall will start by - * adding all existing slabs to sysfs. - */ - slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; - if (slab_state >= SYSFS) - slabflags |= __SYSFS_ADD_DEFERRED; - - if (!text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { - s->size = 0; - kfree(text); - goto unlock_out; - } - - list_add(&s->list, &slab_caches); - kmalloc_caches_dma[index] = s; - - if (slab_state >= SYSFS) - schedule_work(&sysfs_add_work); - -unlock_out: - up_write(&slub_lock); -out: - return kmalloc_caches_dma[index]; -} -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -2708,10 +2646,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) #ifdef CONFIG_ZONE_DMA if (unlikely((flags & SLUB_DMA))) - return dma_kmalloc_cache(index, flags); + return kmalloc_dma_caches[index]; #endif - return &kmalloc_caches[index]; + return kmalloc_caches[index]; } void *__kmalloc(size_t size, gfp_t flags) @@ -2735,6 +2673,7 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); +#ifdef CONFIG_NUMA static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; @@ -2749,7 +2688,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; @@ -2889,8 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * may have freed the last object and be * waiting to release the slab. */ - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); slab_unlock(page); discard_slab(s, page); } else { @@ -2914,7 +2851,7 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +#if defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; @@ -2956,7 +2893,7 @@ static void slab_mem_offline_callback(void *arg) BUG_ON(slabs_node(s, offline_node)); s->node[offline_node] = NULL; - kmem_cache_free(kmalloc_caches, n); + kmem_cache_free(kmem_cache_node, n); } } up_read(&slub_lock); @@ -2989,7 +2926,7 @@ static int slab_mem_going_online_callback(void *arg) * since memory is not yet available from the node that * is brought up. */ - n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); + n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { ret = -ENOMEM; goto out; @@ -3035,46 +2972,92 @@ static int slab_memory_callback(struct notifier_block *self, * Basic setup of slabs *******************************************************************/ +/* + * Used for early kmem_cache structures that were allocated using + * the page allocator + */ + +static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +{ + int node; + + list_add(&s->list, &slab_caches); + s->refcount = -1; + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + struct page *p; + + if (n) { + list_for_each_entry(p, &n->partial, lru) + p->slab = s; + +#ifdef CONFIG_SLAB_DEBUG + list_for_each_entry(p, &n->full, lru) + p->slab = s; +#endif + } + } +} + void __init kmem_cache_init(void) { int i; int caches = 0; + struct kmem_cache *temp_kmem_cache; + int order; + struct kmem_cache *temp_kmem_cache_node; + unsigned long kmalloc_size; + + kmem_size = offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *); + + /* Allocate two kmem_caches from the page allocator */ + kmalloc_size = ALIGN(kmem_size, cache_line_size()); + order = get_order(2 * kmalloc_size); + kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); -#ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the * struct kmem_cache_node's. There is special bootstrap code in * kmem_cache_open for slab_state == DOWN. */ - create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_NOWAIT); - kmalloc_caches[0].refcount = -1; - caches++; + kmem_cache_node = (void *)kmem_cache + kmalloc_size; + + kmem_cache_open(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); -#endif /* Able to allocate the per node structures */ slab_state = PARTIAL; - /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 32) { - create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_NOWAIT); - caches++; - } - if (KMALLOC_MIN_SIZE <= 64) { - create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_NOWAIT); - caches++; - } + temp_kmem_cache = kmem_cache; + kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache, temp_kmem_cache, kmem_size); - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_NOWAIT); - caches++; - } + /* + * Allocate kmem_cache_node properly from the kmem_cache slab. + * kmem_cache_node is separately allocated so no need to + * update any list pointers. + */ + temp_kmem_cache_node = kmem_cache_node; + + kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); + + kmem_cache_bootstrap_fixup(kmem_cache_node); + caches++; + kmem_cache_bootstrap_fixup(kmem_cache); + caches++; + /* Free temporary boot structure */ + free_pages((unsigned long)temp_kmem_cache, order); + + /* Now we can use the kmem_cache to allocate kmalloc slabs */ /* * Patch up the size_index table if we have strange large alignment @@ -3114,26 +3097,60 @@ void __init kmem_cache_init(void) size_index[size_index_elem(i)] = 8; } + /* Caches that are not of the two-to-the-power-of size */ + if (KMALLOC_MIN_SIZE <= 32) { + kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); + caches++; + } + + if (KMALLOC_MIN_SIZE <= 64) { + kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); + caches++; + } + + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); + caches++; + } + slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ + if (KMALLOC_MIN_SIZE <= 32) { + kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); + BUG_ON(!kmalloc_caches[1]->name); + } + + if (KMALLOC_MIN_SIZE <= 64) { + kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); + BUG_ON(!kmalloc_caches[2]->name); + } + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); BUG_ON(!s); - kmalloc_caches[i].name = s; + kmalloc_caches[i]->name = s; } #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif -#ifdef CONFIG_NUMA - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); -#else - kmem_size = sizeof(struct kmem_cache); -#endif +#ifdef CONFIG_ZONE_DMA + for (i = 0; i < SLUB_PAGE_SHIFT; i++) { + struct kmem_cache *s = kmalloc_caches[i]; + + if (s && s->size) { + char *name = kasprintf(GFP_NOWAIT, + "dma-kmalloc-%d", s->objsize); + + BUG_ON(!name); + kmalloc_dma_caches[i] = create_kmalloc_cache(name, + s->objsize, SLAB_CACHE_DMA); + } + } +#endif printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", @@ -3211,6 +3228,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; + char *n; if (WARN_ON(!name)) return NULL; @@ -3234,19 +3252,25 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, return s; } + n = kstrdup(name, GFP_KERNEL); + if (!n) + goto err; + s = kmalloc(kmem_size, GFP_KERNEL); if (s) { - if (kmem_cache_open(s, GFP_KERNEL, name, + if (kmem_cache_open(s, n, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); if (sysfs_slab_add(s)) { list_del(&s->list); + kfree(n); kfree(s); goto err; } up_write(&slub_lock); return s; } + kfree(n); kfree(s); } up_write(&slub_lock); @@ -3318,6 +3342,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) return ret; } +#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { @@ -3346,8 +3371,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } +#endif -#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SYSFS static int count_inuse(struct page *page) { return page->inuse; @@ -3357,7 +3383,9 @@ static int count_total(struct page *page) { return page->objects; } +#endif +#ifdef CONFIG_SLUB_DEBUG static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { @@ -3448,65 +3476,6 @@ static long validate_slab_cache(struct kmem_cache *s) kfree(map); return count; } - -#ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) -{ - u8 *p; - - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); - - p = kzalloc(16, GFP_KERNEL); - p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); - - validate_slab_cache(kmalloc_caches + 4); - - /* Hmmm... The next two are dangerous */ - p = kzalloc(32, GFP_KERNEL); - p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - - validate_slab_cache(kmalloc_caches + 5); - p = kzalloc(64, GFP_KERNEL); - p += 64 + (get_cycles() & 0xff) * sizeof(void *); - *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches + 6); - - printk(KERN_ERR "\nB. Corruption after free\n"); - p = kzalloc(128, GFP_KERNEL); - kfree(p); - *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 7); - - p = kzalloc(256, GFP_KERNEL); - kfree(p); - p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); - validate_slab_cache(kmalloc_caches + 8); - - p = kzalloc(512, GFP_KERNEL); - kfree(p); - p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 9); -} -#else -static void resiliency_test(void) {}; -#endif - /* * Generate lists of code addresses where slabcache objects are allocated * and freed. @@ -3635,7 +3604,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc, - long *map) + unsigned long *map) { void *addr = page_address(page); void *p; @@ -3735,7 +3704,71 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf, "No data\n"); return len; } +#endif + +#ifdef SLUB_RESILIENCY_TEST +static void resiliency_test(void) +{ + u8 *p; + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); + + printk(KERN_ERR "SLUB resiliency testing\n"); + printk(KERN_ERR "-----------------------\n"); + printk(KERN_ERR "A. Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" + " 0x12->0x%p\n\n", p + 16); + + validate_slab_cache(kmalloc_caches[4]); + + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" + " 0x34 -> -0x%p\n", p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches[5]); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches[6]); + + printk(KERN_ERR "\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[7]); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", + p); + validate_slab_cache(kmalloc_caches[8]); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[9]); +} +#else +#ifdef CONFIG_SYSFS +static void resiliency_test(void) {}; +#endif +#endif + +#ifdef CONFIG_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -3788,6 +3821,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } + down_read(&slub_lock); +#ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3804,7 +3839,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, nodes[node] += x; } - } else if (flags & SO_PARTIAL) { + } else +#endif + if (flags & SO_PARTIAL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3829,6 +3866,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return x + sprintf(buf + x, "\n"); } +#ifdef CONFIG_SLUB_DEBUG static int any_slab_objects(struct kmem_cache *s) { int node; @@ -3844,6 +3882,7 @@ static int any_slab_objects(struct kmem_cache *s) } return 0; } +#endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj); @@ -3945,12 +3984,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(aliases); -static ssize_t slabs_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL); -} -SLAB_ATTR_RO(slabs); - static ssize_t partial_show(struct kmem_cache *s, char *buf) { return show_slab_objects(s, buf, SO_PARTIAL); @@ -3975,93 +4008,83 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); -static ssize_t total_objects_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); -} -SLAB_ATTR_RO(total_objects); - -static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); } -static ssize_t sanity_checks_store(struct kmem_cache *s, +static ssize_t reclaim_account_store(struct kmem_cache *s, const char *buf, size_t length) { - s->flags &= ~SLAB_DEBUG_FREE; + s->flags &= ~SLAB_RECLAIM_ACCOUNT; if (buf[0] == '1') - s->flags |= SLAB_DEBUG_FREE; + s->flags |= SLAB_RECLAIM_ACCOUNT; return length; } -SLAB_ATTR(sanity_checks); +SLAB_ATTR(reclaim_account); -static ssize_t trace_show(struct kmem_cache *s, char *buf) +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); } +SLAB_ATTR_RO(hwcache_align); -static ssize_t trace_store(struct kmem_cache *s, const char *buf, - size_t length) +#ifdef CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') - s->flags |= SLAB_TRACE; - return length; + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); } -SLAB_ATTR(trace); +SLAB_ATTR_RO(cache_dma); +#endif -#ifdef CONFIG_FAILSLAB -static ssize_t failslab_show(struct kmem_cache *s, char *buf) +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); } +SLAB_ATTR_RO(destroy_by_rcu); -static ssize_t failslab_store(struct kmem_cache *s, const char *buf, - size_t length) +#ifdef CONFIG_SLUB_DEBUG +static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_FAILSLAB; - if (buf[0] == '1') - s->flags |= SLAB_FAILSLAB; - return length; + return show_slab_objects(s, buf, SO_ALL); } -SLAB_ATTR(failslab); -#endif +SLAB_ATTR_RO(slabs); -static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); } +SLAB_ATTR_RO(total_objects); -static ssize_t reclaim_account_store(struct kmem_cache *s, - const char *buf, size_t length) +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_RECLAIM_ACCOUNT; - if (buf[0] == '1') - s->flags |= SLAB_RECLAIM_ACCOUNT; - return length; + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); } -SLAB_ATTR(reclaim_account); -static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +static ssize_t sanity_checks_store(struct kmem_cache *s, + const char *buf, size_t length) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); + s->flags &= ~SLAB_DEBUG_FREE; + if (buf[0] == '1') + s->flags |= SLAB_DEBUG_FREE; + return length; } -SLAB_ATTR_RO(hwcache_align); +SLAB_ATTR(sanity_checks); -#ifdef CONFIG_ZONE_DMA -static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +static ssize_t trace_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); } -SLAB_ATTR_RO(cache_dma); -#endif -static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +static ssize_t trace_store(struct kmem_cache *s, const char *buf, + size_t length) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + s->flags &= ~SLAB_TRACE; + if (buf[0] == '1') + s->flags |= SLAB_TRACE; + return length; } -SLAB_ATTR_RO(destroy_by_rcu); +SLAB_ATTR(trace); static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { @@ -4139,6 +4162,40 @@ static ssize_t validate_store(struct kmem_cache *s, } SLAB_ATTR(validate); +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); +#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + static ssize_t shrink_show(struct kmem_cache *s, char *buf) { return 0; @@ -4158,22 +4215,6 @@ static ssize_t shrink_store(struct kmem_cache *s, } SLAB_ATTR(shrink); -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_ALLOC); -} -SLAB_ATTR_RO(alloc_calls); - -static ssize_t free_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_FREE); -} -SLAB_ATTR_RO(free_calls); - #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -4279,25 +4320,27 @@ static struct attribute *slab_attrs[] = { &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, - &total_objects_attr.attr, - &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, &ctor_attr.attr, &aliases_attr.attr, &align_attr.attr, - &sanity_checks_attr.attr, - &trace_attr.attr, &hwcache_align_attr.attr, &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, + &shrink_attr.attr, +#ifdef CONFIG_SLUB_DEBUG + &total_objects_attr.attr, + &slabs_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, &red_zone_attr.attr, &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, - &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, +#endif #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -4377,6 +4420,7 @@ static void kmem_cache_release(struct kobject *kobj) { struct kmem_cache *s = to_slab(kobj); + kfree(s->name); kfree(s); } @@ -4579,7 +4623,7 @@ static int __init slab_sysfs_init(void) } __initcall(slab_sysfs_init); -#endif +#endif /* CONFIG_SYSFS */ /* * The /proc/slabinfo ABI diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index aa33fd67fa4..29d6cbffb28 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (vmemmap_buf_start) { /* need to free left buf */ -#ifdef CONFIG_NO_BOOTMEM - free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); - if (vmemmap_buf_start < vmemmap_buf) { - char name[15]; - - snprintf(name, sizeof(name), "MEMMAP %d", nodeid); - reserve_early_without_check(__pa(vmemmap_buf_start), - __pa(vmemmap_buf), name); - } -#else free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); -#endif vmemmap_buf = NULL; vmemmap_buf_end = NULL; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 7c703ff2f36..9fc7bac7db0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -139,7 +139,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); + nr_blocks, GFP_KERNEL, 0); if (err) return err; cond_resched(); @@ -150,7 +150,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); + nr_blocks, GFP_KERNEL, 0); if (err) break; @@ -189,7 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) + nr_blocks, GFP_NOIO, 0)) break; } diff --git a/mm/util.c b/mm/util.c index 4735ea48181..73dac81e9f7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -245,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + * If the architecture not support this fucntion, simply return with no + * page pinned + */ +int __attribute__((weak)) __get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + return 0; +} +EXPORT_SYMBOL_GPL(__get_user_pages_fast); + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b8889da69a..9f909622a25 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void purge_fragmented_blocks_allcpus(void); /* + * called before a call to iounmap() if the caller wants vm_area_struct's + * immediately freed. + */ +void set_iounmap_nonlazy(void) +{ + atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); +} + +/* * Purges all lazily-freed vmap areas. * * If sync is 0 then don't purge if there is already a purge in progress. @@ -2056,6 +2065,7 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +#ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; @@ -2336,6 +2346,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) free_vm_area(vms[i]); kfree(vms); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) diff --git a/mm/vmscan.c b/mm/vmscan.c index c391c320dba..b94c9464f26 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -79,8 +79,8 @@ struct scan_control { int order; /* - * Intend to reclaim enough contenious memory rather than to reclaim - * enough amount memory. I.e, it's the mode for high order allocation. + * Intend to reclaim enough continuous memory rather than reclaim + * enough amount of memory. i.e, mode for high order allocation. */ bool lumpy_reclaim_mode; @@ -1804,12 +1804,11 @@ static void shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static bool shrink_zones(int priority, struct zonelist *zonelist, +static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; - bool all_unreclaimable = true; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1827,8 +1826,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, } shrink_zone(priority, zone, sc); - all_unreclaimable = false; } +} + +static bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + +/* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. It can't handle OOM during hibernation. + * So let's check zone's unreclaimable in direct reclaim as well as kswapd. + */ +static bool all_unreclaimable(struct zonelist *zonelist, + struct scan_control *sc) +{ + struct zoneref *z; + struct zone *zone; + bool all_unreclaimable = true; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { + if (!populated_zone(zone)) + continue; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + if (zone_reclaimable(zone)) { + all_unreclaimable = false; + break; + } + } + return all_unreclaimable; } @@ -1852,7 +1881,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int priority; - bool all_unreclaimable; unsigned long total_scanned = 0; struct reclaim_state *reclaim_state = current->reclaim_state; struct zoneref *z; @@ -1869,7 +1897,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - all_unreclaimable = shrink_zones(priority, zonelist, sc); + shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -1931,7 +1959,7 @@ out: return sc->nr_reclaimed; /* top priority shrink_zones still had more to do? don't OOM, then */ - if (scanning_global_lru(sc) && !all_unreclaimable) + if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) return 1; return 0; @@ -2197,8 +2225,7 @@ loop_again: total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && - zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) + if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and |