Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig           |   6
-rw-r--r-- | mm/backing-dev.c     |   7
-rw-r--r-- | mm/bootmem.c         |  20
-rw-r--r-- | mm/dmapool.c         |   2
-rw-r--r-- | mm/filemap.c         |   1
-rw-r--r-- | mm/hugetlb.c         |  19
-rw-r--r-- | mm/kmemleak.c        | 470
-rw-r--r-- | mm/memcontrol.c      | 151
-rw-r--r-- | mm/memory.c          |  87
-rw-r--r-- | mm/mempolicy.c       |  84
-rw-r--r-- | mm/mempool.c         |   4
-rw-r--r-- | mm/mmap.c            |   3
-rw-r--r-- | mm/nommu.c           |  43
-rw-r--r-- | mm/oom_kill.c        |  64
-rw-r--r-- | mm/page-writeback.c  |  13
-rw-r--r-- | mm/page_alloc.c      |  86
-rw-r--r-- | mm/page_cgroup.c     |  24
-rw-r--r-- | mm/percpu.c          |  72
-rw-r--r-- | mm/rmap.c            |   6
-rw-r--r-- | mm/shmem.c           |   6
-rw-r--r-- | mm/shmem_acl.c       |  29
-rw-r--r-- | mm/slab.c            |  32
-rw-r--r-- | mm/slob.c            |   8
-rw-r--r-- | mm/slub.c            | 139
-rw-r--r-- | mm/swapfile.c        |  20
-rw-r--r-- | mm/thrash.c          |  32
-rw-r--r-- | mm/util.c            |   4
-rw-r--r-- | mm/vmscan.c          |  21
28 files changed, 882 insertions(+), 571 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bd..fe5f674d7a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 493b468a503..c86edd24429 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -283,7 +283,6 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; - void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; @@ -308,18 +307,18 @@ EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested - * @rw: READ or WRITE + * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit * write congestion. If no backing_devs are congested then just wait for the * next write to be completed. */ -long congestion_wait(int rw, long timeout) +long congestion_wait(int sync, long timeout) { long ret; DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); diff --git a/mm/bootmem.c b/mm/bootmem.c index 282df0a09e6..701740c9e81 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -12,6 +12,7 @@ #include <linux/pfn.h> #include <linux/bootmem.h> #include <linux/module.h> +#include <linux/kmemleak.h> #include <asm/bug.h> #include <asm/io.h> @@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, { unsigned long start, end; + kmemleak_free_part(__va(physaddr), size); + start = PFN_UP(physaddr); end = PFN_DOWN(physaddr + size); @@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size) { unsigned long start, end; + kmemleak_free_part(__va(addr), size); + start = PFN_UP(addr); end = PFN_DOWN(addr + size); @@ -516,6 +521,7 @@ find_block: region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + start_off); memset(region, 0, size); + kmemleak_alloc(region, size, 1, 0); return region; } @@ -536,11 +542,15 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, return kzalloc(size, GFP_NOWAIT); #ifdef CONFIG_HAVE_ARCH_BOOTMEM - bootmem_data_t *p_bdata; - - p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); - if (p_bdata) - return alloc_bootmem_core(p_bdata, size, align, goal, limit); + { + bootmem_data_t *p_bdata; + + p_bdata = bootmem_arch_preferred_node(bdata, size, align, + goal, limit); + if (p_bdata) + return alloc_bootmem_core(p_bdata, size, align, + goal, limit); + } #endif return NULL; } diff --git a/mm/dmapool.c b/mm/dmapool.c index b1f0885dda2..3df063706f5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) unsigned pages = 0; unsigned blocks = 0; + spin_lock_irq(&pool->lock); 
list_for_each_entry(page, &pool->page_list, page_list) { pages++; blocks += page->in_use; } + spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", diff --git a/mm/filemap.c b/mm/filemap.c index 22396713feb..ccea3b665c1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2272,6 +2272,7 @@ again: pagefault_enable(); flush_dcache_page(page); + mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a56e6f3ce97..cafdcee154e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1985,7 +1985,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, int write_access) + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; @@ -2053,7 +2053,7 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. */ - if (write_access && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -2072,7 +2072,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); - if (write_access && !(vma->vm_flags & VM_SHARED)) { + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } @@ -2091,7 +2091,7 @@ backout_unlocked: } int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pte_t *ptep; pte_t entry; @@ -2112,7 +2112,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_lock(&hugetlb_instantiation_mutex); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + ret = hugetlb_no_page(mm, vma, address, ptep, flags); goto out_mutex; } @@ -2126,7 +2126,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. 
*/ - if (write_access && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -2143,7 +2143,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page_table_lock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); @@ -2152,7 +2152,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + if (huge_ptep_set_access_flags(vma, address, ptep, entry, + flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, entry); out_page_table_lock: @@ -2369,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= blocks_per_huge_page(h); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 58ec86c9e58..487267310a8 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -48,10 +48,10 @@ * scanned. This list is only modified during a scanning episode when the * scan_mutex is held. At the end of a scan, the gray_list is always empty. * Note that the kmemleak_object.use_count is incremented when an object is - * added to the gray_list and therefore cannot be freed - * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs - * file together with modifications to the memory scanning parameters - * including the scan_thread pointer + * added to the gray_list and therefore cannot be freed. This mutex also + * prevents multiple users of the "kmemleak" debugfs file together with + * modifications to the memory scanning parameters including the scan_thread + * pointer * * The kmemleak_object structures have a use_count incremented or decremented * using the get_object()/put_object() functions. When the use_count becomes @@ -61,6 +61,8 @@ * structure. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> @@ -101,14 +103,16 @@ * Kmemleak configuration and common defines. 
*/ #define MAX_TRACE 16 /* stack trace length */ -#define REPORTS_NR 50 /* maximum number of reported leaks */ #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ -#define MSECS_SCAN_YIELD 10 /* CPU yielding period */ #define SECS_FIRST_SCAN 60 /* delay before the first scan */ #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ +#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ #define BYTES_PER_POINTER sizeof(void *) +/* GFP bitmask for kmemleak internal allocations */ +#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) + /* scanning area inside a memory block */ struct kmemleak_scan_area { struct hlist_node node; @@ -154,6 +158,8 @@ struct kmemleak_object { #define OBJECT_REPORTED (1 << 1) /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +/* flag set on newly allocated objects */ +#define OBJECT_NEW (1 << 3) /* the list of all allocated objects */ static LIST_HEAD(object_list); @@ -181,27 +187,21 @@ static atomic_t kmemleak_error = ATOMIC_INIT(0); static unsigned long min_addr = ULONG_MAX; static unsigned long max_addr; -/* used for yielding the CPU to other tasks during scanning */ -static unsigned long next_scan_yield; static struct task_struct *scan_thread; -static unsigned long jiffies_scan_yield; +/* used to avoid reporting of recently allocated objects */ static unsigned long jiffies_min_age; +static unsigned long jiffies_last_scan; /* delay between automatic memory scannings */ static signed long jiffies_scan_wait; /* enables or disables the task stacks scanning */ -static int kmemleak_stack_scan; -/* mutex protecting the memory scanning */ +static int kmemleak_stack_scan = 1; +/* protects the memory scanning, parameters and debug/kmemleak file access */ static DEFINE_MUTEX(scan_mutex); -/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ -static DEFINE_MUTEX(kmemleak_mutex); - -/* number of leaks reported (for limitation purposes) */ -static int reported_leaks; /* - * Early object allocation/freeing logging. Kkmemleak is initialized after the + * Early object allocation/freeing logging. Kmemleak is initialized after the * kernel allocator. However, both the kernel allocator and kmemleak may - * allocate memory blocks which need to be tracked. Kkmemleak defines an + * allocate memory blocks which need to be tracked. Kmemleak defines an * arbitrary buffer to hold the allocation/freeing information before it is * fully initialized. */ @@ -210,6 +210,7 @@ static int reported_leaks; enum { KMEMLEAK_ALLOC, KMEMLEAK_FREE, + KMEMLEAK_FREE_PART, KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, @@ -230,7 +231,7 @@ struct early_log { }; /* early logging buffer and current position */ -static struct early_log early_log[200]; +static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; static int crt_early_log; static void kmemleak_disable(void); @@ -245,10 +246,10 @@ static void kmemleak_disable(void); /* * Macro invoked when a serious kmemleak condition occured and cannot be - * recovered from. Kkmemleak will be disabled and further allocation/freeing + * recovered from. Kmemleak will be disabled and further allocation/freeing * tracing no longer available. */ -#define kmemleak_panic(x...) do { \ +#define kmemleak_stop(x...) 
do { \ kmemleak_warn(x); \ kmemleak_disable(); \ } while (0) @@ -273,13 +274,9 @@ static int color_gray(const struct kmemleak_object *object) return object->min_count != -1 && object->count >= object->min_count; } -/* - * Objects are considered referenced if their color is gray and they have not - * been deleted. - */ -static int referenced_object(struct kmemleak_object *object) +static int color_black(const struct kmemleak_object *object) { - return (object->flags & OBJECT_ALLOCATED) && color_gray(object); + return object->min_count == -1; } /* @@ -290,42 +287,28 @@ static int referenced_object(struct kmemleak_object *object) static int unreferenced_object(struct kmemleak_object *object) { return (object->flags & OBJECT_ALLOCATED) && color_white(object) && - time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); + time_before_eq(object->jiffies + jiffies_min_age, + jiffies_last_scan); } /* - * Printing of the (un)referenced objects information, either to the seq file - * or to the kernel log. The print_referenced/print_unreferenced functions - * must be called with the object->lock held. + * Printing of the unreferenced objects information to the seq file. The + * print_unreferenced function must be called with the object->lock held. */ -#define print_helper(seq, x...) do { \ - struct seq_file *s = (seq); \ - if (s) \ - seq_printf(s, x); \ - else \ - pr_info(x); \ -} while (0) - -static void print_referenced(struct kmemleak_object *object) -{ - pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n", - object->pointer, object->size); -} - static void print_unreferenced(struct seq_file *seq, struct kmemleak_object *object) { int i; - print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n", - object->pointer, object->size); - print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); - print_helper(seq, " backtrace:\n"); + seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + object->pointer, object->size); + seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", + object->comm, object->pid, object->jiffies); + seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { void *ptr = (void *)object->trace[i]; - print_helper(seq, " [<%p>] %pS\n", ptr, ptr); + seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); } } @@ -341,7 +324,7 @@ static void dump_object_info(struct kmemleak_object *object) trace.nr_entries = object->trace_len; trace.entries = object->trace; - pr_notice("kmemleak: Object 0x%08lx (size %zu):\n", + pr_notice("Object 0x%08lx (size %zu):\n", object->tree_node.start, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); @@ -369,7 +352,7 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) object = prio_tree_entry(node, struct kmemleak_object, tree_node); if (!alias && object->pointer != ptr) { - kmemleak_warn("kmemleak: Found object by alias"); + kmemleak_warn("Found object by alias"); object = NULL; } } else @@ -462,10 +445,9 @@ static void create_object(unsigned long ptr, size_t size, int min_count, struct prio_tree_node *node; struct stack_trace trace; - object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK); + object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); if (!object) { - kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object " - "structure\n"); + kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); return; } @@ -474,7 +456,7 @@ static void create_object(unsigned 
long ptr, size_t size, int min_count, INIT_HLIST_HEAD(&object->area_list); spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED; + object->flags = OBJECT_ALLOCATED | OBJECT_NEW; object->pointer = ptr; object->size = size; object->min_count = min_count; @@ -524,8 +506,8 @@ static void create_object(unsigned long ptr, size_t size, int min_count, if (node != &object->tree_node) { unsigned long flags; - kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object " - "search tree (already existing)\n", ptr); + kmemleak_stop("Cannot insert 0x%lx into the object search tree " + "(already existing)\n", ptr); object = lookup_object(ptr, 1); spin_lock_irqsave(&object->lock, flags); dump_object_info(object); @@ -542,39 +524,87 @@ out: * Remove the metadata (struct kmemleak_object) for a memory block from the * object_list and object_tree_root and decrement its use_count. */ -static void delete_object(unsigned long ptr) +static void __delete_object(struct kmemleak_object *object) { unsigned long flags; - struct kmemleak_object *object; write_lock_irqsave(&kmemleak_lock, flags); - object = lookup_object(ptr, 0); - if (!object) { - kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n", - ptr); - write_unlock_irqrestore(&kmemleak_lock, flags); - return; - } prio_tree_remove(&object_tree_root, &object->tree_node); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); WARN_ON(!(object->flags & OBJECT_ALLOCATED)); - WARN_ON(atomic_read(&object->use_count) < 1); + WARN_ON(atomic_read(&object->use_count) < 2); /* * Locking here also ensures that the corresponding memory block * cannot be freed when it is being scanned. */ spin_lock_irqsave(&object->lock, flags); - if (object->flags & OBJECT_REPORTED) - print_referenced(object); object->flags &= ~OBJECT_ALLOCATED; spin_unlock_irqrestore(&object->lock, flags); put_object(object); } /* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. + */ +static void delete_object_full(unsigned long ptr) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Freeing unknown object at 0x%08lx\n", + ptr); +#endif + return; + } + __delete_object(object); + put_object(object); +} + +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. If the memory block is partially freed, the function may create + * additional metadata for the remaining parts of the block. + */ +static void delete_object_part(unsigned long ptr, size_t size) +{ + struct kmemleak_object *object; + unsigned long start, end; + + object = find_and_get_object(ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx " + "(size %zu)\n", ptr, size); +#endif + return; + } + __delete_object(object); + + /* + * Create one or two objects that may result from the memory block + * split. Note that partial freeing is only done by free_bootmem() and + * this happens before kmemleak_init() is called. The path below is + * only executed during early log recording in kmemleak_init(), so + * GFP_KERNEL is enough. 
+ */ + start = object->pointer; + end = object->pointer + object->size; + if (ptr > start) + create_object(start, ptr - start, object->min_count, + GFP_KERNEL); + if (ptr + size < end) + create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL); + + put_object(object); +} +/* * Make a object permanently as gray-colored so that it can no longer be * reported as a leak. This is used in general to mark a false positive. */ @@ -585,8 +615,7 @@ static void make_gray_object(unsigned long ptr) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n", - ptr); + kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); return; } @@ -607,8 +636,7 @@ static void make_black_object(unsigned long ptr) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n", - ptr); + kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr); return; } @@ -631,21 +659,20 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Adding scan area to unknown " - "object at 0x%08lx\n", ptr); + kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", + ptr); return; } - area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK); + area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); if (!area) { - kmemleak_warn("kmemleak: Cannot allocate a scan area\n"); + kmemleak_warn("Cannot allocate a scan area\n"); goto out; } spin_lock_irqsave(&object->lock, flags); if (offset + length > object->size) { - kmemleak_warn("kmemleak: Scan area larger than object " - "0x%08lx\n", ptr); + kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); goto out_unlock; @@ -674,8 +701,7 @@ static void object_no_scan(unsigned long ptr) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Not scanning unknown object at " - "0x%08lx\n", ptr); + kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr); return; } @@ -696,7 +722,8 @@ static void log_early(int op_type, const void *ptr, size_t size, struct early_log *log; if (crt_early_log >= ARRAY_SIZE(early_log)) { - kmemleak_panic("kmemleak: Early log buffer exceeded\n"); + pr_warning("Early log buffer exceeded\n"); + kmemleak_disable(); return; } @@ -741,13 +768,28 @@ void kmemleak_free(const void *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - delete_object((unsigned long)ptr); + delete_object_full((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); /* + * Partial memory freeing function callback. This function is usually called + * from bootmem allocator when (part of) a memory block is freed. + */ +void kmemleak_free_part(const void *ptr, size_t size) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + delete_object_part((unsigned long)ptr, size); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_part); + +/* * Mark an already allocated memory block as a false positive. This will cause * the block to no longer be reported as leak and always be scanned. 
*/ @@ -808,21 +850,6 @@ void kmemleak_no_scan(const void *ptr) EXPORT_SYMBOL(kmemleak_no_scan); /* - * Yield the CPU so that other tasks get a chance to run. The yielding is - * rate-limited to avoid excessive number of calls to the schedule() function - * during memory scanning. - */ -static void scan_yield(void) -{ - might_sleep(); - - if (time_is_before_eq_jiffies(next_scan_yield)) { - schedule(); - next_scan_yield = jiffies + jiffies_scan_yield; - } -} - -/* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occured. */ @@ -848,7 +875,7 @@ static int scan_should_stop(void) * found to the gray list. */ static void scan_block(void *_start, void *_end, - struct kmemleak_object *scanned) + struct kmemleak_object *scanned, int allow_resched) { unsigned long *ptr; unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); @@ -859,18 +886,11 @@ static void scan_block(void *_start, void *_end, unsigned long pointer = *ptr; struct kmemleak_object *object; + if (allow_resched) + cond_resched(); if (scan_should_stop()) break; - /* - * When scanning a memory block with a corresponding - * kmemleak_object, the CPU yielding is handled in the calling - * code since it holds the object->lock to avoid the block - * freeing. - */ - if (!scanned) - scan_yield(); - object = find_and_get_object(pointer, 1); if (!object) continue; @@ -931,12 +951,12 @@ static void scan_object(struct kmemleak_object *object) goto out; if (hlist_empty(&object->area_list)) scan_block((void *)object->pointer, - (void *)(object->pointer + object->size), object); + (void *)(object->pointer + object->size), object, 0); else hlist_for_each_entry(area, elem, &object->area_list, node) scan_block((void *)(object->pointer + area->offset), (void *)(object->pointer + area->offset - + area->length), object); + + area->length), object, 0); out: spin_unlock_irqrestore(&object->lock, flags); } @@ -952,6 +972,10 @@ static void kmemleak_scan(void) struct kmemleak_object *object, *tmp; struct task_struct *task; int i; + int new_leaks = 0; + int gray_list_pass = 0; + + jiffies_last_scan = jiffies; /* prepare the kmemleak_object's */ rcu_read_lock(); @@ -963,13 +987,14 @@ static void kmemleak_scan(void) * 1 reference to any object at this point. 
*/ if (atomic_read(&object->use_count) > 1) { - pr_debug("kmemleak: object->use_count = %d\n", + pr_debug("object->use_count = %d\n", atomic_read(&object->use_count)); dump_object_info(object); } #endif /* reset the reference count (whiten the object) */ object->count = 0; + object->flags &= ~OBJECT_NEW; if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); @@ -978,14 +1003,14 @@ static void kmemleak_scan(void) rcu_read_unlock(); /* data/bss scanning */ - scan_block(_sdata, _edata, NULL); - scan_block(__bss_start, __bss_stop, NULL); + scan_block(_sdata, _edata, NULL, 1); + scan_block(__bss_start, __bss_stop, NULL, 1); #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) scan_block(__per_cpu_start + per_cpu_offset(i), - __per_cpu_end + per_cpu_offset(i), NULL); + __per_cpu_end + per_cpu_offset(i), NULL, 1); #endif /* @@ -1007,7 +1032,7 @@ static void kmemleak_scan(void) /* only scan if page is in use */ if (page_count(page) == 0) continue; - scan_block(page, page + 1, NULL); + scan_block(page, page + 1, NULL, 1); } } @@ -1019,7 +1044,8 @@ static void kmemleak_scan(void) read_lock(&tasklist_lock); for_each_process(task) scan_block(task_stack_page(task), - task_stack_page(task) + THREAD_SIZE, NULL); + task_stack_page(task) + THREAD_SIZE, + NULL, 0); read_unlock(&tasklist_lock); } @@ -1031,9 +1057,10 @@ static void kmemleak_scan(void) * kmemleak objects cannot be freed from outside the loop because their * use_count was increased. */ +repeat: object = list_entry(gray_list.next, typeof(*object), gray_list); while (&object->gray_list != &gray_list) { - scan_yield(); + cond_resched(); /* may add new objects to the list */ if (!scan_should_stop()) @@ -1048,7 +1075,59 @@ static void kmemleak_scan(void) object = tmp; } + + if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) + goto scan_end; + + /* + * Check for new objects allocated during this scanning and add them + * to the gray list. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_NEW) && !color_black(object) && + get_object(object)) { + object->flags &= ~OBJECT_NEW; + list_add_tail(&object->gray_list, &gray_list); + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (!list_empty(&gray_list)) + goto repeat; + +scan_end: WARN_ON(!list_empty(&gray_list)); + + /* + * If scanning was stopped or new objects were being allocated at a + * higher rate than gray list scanning, do not report any new + * unreferenced objects. + */ + if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) + return; + + /* + * Scanning result reporting. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if (unreferenced_object(object) && + !(object->flags & OBJECT_REPORTED)) { + object->flags |= OBJECT_REPORTED; + new_leaks++; + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (new_leaks) + pr_info("%d new suspected memory leaks (see " + "/sys/kernel/debug/kmemleak)\n", new_leaks); + } /* @@ -1059,7 +1138,8 @@ static int kmemleak_scan_thread(void *arg) { static int first_run = 1; - pr_info("kmemleak: Automatic memory scanning thread started\n"); + pr_info("Automatic memory scanning thread started\n"); + set_user_nice(current, 10); /* * Wait before the first scan to allow the system to fully initialize. 
@@ -1070,49 +1150,25 @@ static int kmemleak_scan_thread(void *arg) } while (!kthread_should_stop()) { - struct kmemleak_object *object; signed long timeout = jiffies_scan_wait; mutex_lock(&scan_mutex); - kmemleak_scan(); - reported_leaks = 0; - - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) { - unsigned long flags; - - if (reported_leaks >= REPORTS_NR) - break; - spin_lock_irqsave(&object->lock, flags); - if (!(object->flags & OBJECT_REPORTED) && - unreferenced_object(object)) { - print_unreferenced(NULL, object); - object->flags |= OBJECT_REPORTED; - reported_leaks++; - } else if ((object->flags & OBJECT_REPORTED) && - referenced_object(object)) { - print_referenced(object); - object->flags &= ~OBJECT_REPORTED; - } - spin_unlock_irqrestore(&object->lock, flags); - } - rcu_read_unlock(); - mutex_unlock(&scan_mutex); + /* wait before the next scan */ while (timeout && !kthread_should_stop()) timeout = schedule_timeout_interruptible(timeout); } - pr_info("kmemleak: Automatic memory scanning thread ended\n"); + pr_info("Automatic memory scanning thread ended\n"); return 0; } /* * Start the automatic memory scanning thread. This function must be called - * with the kmemleak_mutex held. + * with the scan_mutex held. */ void start_scan_thread(void) { @@ -1120,14 +1176,14 @@ void start_scan_thread(void) return; scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); if (IS_ERR(scan_thread)) { - pr_warning("kmemleak: Failed to create the scan thread\n"); + pr_warning("Failed to create the scan thread\n"); scan_thread = NULL; } } /* * Stop the automatic memory scanning thread. This function must be called - * with the kmemleak_mutex held. + * with the scan_mutex held. */ void stop_scan_thread(void) { @@ -1146,13 +1202,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) { struct kmemleak_object *object; loff_t n = *pos; + int err; - if (!n) { - kmemleak_scan(); - reported_leaks = 0; - } - if (reported_leaks >= REPORTS_NR) - return NULL; + err = mutex_lock_interruptible(&scan_mutex); + if (err < 0) + return ERR_PTR(err); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { @@ -1163,7 +1217,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) } object = NULL; out: - rcu_read_unlock(); return object; } @@ -1178,17 +1231,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct list_head *n = &prev_obj->object_list; ++(*pos); - if (reported_leaks >= REPORTS_NR) - goto out; - rcu_read_lock(); list_for_each_continue_rcu(n, &object_list) { next_obj = list_entry(n, struct kmemleak_object, object_list); if (get_object(next_obj)) break; } - rcu_read_unlock(); -out: + put_object(prev_obj); return next_obj; } @@ -1198,8 +1247,16 @@ out: */ static void kmemleak_seq_stop(struct seq_file *seq, void *v) { - if (v) - put_object(v); + if (!IS_ERR(v)) { + /* + * kmemleak_seq_start may return ERR_PTR if the scan_mutex + * waiting was interrupted, so only release it if !IS_ERR. 
+ */ + rcu_read_unlock(); + mutex_unlock(&scan_mutex); + if (v) + put_object(v); + } } /* @@ -1211,11 +1268,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v) unsigned long flags; spin_lock_irqsave(&object->lock, flags); - if (!unreferenced_object(object)) - goto out; - print_unreferenced(seq, object); - reported_leaks++; -out: + if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) + print_unreferenced(seq, object); spin_unlock_irqrestore(&object->lock, flags); return 0; } @@ -1229,43 +1283,15 @@ static const struct seq_operations kmemleak_seq_ops = { static int kmemleak_open(struct inode *inode, struct file *file) { - int ret = 0; - if (!atomic_read(&kmemleak_enabled)) return -EBUSY; - ret = mutex_lock_interruptible(&kmemleak_mutex); - if (ret < 0) - goto out; - if (file->f_mode & FMODE_READ) { - ret = mutex_lock_interruptible(&scan_mutex); - if (ret < 0) - goto kmemleak_unlock; - ret = seq_open(file, &kmemleak_seq_ops); - if (ret < 0) - goto scan_unlock; - } - return ret; - -scan_unlock: - mutex_unlock(&scan_mutex); -kmemleak_unlock: - mutex_unlock(&kmemleak_mutex); -out: - return ret; + return seq_open(file, &kmemleak_seq_ops); } static int kmemleak_release(struct inode *inode, struct file *file) { - int ret = 0; - - if (file->f_mode & FMODE_READ) { - seq_release(inode, file); - mutex_unlock(&scan_mutex); - } - mutex_unlock(&kmemleak_mutex); - - return ret; + return seq_release(inode, file); } /* @@ -1278,21 +1304,24 @@ static int kmemleak_release(struct inode *inode, struct file *file) * scan=off - stop the automatic memory scanning thread * scan=... - set the automatic memory scanning period in seconds (0 to * disable it) + * scan - trigger a memory scan */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) { char buf[64]; int buf_size; - - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; + int ret; buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; buf[buf_size] = 0; + ret = mutex_lock_interruptible(&scan_mutex); + if (ret < 0) + return ret; + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1305,18 +1334,24 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, stop_scan_thread(); else if (strncmp(buf, "scan=", 5) == 0) { unsigned long secs; - int err; - err = strict_strtoul(buf + 5, 0, &secs); - if (err < 0) - return err; + ret = strict_strtoul(buf + 5, 0, &secs); + if (ret < 0) + goto out; stop_scan_thread(); if (secs) { jiffies_scan_wait = msecs_to_jiffies(secs * 1000); start_scan_thread(); } - } else - return -EINVAL; + } else if (strncmp(buf, "scan", 4) == 0) + kmemleak_scan(); + else + ret = -EINVAL; + +out: + mutex_unlock(&scan_mutex); + if (ret < 0) + return ret; /* ignore the rest of the buffer, only one command at a time */ *ppos += size; @@ -1340,14 +1375,12 @@ static int kmemleak_cleanup_thread(void *arg) { struct kmemleak_object *object; - mutex_lock(&kmemleak_mutex); + mutex_lock(&scan_mutex); stop_scan_thread(); - mutex_unlock(&kmemleak_mutex); - mutex_lock(&scan_mutex); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) - delete_object(object->pointer); + delete_object_full(object->pointer); rcu_read_unlock(); mutex_unlock(&scan_mutex); @@ -1364,7 +1397,7 @@ static void kmemleak_cleanup(void) cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, "kmemleak-clean"); if (IS_ERR(cleanup_thread)) - pr_warning("kmemleak: 
Failed to create the clean-up thread\n"); + pr_warning("Failed to create the clean-up thread\n"); } /* @@ -1404,14 +1437,13 @@ static int kmemleak_boot_config(char *str) early_param("kmemleak", kmemleak_boot_config); /* - * Kkmemleak initialization. + * Kmemleak initialization. */ void __init kmemleak_init(void) { int i; unsigned long flags; - jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); @@ -1443,6 +1475,9 @@ void __init kmemleak_init(void) case KMEMLEAK_FREE: kmemleak_free(log->ptr); break; + case KMEMLEAK_FREE_PART: + kmemleak_free_part(log->ptr, log->size); + break; case KMEMLEAK_NOT_LEAK: kmemleak_not_leak(log->ptr); break; @@ -1485,11 +1520,10 @@ static int __init kmemleak_late_init(void) dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, &kmemleak_fops); if (!dentry) - pr_warning("kmemleak: Failed to create the debugfs kmemleak " - "file\n"); - mutex_lock(&kmemleak_mutex); + pr_warning("Failed to create the debugfs kmemleak file\n"); + mutex_lock(&scan_mutex); start_scan_thread(); - mutex_unlock(&kmemleak_mutex); + mutex_unlock(&scan_mutex); pr_info("Kernel memory leak detector initialized\n"); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 70db6e0a5ee..fd4529d86de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -45,7 +45,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP -/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ +/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; static int really_do_swap_account __initdata = 1; /* for remember boot option*/ #else @@ -62,7 +62,8 @@ enum mem_cgroup_stat_index { * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. */ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ + MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ @@ -176,6 +177,9 @@ struct mem_cgroup { unsigned int swappiness; + /* set when res.limit == memsw.limit */ + bool memsw_is_minimum; + /* * statistics. This must be placed at the end of memcg. 
*/ @@ -188,6 +192,7 @@ enum charge_type { MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ + MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ NR_CHARGE_TYPE, }; @@ -644,6 +649,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; int lru = LRU_FILE * !!file + !!active; + int ret; BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); @@ -661,9 +667,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, continue; scan++; - if (__isolate_lru_page(page, mode, file) == 0) { + ret = __isolate_lru_page(page, mode, file); + switch (ret) { + case 0: list_move(&page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; + break; + case -EBUSY: + /* we don't affect global LRU but rotate in our LRU */ + mem_cgroup_rotate_lru_list(page, page_lru(page)); + break; + default: + break; } } @@ -845,6 +861,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, int ret, total = 0; int loop = 0; + /* If memsw_is_minimum==1, swap-out is of-no-use. */ + if (root_mem->memsw_is_minimum) + noswap = true; + while (loop < 2) { victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) @@ -900,6 +920,44 @@ static void record_last_oom(struct mem_cgroup *mem) mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); } +/* + * Currently used to update mapped file statistics, but the routine can be + * generalized to update other statistics as well. + */ +void mem_cgroup_update_mapped_file_stat(struct page *page, int val) +{ + struct mem_cgroup *mem; + struct mem_cgroup_stat *stat; + struct mem_cgroup_stat_cpu *cpustat; + int cpu; + struct page_cgroup *pc; + + if (!page_is_file_cache(page)) + return; + + pc = lookup_page_cgroup(page); + if (unlikely(!pc)) + return; + + lock_page_cgroup(pc); + mem = pc->mem_cgroup; + if (!mem) + goto done; + + if (!PageCgroupUsed(pc)) + goto done; + + /* + * Preemption is already disabled, we don't need get_cpu() + */ + cpu = smp_processor_id(); + stat = &mem->stat; + cpustat = &stat->cpustat[cpu]; + + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); +done: + unlock_page_cgroup(pc); +} /* * Unlike exported interface, "oom" parameter is added. 
if oom==true, @@ -1098,6 +1156,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup_per_zone *from_mz, *to_mz; int nid, zid; int ret = -EBUSY; + struct page *page; + int cpu; + struct mem_cgroup_stat *stat; + struct mem_cgroup_stat_cpu *cpustat; VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); @@ -1118,6 +1180,23 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); + + page = pc->page; + if (page_is_file_cache(page) && page_mapped(page)) { + cpu = smp_processor_id(); + /* Update mapped_file data for mem_cgroup "from" */ + stat = &from->stat; + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, + -1); + + /* Update mapped_file data for mem_cgroup "to" */ + stat = &to->stat; + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, + 1); + } + if (do_swap_account) res_counter_uncharge(&from->memsw, PAGE_SIZE); css_put(&from->css); @@ -1128,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ret = 0; out: unlock_page_cgroup(pc); + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and it's garanteed that + * "to" is never removed. So, we don't check rmdir status here. + */ return ret; } @@ -1349,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); __mem_cgroup_commit_charge(ptr, pc, ctype); @@ -1378,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, } rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ - + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&ptr->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) @@ -1433,6 +1523,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: + case MEM_CGROUP_CHARGE_TYPE_DROP: if (page_mapped(page)) goto unlock_out; break; @@ -1496,18 +1587,23 @@ void mem_cgroup_uncharge_cache_page(struct page *page) * called after __delete_from_swap_cache() and drop "page" account. * memcg information is recorded to swap_cgroup of "ent" */ -void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +void +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) { struct mem_cgroup *memcg; + int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; + + if (!swapout) /* this was a swap cache but the swap is unused ! 
*/ + ctype = MEM_CGROUP_CHARGE_TYPE_DROP; + + memcg = __mem_cgroup_uncharge_common(page, ctype); - memcg = __mem_cgroup_uncharge_common(page, - MEM_CGROUP_CHARGE_TYPE_SWAPOUT); /* record memcg information */ - if (do_swap_account && memcg) { + if (do_swap_account && swapout && memcg) { swap_cgroup_record(ent, css_id(&memcg->css)); mem_cgroup_get(memcg); } - if (memcg) + if (swapout && memcg) css_put(&memcg->css); } #endif @@ -1579,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, if (!mem) return; - + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { target = oldpage; @@ -1619,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, */ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) mem_cgroup_uncharge_page(target); + /* + * At migration, we may charge account against cgroup which has no tasks + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&mem->css); } /* @@ -1685,6 +1787,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } ret = res_counter_set_limit(&memcg->res, val); + if (!ret) { + if (memswlimit == val) + memcg->memsw_is_minimum = true; + else + memcg->memsw_is_minimum = false; + } mutex_unlock(&set_limit_mutex); if (!ret) @@ -1703,16 +1811,14 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, return ret; } -int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long long val) +static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, + unsigned long long val) { int retry_count; u64 memlimit, oldusage, curusage; int children = mem_cgroup_count_children(memcg); int ret = -EBUSY; - if (!do_swap_account) - return -EINVAL; /* see mem_cgroup_resize_res_limit */ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); @@ -1734,6 +1840,12 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, break; } ret = res_counter_set_limit(&memcg->memsw, val); + if (!ret) { + if (memlimit == val) + memcg->memsw_is_minimum = true; + else + memcg->memsw_is_minimum = false; + } mutex_unlock(&set_limit_mutex); if (!ret) @@ -1878,7 +1990,7 @@ try_to_free: if (!progress) { nr_retries--; /* maybe some writeback is necessary */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } } @@ -1947,8 +2059,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - if (do_swap_account) - val = res_counter_read_u64(&mem->memsw, name); + val = res_counter_read_u64(&mem->memsw, name); break; default: BUG(); @@ -2046,6 +2157,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) enum { MCS_CACHE, MCS_RSS, + MCS_MAPPED_FILE, MCS_PGPGIN, MCS_PGPGOUT, MCS_INACTIVE_ANON, @@ -2066,6 +2178,7 @@ struct { } memcg_stat_strings[NR_MCS_STAT] = { {"cache", "total_cache"}, {"rss", "total_rss"}, + {"mapped_file", "total_mapped_file"}, {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, {"inactive_anon", "total_inactive_anon"}, @@ -2086,6 +2199,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_CACHE] += val * PAGE_SIZE; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); s->stat[MCS_RSS] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); + s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; val 
= mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); s->stat[MCS_PGPGIN] += val; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); diff --git a/mm/memory.c b/mm/memory.c index d5d1653d60a..aede2ce3aba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_free_tlb(tlb, token); + pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } @@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - free_pte_range(tlb, pmd); + free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* @@ -1207,8 +1208,8 @@ static inline int use_zero_page(struct vm_area_struct *vma) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, int flags, + struct page **pages, struct vm_area_struct **vmas) { int i; unsigned int vm_flags = 0; @@ -1217,7 +1218,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); - if (len <= 0) + if (nr_pages <= 0) return 0; /* * Require read or write permissions. @@ -1269,7 +1270,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = gate_vma; i++; start += PAGE_SIZE; - len--; + nr_pages--; continue; } @@ -1280,7 +1281,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &len, i, write); + &start, &nr_pages, i, write); continue; } @@ -1310,8 +1311,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + ret = handle_mm_fault(mm, vma, start, - foll_flags & FOLL_WRITE); + (foll_flags & FOLL_WRITE) ? + FAULT_FLAG_WRITE : 0); + if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? 
i : -ENOMEM; @@ -1354,9 +1358,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = vma; i++; start += PAGE_SIZE; - len--; - } while (len && start < vma->vm_end); - } while (len); + nr_pages--; + } while (nr_pages && start < vma->vm_end); + } while (nr_pages); return i; } @@ -1365,7 +1369,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * @tsk: task_struct of target task * @mm: mm_struct of target mm * @start: starting user address - * @len: number of pages from start to pin + * @nr_pages: number of pages from start to pin * @write: whether pages will be written to by the caller * @force: whether to force write access even if user mapping is * readonly. This will result in the page being COWed even @@ -1377,7 +1381,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * Or NULL if the caller does not require them. * * Returns number of pages pinned. This may be fewer than the number - * requested. If len is 0 or negative, returns 0. If no pages + * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. Each page returned must be released * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. @@ -1411,7 +1415,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * See also get_user_pages_fast, for performance critical applications. */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = 0; @@ -1421,9 +1425,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= GUP_FLAGS_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } EXPORT_SYMBOL(get_user_pages); @@ -2496,7 +2498,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) */ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; struct page *page; @@ -2516,7 +2518,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ + grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { @@ -2572,9 +2574,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); - if (write_access && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - write_access = 0; + flags &= ~FAULT_FLAG_WRITE; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); @@ -2587,7 +2589,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, try_to_free_swap(page); unlock_page(page); - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; @@ -2616,7 +2618,7 @@ out_page: */ static int do_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) + unsigned int flags) { struct page *page; spinlock_t *ptl; @@ -2776,7 +2778,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * due to the bad i386 page protection. But it's valid * for other architectures too. * - * Note that if write_access is true, we either now have + * Note that if FAULT_FLAG_WRITE is set, we either now have * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. @@ -2847,11 +2849,10 @@ unwritable_page: static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { pgoff_t pgoff = (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); pte_unmap(page_table); return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); @@ -2868,12 +2869,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { - unsigned int flags = FAULT_FLAG_NONLINEAR | - (write_access ? FAULT_FLAG_WRITE : 0); pgoff_t pgoff; + flags |= FAULT_FLAG_NONLINEAR; + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; @@ -2904,7 +2905,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, int write_access) + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -2915,30 +2916,30 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); } return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); + pte, pmd, flags); } if (pte_file(entry)) return do_nonlinear_fault(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); return do_swap_page(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); } ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, pmd, ptl, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { + if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { update_mmu_cache(vma, address, entry); } else { /* @@ -2947,7 +2948,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * This still avoids useless tlb flushes for .text page faults * with threads. 
*/ - if (write_access) + if (flags & FAULT_FLAG_WRITE) flush_tlb_page(vma, address); } unlock: @@ -2959,7 +2960,7 @@ unlock: * By the time we get here, we already hold the mm semaphore */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; @@ -2971,7 +2972,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); if (unlikely(is_vm_hugetlb_page(vma))) - return hugetlb_fault(mm, vma, address, write_access); + return hugetlb_fault(mm, vma, address, flags); pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); @@ -2984,7 +2985,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) return VM_FAULT_OOM; - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); + return handle_pte_fault(mm, vma, address, pte, pmd, flags); } #ifndef __PAGETABLE_PUD_FOLDED diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4da63..7dd9d9f8069 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_semaphore for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) { - nodemask_t cpuset_context_nmask; int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; + /* Check N_HIGH_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); + nodes_and(nsc->mask2, *nodes, nsc->mask1); + if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else @@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) cpuset_current_mems_allowed; } - ret = mpol_ops[pol->mode].create(pol, - nodes ? &cpuset_context_nmask : NULL); + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } @@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, { struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + NODEMASK_SCRATCH(scratch); int ret; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); + if (!scratch) + return -ENOMEM; + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } /* * prevent changing our mempolicy while show_numa_maps() * is using it. 
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, if (mm) down_write(&mm->mmap_sem); task_lock(current); - ret = mpol_set_nodemask(new, nodes); + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); if (mm) up_write(&mm->mmap_sem); mpol_put(new); - return ret; + goto out; } old = current->mempolicy; current->mempolicy = new; @@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, up_write(&mm->mmap_sem); mpol_put(old); - return 0; + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; } /* @@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) return err; } - down_write(&mm->mmap_sem); - task_lock(current); - err = mpol_set_nodemask(new, nmask); - task_unlock(current); + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + } if (err) { - up_write(&mm->mmap_sem); mpol_put(new); return err; } @@ -1891,6 +1911,7 @@ restart: * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + if (!scratch) + return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) { mpol_put(mpol); /* drop our ref on sb mpol */ + NODEMASK_SCRATCH_FREE(scratch); return; /* no valid nodemask intersection */ } task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); mpol_put(mpol); /* drop our ref on sb mpol */ if (ret) { + NODEMASK_SCRATCH_FREE(scratch); mpol_put(new); return; } @@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ mpol_put(new); /* drop initial ref */ + NODEMASK_SCRATCH_FREE(scratch); } } @@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) err = 1; else { int ret; - - task_lock(current); - ret = mpol_set_nodemask(new, &nodes); - task_unlock(current); - if (ret) + NODEMASK_SCRATCH(scratch); + if (scratch) { + task_lock(current); + ret = mpol_set_nodemask(new, &nodes, scratch); + task_unlock(current); + } else + ret = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + if (ret) { err = 1; - else if (no_context) { + mpol_put(new); + } else if (no_context) { /* save for contextualization */ new->w.user_nodemask = nodes; } diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b4bb6..32e75d40050 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab); */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t)(long)pool_data; + size_t size = (size_t)pool_data; return kmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void *mempool_kzalloc(gfp_t 
gfp_mask, void *pool_data) { - size_t size = (size_t) pool_data; + size_t size = (size_t)pool_data; return kzalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kzalloc); diff --git a/mm/mmap.c b/mm/mmap.c index 34579b23ebd..8101de490c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/nommu.c b/mm/nommu.c index 2fd2ad5da98..66e81e7e9fe 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); @@ -173,8 +170,8 @@ unsigned int kobjsize(const void *objp) } int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, int flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -189,7 +186,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - for (i = 0; i < len; i++) { + for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) goto finish_or_fault; @@ -224,7 +221,7 @@ finish_or_fault: * - don't permit access to VMAs that don't support it, such as I/O mappings */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = 0; @@ -234,12 +231,31 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= GUP_FLAGS_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } EXPORT_SYMBOL(get_user_pages); +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + *pfn = address >> PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(follow_pfn); + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; @@ -903,6 +919,10 @@ static int validate_mmap_request(struct file *file, if (!file->f_op->read) capabilities &= ~BDI_CAP_MAP_COPY; + /* The file shall have been opened with read permission. 
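follow_pfn(), added to mm/nommu.c above, gives drivers a sanctioned way to look up the PFN behind a user virtual address in an IO or raw-PFN mapping. A minimal, hypothetical use (vaddr is an assumed user address inside vma; locking and error handling are trimmed):

	unsigned long pfn;

	if (follow_pfn(vma, vaddr, &pfn) == 0)
		pr_info("vaddr %#lx -> pfn %#lx\n", vaddr, pfn);
	else
		pr_info("vaddr %#lx is not in an IO/PFN mapping\n", vaddr);

On nommu the translation is just the address >> PAGE_SHIFT shown above; the value of the helper is that callers no longer open-code that assumption.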
*/ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (flags & MAP_SHARED) { /* do checks for writing, appending and locking */ if ((prot & PROT_WRITE) && @@ -1332,6 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file, } vma->vm_region = region; + add_nommu_region(region); /* set up the mapping */ if (file && vma->vm_flags & VM_SHARED) @@ -1341,8 +1362,6 @@ unsigned long do_mmap_pgoff(struct file *file, if (ret < 0) goto error_put_region; - add_nommu_region(region); - /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a9..a7b2460e922 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; - int oom_adj; task_lock(p); mm = p->mm; @@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } /* * The memory size of the process is the basis for the badness. @@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oom_adj. + * Adjust the score by oomkilladj. */ - if (oom_adj) { - if (oom_adj > 0) { + if (p->oomkilladj) { + if (p->oomkilladj > 0) { if (!points) points = 1; - points <<= oom_adj; + points <<= p->oomkilladj; } else - points >>= -(oom_adj); + points >>= -(p->oomkilladj); } #ifdef DEBUG @@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } + if (p->oomkilladj == OOM_DISABLE) + continue; + points = badness(p, uptime.tv_sec); - if (points > *ppoints) { + if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } @@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); return; + } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; - task_lock(p); mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. 
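To make the badness() arithmetic above concrete: with the per-task knob restored, a task whose raw score works out to 1000 points ends up at 1000 << 4 = 16000 with oomkilladj = +4 and at 1000 >> 4 = 62 with oomkilladj = -4, so the adjustment is exponential rather than additive, and tasks marked OOM_DISABLE are now skipped in select_bad_process() before badness() is consulted at all.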
+ */ + + if (mm == NULL) return 1; - } - task_unlock(p); + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + __oom_kill_task(p, 1); /* @@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - task_lock(current); printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { + if (p->flags & PF_EXITING) { __oom_kill_task(p, 0); return 0; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935..81627ebcd31 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -541,8 +541,11 @@ static void balance_dirty_pages(struct address_space *mapping) * filesystems (i.e. NFS) in which data may have been * written to the server's write cache, but has not yet * been flushed to permanent storage. + * Only move pages to writeback if this bdi is over its + * threshold otherwise wait until the disk writes catch + * up. */ - if (bdi_nr_reclaimable) { + if (bdi_nr_reclaimable > bdi_thresh) { writeback_inodes(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, @@ -572,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -666,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The caller might hold locks which can prevent IO completion @@ -712,7 +715,7 @@ static void background_writeout(unsigned long _min_pages) if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); else break; } @@ -784,7 +787,7 @@ static void wb_kupdate(unsigned long arg) writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); else break; /* All the old data is written */ } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a5f3c278c57..a0de15f4698 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -73,6 +73,7 @@ unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; +gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; @@ -487,7 +488,6 @@ static inline void 
__free_one_page(struct page *page, */ static inline void free_page_mlock(struct page *page) { - __ClearPageMlocked(page); __dec_zone_page_state(page, NR_MLOCK); __count_vm_event(UNEVICTABLE_MLOCKFREED); } @@ -557,7 +557,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; - int clearMlocked = PageMlocked(page); + int wasMlocked = TestClearPageMlocked(page); kmemcheck_free_shadow(page, order); @@ -575,7 +575,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); - if (unlikely(clearMlocked)) + if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order, @@ -817,13 +817,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * agressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE) { + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { unsigned long pages; pages = move_freepages_block(zone, page, start_migratetype); /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1))) + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_migratetype); @@ -882,7 +884,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, int cold) { int i; @@ -901,7 +903,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - list_add(&page->lru, list); + if (likely(cold == 0)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); set_page_private(page, migratetype); list = &page->lru; } @@ -1021,7 +1026,7 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; - int clearMlocked = PageMlocked(page); + int wasMlocked = TestClearPageMlocked(page); kmemcheck_free_shadow(page, 0); @@ -1040,7 +1045,7 @@ static void free_hot_cold_page(struct page *page, int cold) pcp = &zone_pcp(zone, get_cpu())->pcp; set_page_private(page, get_pageblock_migratetype(page)); local_irq_save(flags); - if (unlikely(clearMlocked)) + if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_event(PGFREE); @@ -1119,7 +1124,8 @@ again: local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); if (unlikely(!pcp->count)) goto failed; } @@ -1138,7 +1144,8 @@ again: /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); page = list_entry(pcp->list.next, struct page, lru); } @@ -1153,10 +1160,10 @@ again: * properly detect and handle allocation failures. * * We most definitely don't want callers attempting to - * allocate greater than single-page units with + * allocate greater than order-1 page units with * __GFP_NOFAIL. 
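The buffered_rmqueue() comment ending here pairs with the check relaxed just below, where WARN_ON_ONCE(order > 0) becomes WARN_ON_ONCE(order > 1). For illustration, assuming an ordinary GFP_KERNEL caller:

	struct page *page;

	page = alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 1);	/* order-1: tolerated silently now */
	page = alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 2);	/* order-2 and above: still warns once */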
*/ - WARN_ON_ONCE(order > 0); + WARN_ON_ONCE(order > 1); } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); @@ -1666,7 +1673,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1740,8 +1747,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * be using allocators in order of preference for an area that is * too large. */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); return NULL; + } /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1789,6 +1798,10 @@ rebalance: if (p->flags & PF_MEMALLOC) goto nopage; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -1831,7 +1844,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -1863,6 +1876,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page; int migratetype = allocflags_to_migratetype(gfp_mask); + gfp_mask &= gfp_allowed_mask; + lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_WAIT); @@ -1981,7 +1996,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); - split_page(virt_to_page(addr), order); + split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; @@ -2531,7 +2546,6 @@ static void build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); - memset(node_load, 0, sizeof(node_load)); memset(node_order, 0, sizeof(node_order)); j = 0; @@ -2640,6 +2654,9 @@ static int __build_all_zonelists(void *dummy) { int nid; +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -3024,7 +3041,7 @@ bad: if (dzone == zone) break; kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = NULL; + zone_pcp(dzone, cpu) = &boot_pageset[cpu]; } return -ENOMEM; } @@ -3039,7 +3056,7 @@ static inline void free_zone_pagesets(int cpu) /* Free per_cpu_pageset if it is slab allocated */ if (pset != &boot_pageset[cpu]) kfree(pset); - zone_pcp(zone, cpu) = NULL; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; } } @@ -4030,6 +4047,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); @@ -4057,7 +4076,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) - return; + goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 
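gfp_allowed_mask, declared near the top of this page_alloc.c diff and applied in __alloc_pages_nodemask() above via gfp_mask &= gfp_allowed_mask, takes over the job of the per-allocator slab_gfp_mask removed from slab.c and slub.c further down. A sketch of the effect, on the assumption that GFP_BOOT_MASK clears the bits that allow sleeping and IO while interrupts are still disabled:

	/* early boot: gfp_allowed_mask is still GFP_BOOT_MASK */
	gfp_t mask = GFP_KERNEL;	/* __GFP_WAIT | __GFP_IO | __GFP_FS */

	mask &= gfp_allowed_mask;	/* the sleeping/IO bits are silently dropped */

Once boot re-enables interrupts and widens gfp_allowed_mask to the full __GFP_BITS_MASK (not part of the hunks shown here), the same masking becomes a no-op, so every allocator gets the early-boot protection from one place.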
find_usable_zone_for_movable(); @@ -4156,6 +4175,10 @@ restart: for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_HIGH_MEMORY] = saved_node_state; } /* Any regular memory on that node ? */ @@ -4240,11 +4263,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].start_pfn, early_node_map[i].end_pfn); - /* - * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init - * that node_mask, clear it at first - */ - nodes_clear(node_states[N_HIGH_MEMORY]); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); @@ -4657,7 +4675,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; - for_each_zone(zone) { + for_each_populated_zone(zone) { for_each_online_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; @@ -4742,8 +4760,10 @@ void *__init alloc_large_system_hash(const char *tablename, * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) + if (get_order(size) < MAX_ORDER) { table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -4761,16 +4781,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; - /* - * If hashdist is set, the table allocation is done with __vmalloc() - * which invokes the kmemleak_alloc() callback. This function may also - * be called before the slab and kmemleak are initialised when - * kmemleak simply buffers the request to be executed later - * (GFP_ATOMIC flag ignored in this case). 
- */ - if (!hashdist) - kmemleak_alloc(table, size, 1, GFP_ATOMIC); - return table; } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 11a8a10a390..f22b4ebbd8d 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -83,12 +83,12 @@ void __init page_cgroup_init_flatmem(void) goto fail; } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try cgroup_disable=memory option if you" - " don't want\n"); + printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" + " don't want memory cgroups\n"); return; fail: - printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); - printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); + printk(KERN_CRIT "allocation of page_cgroup failed.\n"); + printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); panic("Out of memory"); } @@ -99,6 +99,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); + if (!section->page_cgroup) + return NULL; return section->page_cgroup + pfn; } @@ -252,14 +254,14 @@ void __init page_cgroup_init(void) fail = init_section_page_cgroup(pfn); } if (fail) { - printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); + printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); panic("Out of memory"); } else { hotplug_memory_notifier(page_cgroup_callback, 0); } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try cgroup_disable=memory option if you don't" - " want\n"); + printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" + " want memory cgroups\n"); } void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) @@ -309,8 +311,6 @@ static int swap_cgroup_prepare(int type) struct swap_cgroup_ctrl *ctrl; unsigned long idx, max; - if (!do_swap_account) - return 0; ctrl = &swap_cgroup_ctrl[type]; for (idx = 0; idx < ctrl->length; idx++) { @@ -347,9 +347,6 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) struct swap_cgroup *sc; unsigned short old; - if (!do_swap_account) - return 0; - ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; @@ -378,9 +375,6 @@ unsigned short lookup_swap_cgroup(swp_entry_t ent) struct swap_cgroup *sc; unsigned short ret; - if (!do_swap_account) - return 0; - ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); diff --git a/mm/percpu.c b/mm/percpu.c index c0b2c1a76e8..3311c8919f3 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,12 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of nr_cpu_ids units and the first chunk is used + * for static percpu variables in the kernel image (special boot time + * alloc/init handling necessary as these areas need to be brought up + * before allocation services are running). Unit grows as necessary + * and all units grow or shrink in unison. When a chunk is filled up, + * another chunk is allocated. ie. 
in vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -197,7 +197,12 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, int page_idx) { - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; + /* + * Any possible cpu id can be used here, so there's no need to + * worry about preemption or cpu hotplug. + */ + return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(), + page_idx) != NULL; } /* set the pointer to a chunk in a page struct */ @@ -297,6 +302,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_first_chunk; } + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += raw_smp_processor_id() * pcpu_unit_size; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -549,16 +562,16 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) * @chunk: chunk of interest * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 - * @flush: whether to flush cache and tlb or not + * @flush_tlb: whether to flush tlb or not * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * If @flush is true, vcache is flushed before unmapping and tlb * after. */ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush) + bool flush_tlb) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; /* unmap must not be done on immutable chunk */ @@ -569,9 +582,8 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * the whole region at once rather than doing it for each cpu. * This could be an overkill but is more scalable. 
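The pcpu_chunk_addr_search() change above copes with unit 0 of a chunk possibly being unused and therefore unmapped. A sketch of the translation it performs before the vmalloc-area lookup (local names are illustrative):

	/* addr is relative to unit 0; shift it into the current CPU's unit,
	 * which is guaranteed to be mapped, then ask the vmalloc layer. */
	void *probe = addr + raw_smp_processor_id() * pcpu_unit_size;
	struct page *page = vmalloc_to_page(probe);

As the new comments note, any possible CPU's unit would serve here, which is why raw_smp_processor_id() is acceptable without disabling preemption or worrying about hotplug.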
*/ - if (flush) - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); for_each_possible_cpu(cpu) unmap_kernel_range_noflush( @@ -579,7 +591,7 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, (page_end - page_start) << PAGE_SHIFT); /* ditto as flush_cache_vunmap() */ - if (flush) + if (flush_tlb) flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), pcpu_chunk_addr(chunk, last, page_end)); } @@ -644,7 +656,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; int err; @@ -750,7 +762,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); if (!chunk->vm) { free_pcpu_chunk(chunk); return NULL; @@ -1068,9 +1080,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1234,6 +1246,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size) { + size_t chunk_size; unsigned int cpu; /* determine parameters and allocate */ @@ -1248,19 +1261,26 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } else pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - pcpue_ptr = __alloc_bootmem_nopanic( - num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) + chunk_size = pcpue_unit_size * nr_cpu_ids; + + pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); + if (!pcpue_ptr) { + pr_warning("PERCPU: failed to allocate %zu bytes for " + "embedding\n", chunk_size); return -ENOMEM; + } /* return the leftover and copy */ - for_each_possible_cpu(cpu) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); + if (cpu_possible(cpu)) { + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } else + free_bootmem(__pa(ptr), pcpue_unit_size); } /* we're ready, commit */ diff --git a/mm/rmap.c b/mm/rmap.c index c9ccc1a72dc..0895b5c7cbf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page, */ if (vma->vm_flags & VM_LOCKED) { *mapcount = 1; /* break early from loop */ + *vm_flags |= VM_LOCKED; goto out_unmap; } @@ -703,8 +704,10 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - if (atomic_inc_and_test(&page->_mapcount)) + if 
(atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_update_mapped_file_stat(page, 1); + } } #ifdef CONFIG_DEBUG_VM @@ -753,6 +756,7 @@ void page_remove_rmap(struct page *page) mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + mem_cgroup_update_mapped_file_stat(page, -1); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap diff --git a/mm/shmem.c b/mm/shmem.c index e89d7ec18ed..d713239ce2c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1558,6 +1558,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); + cache_no_acl(inode); switch (mode & S_IFMT) { default: @@ -2388,7 +2389,6 @@ static void shmem_destroy_inode(struct inode *inode) /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - shmem_acl_destroy_inode(inode); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -2397,10 +2397,6 @@ static void init_once(void *foo) struct shmem_inode_info *p = (struct shmem_inode_info *) foo; inode_init_once(&p->vfs_inode); -#ifdef CONFIG_TMPFS_POSIX_ACL - p->i_acl = NULL; - p->i_default_acl = NULL; -#endif } static int init_inodecache(void) diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 8e5aadd7dcd..606a8e757a4 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type) spin_lock(&inode->i_lock); switch(type) { case ACL_TYPE_ACCESS: - acl = posix_acl_dup(SHMEM_I(inode)->i_acl); + acl = posix_acl_dup(inode->i_acl); break; case ACL_TYPE_DEFAULT: - acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); + acl = posix_acl_dup(inode->i_default_acl); break; } spin_unlock(&inode->i_lock); @@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) spin_lock(&inode->i_lock); switch(type) { case ACL_TYPE_ACCESS: - free = SHMEM_I(inode)->i_acl; - SHMEM_I(inode)->i_acl = posix_acl_dup(acl); + free = inode->i_acl; + inode->i_acl = posix_acl_dup(acl); break; case ACL_TYPE_DEFAULT: - free = SHMEM_I(inode)->i_default_acl; - SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); + free = inode->i_default_acl; + inode->i_default_acl = posix_acl_dup(acl); break; } spin_unlock(&inode->i_lock); @@ -155,23 +155,6 @@ shmem_acl_init(struct inode *inode, struct inode *dir) } /** - * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode - * - * This is done before destroying the actual inode. - */ - -void -shmem_acl_destroy_inode(struct inode *inode) -{ - if (SHMEM_I(inode)->i_acl) - posix_acl_release(SHMEM_I(inode)->i_acl); - SHMEM_I(inode)->i_acl = NULL; - if (SHMEM_I(inode)->i_default_acl) - posix_acl_release(SHMEM_I(inode)->i_default_acl); - SHMEM_I(inode)->i_default_acl = NULL; -} - -/** * shmem_check_acl - check_acl() callback for generic_permission() */ static int diff --git a/mm/slab.c b/mm/slab.c index f257d4dd474..7b5d4deacfc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -305,12 +305,6 @@ struct kmem_list3 { }; /* - * The slab allocator is initialized with interrupts disabled. Therefore, make - * sure early boot allocations don't accidentally enable interrupts. - */ -static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; - -/* * Need this for bootstrapping a per node allocator. 
*/ #define NUM_INIT_LISTS (3 * MAX_NUMNODES) @@ -1550,20 +1544,12 @@ void __init kmem_cache_init(void) } g_cpucache_up = EARLY; - - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); } void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - /* - * Interrupts are enabled now so all GFP allocations are safe. - */ - slab_gfp_mask = __GFP_BITS_MASK; - /* 6) resize the head arrays to their final sizes */ mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) @@ -1574,6 +1560,9 @@ void __init kmem_cache_init_late(void) /* Done! */ g_cpucache_up = FULL; + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* * Register a cpu startup notifier callback that initializes * cpu_cache_get for all new cpus @@ -2308,6 +2297,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* really off slab. No need for manual alignment */ slab_size = cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + +#ifdef CONFIG_PAGE_POISONING + /* If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif } cachep->colour_off = cache_line_size(); @@ -2549,7 +2547,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) } if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) - synchronize_rcu(); + rcu_barrier(); __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); @@ -3298,7 +3296,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; - flags &= slab_gfp_mask; + flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); @@ -3383,7 +3381,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; - flags &= slab_gfp_mask; + flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); diff --git a/mm/slob.c b/mm/slob.c index 64f6db1943b..9641da3d5e5 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -133,17 +133,17 @@ static LIST_HEAD(free_slob_large); */ static inline int is_slob_page(struct slob_page *sp) { - return PageSlobPage((struct page *)sp); + return PageSlab((struct page *)sp); } static inline void set_slob_page(struct slob_page *sp) { - __SetPageSlobPage((struct page *)sp); + __SetPageSlab((struct page *)sp); } static inline void clear_slob_page(struct slob_page *sp) { - __ClearPageSlobPage((struct page *)sp); + __ClearPageSlab((struct page *)sp); } static inline struct slob_page *slob_page(const void *addr) @@ -595,6 +595,8 @@ EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *c) { kmemleak_free(c); + if (c->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); slob_free(c, sizeof(struct kmem_cache)); } EXPORT_SYMBOL(kmem_cache_destroy); diff --git a/mm/slub.c b/mm/slub.c index 2701419b0ad..b6276753626 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -21,7 +21,6 @@ #include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> -#include <linux/kmemleak.h> #include <linux/mempolicy.h> #include <linux/ctype.h> #include <linux/debugobjects.h> @@ -179,12 +178,6 @@ static enum { SYSFS /* Sysfs up */ } slab_state = DOWN; -/* - * The slab allocator is initialized with interrupts disabled. Therefore, make - * sure early boot allocations don't accidentally enable interrupts. 
- */ -static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK; - /* A list of all slab caches on the system */ static DECLARE_RWSEM(slub_lock); static LIST_HEAD(slab_caches); @@ -840,6 +833,11 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node) return atomic_long_read(&n->nr_slabs); } +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->nr_slabs); +} + static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); @@ -1058,6 +1056,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, static inline unsigned long slabs_node(struct kmem_cache *s, int node) { return 0; } +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) + { return 0; } static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, @@ -1084,11 +1084,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; flags |= s->allocflags; - page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, - oo); + /* + * Let the initial higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation. + */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + + page = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; /* @@ -1514,6 +1520,65 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) return 1; } +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} + +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += get_count(page); + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ +#ifdef CONFIG_SLUB_DEBUG + return atomic_long_read(&n->total_objects); +#else + return 0; +#endif +} + +static noinline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) +{ + int node; + + printk(KERN_WARNING + "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + nid, gfpflags); + printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " + "default order: %d, min order: %d\n", s->name, s->objsize, + s->size, oo_order(s->oo), oo_order(s->min)); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long nr_slabs; + unsigned long nr_objs; + unsigned long nr_free; + + if (!n) + continue; + + nr_free = count_partial(n, count_free); + nr_slabs = node_nr_slabs(n); + nr_objs = node_nr_objs(n); + + printk(KERN_WARNING + " node %d: slabs: %ld, objs: %ld, free: %ld\n", + node, nr_slabs, nr_objs, nr_free); + } +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. 
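allocate_slab() above now makes its first, higher-order attempt with __GFP_NOWARN | __GFP_NORETRY added and __GFP_NOFAIL masked off, so that attempt is allowed to fail quietly under memory pressure. A sketch of the overall shape (the fallback branch sits outside the hunk shown, so treat this as the intent rather than the exact surrounding code):

	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
	page = alloc_slab_page(alloc_gfp, node, oo);		/* preferred order, may fail */
	if (unlikely(!page)) {
		oo = s->min;
		page = alloc_slab_page(flags, node, oo);	/* minimum order, caller's real flags */
	}

If even the minimum order cannot be satisfied, the rate-limited slab_out_of_memory() report defined above is emitted from __slab_alloc() (its call site appears in the next hunk) unless the caller passed __GFP_NOWARN.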
@@ -1595,6 +1660,8 @@ new_slab: c->page = new; goto load_freelist; } + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) @@ -1624,7 +1691,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; - gfpflags &= slab_gfp_mask; + gfpflags &= gfp_allowed_mask; lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); @@ -2537,6 +2604,8 @@ void kmem_cache_destroy(struct kmem_cache *s) "still has objects.\n", s->name, __func__); dump_stack(); } + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); sysfs_slab_remove(s); } else up_write(&slub_lock); @@ -2636,6 +2705,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) struct kmem_cache *s; char *text; size_t realsize; + unsigned long slabflags; s = kmalloc_caches_dma[index]; if (s) @@ -2657,10 +2727,18 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) (unsigned int)realsize); s = kmalloc(kmem_size, flags & ~SLUB_DMA); + /* + * Must defer sysfs creation to a workqueue because we don't know + * what context we are called from. Before sysfs comes up, we don't + * need to do anything because our sysfs initcall will start by + * adding all existing slabs to sysfs. + */ + slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; + if (slab_state >= SYSFS) + slabflags |= __SYSFS_ADD_DEFERRED; + if (!s || !text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, - SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED, - NULL)) { + realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { kfree(s); kfree(text); goto unlock_out; @@ -2669,7 +2747,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) list_add(&s->list, &slab_caches); kmalloc_caches_dma[index] = s; - schedule_work(&sysfs_add_work); + if (slab_state >= SYSFS) + schedule_work(&sysfs_add_work); unlock_out: up_write(&slub_lock); @@ -2755,13 +2834,15 @@ EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; + void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; page = alloc_pages_node(node, flags, get_order(size)); if (page) - return page_address(page); - else - return NULL; + ptr = page_address(page); + + kmemleak_alloc(ptr, size, 1, flags); + return ptr; } #ifdef CONFIG_NUMA @@ -2846,6 +2927,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); + kmemleak_free(x); put_page(page); return; } @@ -3142,10 +3224,6 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { - /* - * Interrupts are enabled now so all GFP allocations are safe. 
- */ - slab_gfp_mask = __GFP_BITS_MASK; } /* @@ -3368,20 +3446,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, } #ifdef CONFIG_SLUB_DEBUG -static unsigned long count_partial(struct kmem_cache_node *n, - int (*get_count)(struct page *)) -{ - unsigned long flags; - unsigned long x = 0; - struct page *page; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += get_count(page); - spin_unlock_irqrestore(&n->list_lock, flags); - return x; -} - static int count_inuse(struct page *page) { return page->inuse; @@ -3392,11 +3456,6 @@ static int count_total(struct page *page) return page->objects; } -static int count_free(struct page *page) -{ - return page->objects - page->inuse; -} - static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 28faa01cf57..8ffdc0d23c5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -583,8 +583,9 @@ static int swap_entry_free(struct swap_info_struct *p, swap_list.next = p - swap_info; nr_swap_pages++; p->inuse_pages--; - mem_cgroup_uncharge_swap(ent); } + if (!swap_count(count)) + mem_cgroup_uncharge_swap(ent); return count; } @@ -609,12 +610,19 @@ void swap_free(swp_entry_t entry) void swapcache_free(swp_entry_t entry, struct page *page) { struct swap_info_struct *p; + int ret; - if (page) - mem_cgroup_uncharge_swapcache(page, entry); p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry, SWAP_CACHE); + ret = swap_entry_free(p, entry, SWAP_CACHE); + if (page) { + bool swapout; + if (ret) + swapout = true; /* the end of swap out */ + else + swapout = false; /* no more swap users! */ + mem_cgroup_uncharge_swapcache(page, entry, swapout); + } spin_unlock(&swap_lock); } return; @@ -745,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) if (!bdev) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); return i; @@ -757,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); diff --git a/mm/thrash.c b/mm/thrash.c index c4c5205a9c3..2372d4ed5dd 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; static unsigned int global_faults; -void grab_swap_token(void) +void grab_swap_token(struct mm_struct *mm) { int current_interval; global_faults++; - current_interval = global_faults - current->mm->faultstamp; + current_interval = global_faults - mm->faultstamp; if (!spin_trylock(&swap_token_lock)) return; /* First come first served */ if (swap_token_mm == NULL) { - current->mm->token_priority = current->mm->token_priority + 2; - swap_token_mm = current->mm; + mm->token_priority = mm->token_priority + 2; + swap_token_mm = mm; goto out; } - if (current->mm != swap_token_mm) { - if (current_interval < current->mm->last_interval) - current->mm->token_priority++; + if (mm != swap_token_mm) { + if (current_interval < mm->last_interval) + mm->token_priority++; else { - if (likely(current->mm->token_priority > 0)) - current->mm->token_priority--; + if (likely(mm->token_priority > 0)) + mm->token_priority--; } /* Check if we deserve the token */ - if (current->mm->token_priority > - swap_token_mm->token_priority) { - current->mm->token_priority += 2; - 
swap_token_mm = current->mm; + if (mm->token_priority > swap_token_mm->token_priority) { + mm->token_priority += 2; + swap_token_mm = mm; } } else { /* Token holder came in again! */ - current->mm->token_priority += 2; + mm->token_priority += 2; } out: - current->mm->faultstamp = global_faults; - current->mm->last_interval = current_interval; + mm->faultstamp = global_faults; + mm->last_interval = current_interval; spin_unlock(&swap_token_lock); -return; } /* Called on process exit. */ diff --git a/mm/util.c b/mm/util.c index d5d2213728c..7c35ad95f92 100644 --- a/mm/util.c +++ b/mm/util.c @@ -168,6 +168,10 @@ EXPORT_SYMBOL(krealloc); * * The memory of the object @p points to is zeroed before freed. * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. */ void kzfree(const void *p) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 4139aa52b94..94e86dd6954 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, referenced = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); - /* In active use or really unfreeable? Activate it. */ + /* + * In active use or really unfreeable? Activate it. + * If page which have PG_mlocked lost isoltation race, + * try_to_unmap moves it to unevictable list + */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + referenced && page_mapping_inuse(page) + && !(vm_flags & VM_LOCKED)) goto activate_locked; /* @@ -837,7 +842,6 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ ClearPageLRU(page); ret = 0; - mem_cgroup_del_lru(page); } return ret; @@ -885,12 +889,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); + mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -931,6 +937,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); + mem_cgroup_del_lru(cursor_page); nr_taken++; scan++; } @@ -1102,7 +1109,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, */ if (nr_freed < nr_taken && !current_is_kswapd() && lumpy_reclaim) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The attempt at page out may have made some @@ -1719,7 +1726,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scanning_global_lru(sc)) @@ -1958,7 +1965,7 @@ loop_again: * another pass across the zones. 
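The kzfree() note added to mm/util.c above is easiest to see with numbers (buffer name and sizes are hypothetical):

	char *secret = kmalloc(100, GFP_KERNEL);	/* slab may hand back e.g. a 128-byte object */
	/* ... use and finish with the buffer ... */
	kzfree(secret);		/* zeroes the whole underlying object, then frees it */

The memset cost therefore tracks the allocated object size rather than the 100 bytes requested, which is exactly the performance caveat the new comment warns about.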
*/ if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * We do this so kswapd doesn't build up large priorities for @@ -2231,7 +2238,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); } } |
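Throughout the page-writeback.c, page_alloc.c and vmscan.c hunks above, congestion_wait() callers switch from the old READ/WRITE constants to an explicit sync/async selector. Illustrative calls under the new convention (the second line is a hypothetical caller, not one from this diff):

	congestion_wait(BLK_RW_ASYNC, HZ/10);	/* throttle behind async writeback, as in the hunks above */
	congestion_wait(BLK_RW_SYNC, HZ/50);	/* a caller genuinely waiting on synchronous IO congestion */

Naming the wait queue by IO direction (sync vs. async) rather than by READ/WRITE removes the ambiguity these conversions are fixing.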