Diffstat (limited to 'mm')
 mm/Kconfig          |   13
 mm/bootmem.c        |   24
 mm/compaction.c     |  124
 mm/highmem.c        |    1
 mm/huge_memory.c    |   21
 mm/hugetlb.c        |   11
 mm/hugetlb_cgroup.c |   19
 mm/internal.h       |    1
 mm/kmemleak.c       |    3
 mm/ksm.c            |   16
 mm/memblock.c       |    3
 mm/memcontrol.c     | 1256
 mm/memory.c         |   13
 mm/memory_hotplug.c |   18
 mm/mempolicy.c      |  130
 mm/migrate.c        |   14
 mm/mmap.c           |    2
 mm/mprotect.c       |   30
 mm/page-writeback.c |   25
 mm/page_alloc.c     |  113
 mm/page_isolation.c |   26
 mm/shmem.c          |    4
 mm/slab.c           |  383
 mm/slab.h           |  190
 mm/slab_common.c    |  292
 mm/slob.c           |   48
 mm/slub.c           |  447
 mm/truncate.c       |   23
 mm/vmscan.c         |  133
 29 files changed, 2407 insertions(+), 976 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index 71259e052ce..278e3ab1f16 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -149,7 +149,18 @@ config MOVABLE_NODE depends on NO_BOOTMEM depends on X86_64 depends on NUMA - depends on BROKEN + default n + help + Allow a node to have only movable memory. Pages used by the kernel, + such as direct mapping pages cannot be migrated. So the corresponding + memory device cannot be hotplugged. This option allows users to + online all the memory of a node as movable memory so that the whole + node can be hotplugged. Users who don't use the memory hotplug + feature are fine with this option on since they don't online memory + as movable. + + Say Y here if you want to hotplug a whole node. + Say N here if you want kernel to use memory on all nodes evenly. # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG diff --git a/mm/bootmem.c b/mm/bootmem.c index 1324cd74fae..b93376c39b6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) while (start < end) { unsigned long *map, idx, vec; + unsigned shift; map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; + shift = idx & (BITS_PER_LONG - 1); + /* + * vec holds at most BITS_PER_LONG map bits, + * bit 0 corresponds to start. + */ vec = ~map[idx / BITS_PER_LONG]; + + if (shift) { + vec >>= shift; + if (end - start >= BITS_PER_LONG) + vec |= ~map[idx / BITS_PER_LONG + 1] << + (BITS_PER_LONG - shift); + } /* * If we have a properly aligned and fully unreserved * BITS_PER_LONG block of pages in front of us, free @@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long off = 0; + unsigned long cur = start; - vec >>= start & (BITS_PER_LONG - 1); - while (vec) { + start = ALIGN(start + 1, BITS_PER_LONG); + while (vec && cur != start) { if (vec & 1) { - page = pfn_to_page(start + off); + page = pfn_to_page(cur); __free_pages_bootmem(page, 0); count++; } vec >>= 1; - off++; + ++cur; } - start = ALIGN(start + 1, BITS_PER_LONG); } } diff --git a/mm/compaction.c b/mm/compaction.c index 5ad7f4f4d6f..c62bd063d76 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -17,6 +17,21 @@ #include <linux/balloon_compaction.h> #include "internal.h" +#ifdef CONFIG_COMPACTION +static inline void count_compact_event(enum vm_event_item item) +{ + count_vm_event(item); +} + +static inline void count_compact_events(enum vm_event_item item, long delta) +{ + count_vm_events(item, delta); +} +#else +#define count_compact_event(item) do { } while (0) +#define count_compact_events(item, delta) do { } while (0) +#endif + #if defined CONFIG_COMPACTION || defined CONFIG_CMA #define CREATE_TRACE_POINTS @@ -303,10 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); - count_vm_events(COMPACTFREE_SCANNED, nr_scanned); + count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) - count_vm_events(COMPACTISOLATED, total_isolated); - + count_compact_events(COMPACTISOLATED, total_isolated); return total_isolated; } @@ -613,9 +627,9 @@ next_pageblock: trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); - count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); + count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); if (nr_isolated) - count_vm_events(COMPACTISOLATED, nr_isolated); + 
count_compact_events(COMPACTISOLATED, nr_isolated); return low_pfn; } @@ -802,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { + unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -836,22 +851,16 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - if (cc->page) { - /* Was a suitable page captured? */ - if (*cc->page) + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; - } else { - unsigned int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; - /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) - return COMPACT_PARTIAL; - } } return COMPACT_CONTINUE; @@ -907,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_CONTINUE; } -static void compact_capture_page(struct compact_control *cc) -{ - unsigned long flags; - int mtype, mtype_low, mtype_high; - - if (!cc->page || *cc->page) - return; - - /* - * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP - * regardless of the migratetype of the freelist is is captured from. - * This is fine because the order for a high-order MIGRATE_MOVABLE - * allocation is typically at least a pageblock size and overall - * fragmentation is not impaired. Other allocation types must - * capture pages from their own migratelist because otherwise they - * could pollute other pageblocks like MIGRATE_MOVABLE with - * difficult to move pages and making fragmentation worse overall. 
- */ - if (cc->migratetype == MIGRATE_MOVABLE) { - mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; - } else { - mtype_low = cc->migratetype; - mtype_high = cc->migratetype + 1; - } - - /* Speculatively examine the free lists without zone lock */ - for (mtype = mtype_low; mtype < mtype_high; mtype++) { - int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct page *page; - struct free_area *area; - area = &(cc->zone->free_area[order]); - if (list_empty(&area->free_list[mtype])) - continue; - - /* Take the lock and attempt capture of the page */ - if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) - return; - if (!list_empty(&area->free_list[mtype])) { - page = list_entry(area->free_list[mtype].next, - struct page, lru); - if (capture_free_page(page, cc->order, mtype)) { - spin_unlock_irqrestore(&cc->zone->lock, - flags); - *cc->page = page; - return; - } - } - spin_unlock_irqrestore(&cc->zone->lock, flags); - } - } -} - static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1040,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } - - /* Capture a page now if it is a suitable size */ - compact_capture_page(cc); } out: @@ -1055,8 +1007,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended, - struct page **page) + bool sync, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1066,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1096,7 +1046,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1110,7 +1060,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, if (!order || !may_enter_fs || !may_perform_io) return rc; - count_vm_event(COMPACTSTALL); + count_compact_event(COMPACTSTALL); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) @@ -1122,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended, page); + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -1178,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, - .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -1189,14 +1138,13 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, - .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); } /* Compact all nodes in the system */ -static int compact_nodes(void) +static void compact_nodes(void) { int nid; @@ -1205,8 +1153,6 @@ static int compact_nodes(void) for_each_online_node(nid) compact_node(nid); - - return COMPACT_COMPLETE; } /* The written value is actually unused, all memory is compacted */ @@ -1217,7 +1163,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { if (write) - return compact_nodes(); + compact_nodes(); return 0; } diff --git a/mm/highmem.c 
b/mm/highmem.c index d999077431d..b32b70cdaed 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -105,6 +105,7 @@ struct page *kmap_to_page(void *vaddr) return virt_to_page(addr); } +EXPORT_SYMBOL(kmap_to_page); static void flush_all_zero_pkmaps(void) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 32754eece63..6001ee6347a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -574,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed kobject create\n"); + printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); + printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); + printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); goto remove_hp_group; } @@ -1819,9 +1819,19 @@ int split_huge_page(struct page *page) BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma_read(page); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first we take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations. + */ + anon_vma = page_get_anon_vma(page); if (!anon_vma) goto out; + anon_vma_lock_write(anon_vma); + ret = 0; if (!PageCompound(page)) goto out_unlock; @@ -1832,7 +1842,8 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma_read(anon_vma); + anon_vma_unlock(anon_vma); + put_anon_vma(anon_vma); out: return ret; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e5318c7793a..4f3ea0b1e57 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); - gather_bootmem_prealloc(); - report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); + hugetlb_cgroup_file_init(); return 0; } @@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order) h->next_nid_to_free = first_node(node_states[N_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgoup details. 
- */ - if (order >= HUGETLB_CGROUP_MIN_ORDER) - hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); parsed_hstate = h; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b5bde7a5c01..9cea7de22ff 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -int __init hugetlb_cgroup_file_init(int idx) +static void __init __hugetlb_cgroup_file_init(int idx) { char buf[32]; struct cftype *cft; @@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); - return 0; + return; +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].lru.next for storing cgroup details. + */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } } /* diff --git a/mm/internal.h b/mm/internal.h index d597f94cc20..9ba21100ebf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -135,7 +135,6 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool contended; /* True if a lock was contended */ - struct page **page; /* Page captured of requested size */ }; unsigned long diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a217cc54406..752a705c77c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) struct kmemleak_object *object; unsigned long addr; - addr= simple_strtoul(str, NULL, 0); + if (kstrtoul(str, 0, &addr)) + return -EINVAL; object = find_and_get_object(addr, 0); if (!object) { pr_info("Unknown object at 0x%08lx\n", addr); @@ -1624,7 +1624,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1648,7 +1648,7 @@ again: if (!search_new_forks || !mapcount) break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); if (!mapcount) goto out; } @@ -1678,7 +1678,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1697,11 +1697,11 @@ again: ret = try_to_unmap_one(page, vma, rmap_item->address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; @@ -1731,7 +1731,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock_write(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1749,11 +1749,11 @@ again: ret = rmap_one(page, vma, rmap_item->address, arg); if (ret != SWAP_AGAIN) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; diff --git a/mm/memblock.c b/mm/memblock.c index 625905523c2..88adc8afb61 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) } this->size += 
next->size; - memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); + /* move forward from next + 1, index of which is i + 2 */ + memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); type->cnt--; } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bbfac5063ca..09255ec8159 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -10,6 +10,10 @@ * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -268,6 +272,10 @@ struct mem_cgroup { }; /* + * the counter to account for kernel memory usage. + */ + struct res_counter kmem; + /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. */ @@ -282,6 +290,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -332,8 +341,61 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif }; +/* internal only representation about the status of kmem accounting. */ +enum { + KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ + KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ +}; + +/* We account when limit is on, but only after call sites are patched */ +#define KMEM_ACCOUNTED_MASK \ + ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_set_activated(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) +{ + clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ + if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) + set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ + return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, + &memcg->kmem_account_flags); +} +#endif + /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. 
"move_charge_at_immitgrate" is treated as a @@ -388,9 +450,13 @@ enum charge_type { }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, + _KMEM, +}; + #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * There are two main reasons for not using the css_id for this: + * 1) this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * 2) In order not to violate the cgroup API, we would like to do all memory + * allocation in ->create(). At that point, we haven't yet allocated the + * css_id. Having a separate index prevents us from messing with the cgroup + * core for this + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size. It will double each time we have to + * increase it. + */ +static DEFINE_IDA(kmem_limited_groups); +int memcg_limited_groups_array_size; + +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * css_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE 65535 + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. 
Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that does + * kmem_cache_alloc and the such to see this symbol as well + */ +struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ + if (memcg_kmem_is_active(memcg)) { + static_key_slow_dec(&memcg_kmem_enabled_key); + ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + } + /* + * This check can't live in kmem destruction function, + * since the charges will outlive the cgroup + */ + WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ + disarm_sock_keys(memcg); + disarm_kmem_keys(memcg); +} + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -1453,6 +1588,10 @@ done: res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); + printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); } /* @@ -2060,20 +2199,28 @@ struct memcg_stock_pcp { static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. + * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock. Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; bool ret = true; + if (nr_pages > CHARGE_BATCH) + return false; + stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages) - stock->nr_pages--; + if (memcg == stock->cached && stock->nr_pages >= nr_pages) + stock->nr_pages -= nr_pages; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -2250,7 +2397,8 @@ enum { }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, bool oom_check) + unsigned int nr_pages, unsigned int min_pages, + bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch - * of regular pages (CHARGE_BATCH), or a single regular page (1). - * * Never reclaim on behalf of optional batching, retry with a * single page instead. 
*/ - if (nr_pages == CHARGE_BATCH) + if (nr_pages > min_pages) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; + if (gfp_mask & __GFP_NORETRY) + return CHARGE_NOMEM; + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; @@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages == 1 && ret) + if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) return CHARGE_RETRY; /* @@ -2371,7 +2519,7 @@ again: memcg = *ptr; if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(memcg)) + if (consume_stock(memcg, nr_pages)) goto done; css_get(&memcg->css); } else { @@ -2396,7 +2544,7 @@ again: rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(memcg)) { + if (consume_stock(memcg, nr_pages)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -2431,7 +2579,8 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, + oom_check); switch (ret) { case CHARGE_OK: break; @@ -2624,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } +static DEFINE_MUTEX(set_limit_mutex); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ + return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +} + +/* + * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. + */ +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) +{ + struct kmem_cache *cachep; + + VM_BUG_ON(p->is_root_cache); + cachep = p->root_cache; + return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; +} + +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct memcg_cache_params *params; + + if (!memcg_can_account_kmem(memcg)) + return -EIO; + + print_slabinfo_header(m); + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) + cache_show(memcg_params_to_cache(params), m); + mutex_unlock(&memcg->slab_caches_mutex); + + return 0; +} +#endif + +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ + struct res_counter *fail_res; + struct mem_cgroup *_memcg; + int ret = 0; + bool may_oom; + + ret = res_counter_charge(&memcg->kmem, size, &fail_res); + if (ret) + return ret; + + /* + * Conditions under which we can wait for the oom_killer. Those are + * the same conditions tested by the core page allocator + */ + may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + + _memcg = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, + &_memcg, may_oom); + + if (ret == -EINTR) { + /* + * __mem_cgroup_try_charge() chosed to bypass to root due to + * OOM kill or fatal signal. Since our only options are to + * either fail the allocation or charge it to this cgroup, do + * it as a temporary condition. But we can't fail. 
From a + * kmem/slab perspective, the cache has already been selected, + * by mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed during + * __mem_cgroup_try_charge() above. Tasks that were already + * dying when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + res_counter_charge_nofail(&memcg->res, size, &fail_res); + if (do_swap_account) + res_counter_charge_nofail(&memcg->memsw, size, + &fail_res); + ret = 0; + } else if (ret) + res_counter_uncharge(&memcg->kmem, size); + + return ret; +} + +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +{ + res_counter_uncharge(&memcg->res, size); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); + + /* Not down to 0 */ + if (res_counter_uncharge(&memcg->kmem, size)) + return; + + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); +} + +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) +{ + if (!memcg) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); +} + +/* + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. + */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +/* + * This ends up being protected by the set_limit mutex, during normal + * operation, because that is its main call site. + * + * But when we create a new cache, we can call this as well if its parent + * is kmem-limited. That will have to hold set_limit_mutex as well. + */ +int memcg_update_cache_sizes(struct mem_cgroup *memcg) +{ + int num, ret; + + num = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (num < 0) + return num; + /* + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true. We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. + */ + memcg_kmem_set_activated(memcg); + + ret = memcg_update_all_caches(num+1); + if (ret) { + ida_simple_remove(&kmem_limited_groups, num); + memcg_kmem_clear_activated(memcg); + return ret; + } + + memcg->kmemcg_id = num; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + return 0; +} + +static size_t memcg_caches_array_size(int num_groups) +{ + ssize_t size; + if (num_groups <= 0) + return 0; + + size = 2 * num_groups; + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + return size; +} + +/* + * We should update the current array size iff all caches updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this. 
+ */ +void memcg_update_array_size(int num) +{ + if (num > memcg_limited_groups_array_size) + memcg_limited_groups_array_size = memcg_caches_array_size(num); +} + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ + struct memcg_cache_params *cur_params = s->memcg_params; + + VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + + if (num_groups > memcg_limited_groups_array_size) { + int i; + ssize_t size = memcg_caches_array_size(num_groups); + + size *= sizeof(void *); + size += sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) { + s->memcg_params = cur_params; + return -ENOMEM; + } + + s->memcg_params->is_root_cache = true; + + /* + * There is the chance it will be bigger than + * memcg_limited_groups_array_size, if we failed an allocation + * in a cache, in which case all caches updated before it, will + * have a bigger array. + * + * But if that is the case, the data after + * memcg_limited_groups_array_size is certainly unused + */ + for (i = 0; i < memcg_limited_groups_array_size; i++) { + if (!cur_params->memcg_caches[i]) + continue; + s->memcg_params->memcg_caches[i] = + cur_params->memcg_caches[i]; + } + + /* + * Ideally, we would wait until all caches succeed, and only + * then free the old one. But this is not worth the extra + * pointer per-cache we'd have to have for this. + * + * It is not a big deal if some caches are left with a size + * bigger than the others. And all updates will reset this + * anyway. + */ + kfree(cur_params); + } + return 0; +} + +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) +{ + size_t size = sizeof(struct memcg_cache_params); + + if (!memcg_kmem_enabled()) + return 0; + + if (!memcg) + size += memcg_limited_groups_array_size * sizeof(void *); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) { + s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } + return 0; +} + +void memcg_release_cache(struct kmem_cache *s) +{ + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + /* + * This happens, for instance, when a root cache goes away before we + * add any memcg. + */ + if (!s->memcg_params) + return; + + if (s->memcg_params->is_root_cache) + goto out; + + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + root = s->memcg_params->root_cache; + root->memcg_params->memcg_caches[id] = NULL; + mem_cgroup_put(memcg); + + mutex_lock(&memcg->slab_caches_mutex); + list_del(&s->memcg_params->list); + mutex_unlock(&memcg->slab_caches_mutex); + +out: + kfree(s->memcg_params); +} + +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. 
+ * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *p; + + p = container_of(w, struct memcg_cache_params, destroy); + + cachep = memcg_params_to_cache(p); + + /* + * If we get down to 0 after shrink, we could delete right away. + * However, memcg_release_pages() already puts us back in the workqueue + * in that case. If we proceed deleting, we'll get a dangling + * reference, and removing the object from the workqueue in that case + * is unnecessary complication. We are not a fast path. + * + * Note that this case is fundamentally different from racing with + * shrink_slab(): if memcg_cgroup_destroy_cache() is called in + * kmem_cache_shrink, not only we would be reinserting a dead cache + * into the queue, but doing so from inside the worker racing to + * destroy it. + * + * So if we aren't down to zero, we'll just schedule a worker and try + * again + */ + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + return; + } else + kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ + if (!cachep->memcg_params->dead) + return; + + /* + * There are many ways in which we can get here. + * + * We can get to a memory-pressure situation while the delayed work is + * still pending to run. The vmscan shrinkers can then release all + * cache memory and get us to destruction. If this is the case, we'll + * be executed twice, which is a bug (the second time will execute over + * bogus data). In this case, cancelling the work should be fine. + * + * But we can also get here from the worker itself, if + * kmem_cache_shrink is enough to shake all the remaining objects and + * get the page count to 0. In this case, we'll deadlock if we try to + * cancel the work (the worker runs with an internal lock held, which + * is the same lock we would hold for cancel_work_sync().) + * + * Since we can't possibly know who got us here, just refrain from + * running if there is already work pending + */ + if (work_pending(&cachep->memcg_params->destroy)) + return; + /* + * We have to defer the actual destroying to a workqueue, because + * we might currently be in a context that cannot sleep. 
+ */ + schedule_work(&cachep->memcg_params->destroy); +} + +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + char *name; + struct dentry *dentry; + + rcu_read_lock(); + dentry = rcu_dereference(memcg->css.cgroup->dentry); + rcu_read_unlock(); + + BUG_ON(dentry == NULL); + + name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), dentry->d_name.name); + + return name; +} + +static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + char *name; + struct kmem_cache *new; + + name = memcg_cache_name(memcg, s); + if (!name) + return NULL; + + new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + (s->flags & ~SLAB_PANIC), s->ctor, s); + + if (new) + new->allocflags |= __GFP_KMEMCG; + + kfree(name); + return new; +} + +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. + */ +static DEFINE_MUTEX(memcg_cache_mutex); +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct kmem_cache *new_cachep; + int idx; + + BUG_ON(!memcg_can_account_kmem(memcg)); + + idx = memcg_cache_id(memcg); + + mutex_lock(&memcg_cache_mutex); + new_cachep = cachep->memcg_params->memcg_caches[idx]; + if (new_cachep) + goto out; + + new_cachep = kmem_cache_dup(memcg, cachep); + if (new_cachep == NULL) { + new_cachep = cachep; + goto out; + } + + mem_cgroup_get(memcg); + atomic_set(&new_cachep->memcg_params->nr_pages , 0); + + cachep->memcg_params->memcg_caches[idx] = new_cachep; + /* + * the readers won't lock, make sure everybody sees the updated value, + * so they won't put stuff in the queue again for no reason + */ + wmb(); +out: + mutex_unlock(&memcg_cache_mutex); + return new_cachep; +} + +void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + struct kmem_cache *c; + int i; + + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + return; + + /* + * If the cache is being destroyed, we trust that there is no one else + * requesting objects from it. Even if there are, the sanity checks in + * kmem_cache_destroy should caught this ill-case. + * + * Still, we don't want anyone else freeing memcg_caches under our + * noses, which can happen if a new memcg comes to life. As usual, + * we'll take the set_limit_mutex to protect ourselves against this. + */ + mutex_lock(&set_limit_mutex); + for (i = 0; i < memcg_limited_groups_array_size; i++) { + c = s->memcg_params->memcg_caches[i]; + if (!c) + continue; + + /* + * We will now manually delete the caches, so to avoid races + * we need to cancel all pending destruction workers and + * proceed with destruction ourselves. + * + * kmem_cache_destroy() will call kmem_cache_shrink internally, + * and that could spawn the workers again: it is likely that + * the cache still have active pages until this very moment. + * This would lead us back to mem_cgroup_destroy_cache. + * + * But that will not execute at all if the "dead" flag is not + * set, so flip it down to guarantee we are in control. 
+ */ + c->memcg_params->dead = false; + cancel_work_sync(&c->memcg_params->destroy); + kmem_cache_destroy(c); + } + mutex_unlock(&set_limit_mutex); +} + +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *params; + + if (!memcg_kmem_is_active(memcg)) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + cachep = memcg_params_to_cache(params); + cachep->memcg_params->dead = true; + INIT_WORK(&cachep->memcg_params->destroy, + kmem_cache_destroy_work_func); + schedule_work(&cachep->memcg_params->destroy); + } + mutex_unlock(&memcg->slab_caches_mutex); +} + +static void memcg_create_cache_work_func(struct work_struct *w) +{ + struct create_work *cw; + + cw = container_of(w, struct create_work, work); + memcg_create_kmem_cache(cw->memcg, cw->cachep); + /* Drop the reference gotten when we enqueued. */ + css_put(&cw->memcg->css); + kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + * Called with rcu_read_lock. + */ +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct create_work *cw; + + cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + if (cw == NULL) + return; + + /* The corresponding put will be done in the workqueue. */ + if (!css_tryget(&memcg->css)) { + kfree(cw); + return; + } + + cw->memcg = memcg; + cw->cachep = cachep; + + INIT_WORK(&cw->work, memcg_create_cache_work_func); + schedule_work(&cw->work); +} + +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. + */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int idx; + + VM_BUG_ON(!cachep->memcg_params); + VM_BUG_ON(!cachep->memcg_params->is_root_cache); + + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); + rcu_read_unlock(); + + if (!memcg_can_account_kmem(memcg)) + return cachep; + + idx = memcg_cache_id(memcg); + + /* + * barrier to mare sure we're always seeing the up to date value. 
The + * code updating memcg_caches will issue a write barrier to match this. + */ + read_barrier_depends(); + if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. + */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; + } + + return cachep->memcg_params->memcg_caches[idx]; +} +EXPORT_SYMBOL(__memcg_kmem_get_cache); + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements. + * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer. We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ + struct mem_cgroup *memcg; + int ret; + + *_memcg = NULL; + memcg = try_get_mem_cgroup_from_mm(current->mm); + + /* + * very rare case described in mem_cgroup_from_task. Unfortunately there + * isn't much we can do without complicating this too much, and it would + * be gfp-dependent anyway. Just let it go + */ + if (unlikely(!memcg)) + return true; + + if (!memcg_can_account_kmem(memcg)) { + css_put(&memcg->css); + return true; + } + + ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + if (!ret) + *_memcg = memcg; + + css_put(&memcg->css); + return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, + int order) +{ + struct page_cgroup *pc; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + + /* The page allocation failed. Revert */ + if (!page) { + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + return; + } + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + pc->mem_cgroup = memcg; + SetPageCgroupUsed(pc); + unlock_page_cgroup(pc); +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ + struct mem_cgroup *memcg = NULL; + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + /* + * Fast unlocked return. Theoretically might have changed, have to + * check again after locking. 
+ */ + if (!PageCgroupUsed(pc)) + return; + + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + ClearPageCgroupUsed(pc); + } + unlock_page_cgroup(pc); + + /* + * We trust that only if there is a memcg associated with the page, it + * is a valid allocation + */ + if (!memcg) + return; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); +} +#else +static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) @@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page) } #endif -static DEFINE_MUTEX(set_limit_mutex); - static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { @@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { int node, zid; + u64 usage; do { /* This is for making all *used* pages to be on LRU. */ @@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) cond_resched(); /* + * Kernel memory may not necessarily be trackable to a specific + * process. So they are not migrated, and therefore we can't + * expect their value to drop to 0 here. + * Having res filled up with kmem only is enough. + * * This is a safety check because mem_cgroup_force_empty_list * could have raced with mem_cgroup_replace_page_cache callers * so the lru seemed empty but the page could have been added * right after the check. RES_USAGE should be safe as we always * charge before adding to the LRU. */ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); + usage = res_counter_read_u64(&memcg->res, RES_USAGE) - + res_counter_read_u64(&memcg->kmem, RES_USAGE); + } while (usage > 0); } /* @@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); char str[64]; u64 val; - int type, name, len; + int name, len; + enum res_type type; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); @@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, else val = res_counter_read_u64(&memcg->memsw, name); break; + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; default: BUG(); } @@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); return simple_read_from_buffer(buf, nbytes, ppos, str, len); } + +static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +{ + int ret = -EINVAL; +#ifdef CONFIG_MEMCG_KMEM + bool must_inc_static_branch = false; + + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. + * + * Taking the cgroup_lock is really offensive, but it is so far the only + * way to guarantee that no children will appear. There are plenty of + * other offenders, and they should all go away. 
Fine grained locking + * is probably the way to go here. When we are fully hierarchical, we + * can also get rid of the use_hierarchy check. + */ + cgroup_lock(); + mutex_lock(&set_limit_mutex); + if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (cgroup_task_count(cont) || (memcg->use_hierarchy && + !list_empty(&cont->children))) { + ret = -EBUSY; + goto out; + } + ret = res_counter_set_limit(&memcg->kmem, val); + VM_BUG_ON(ret); + + ret = memcg_update_cache_sizes(memcg); + if (ret) { + res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + goto out; + } + must_inc_static_branch = true; + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes, so it is unfeasible to migrate them away. We + * need to reference count the memcg because of that. + */ + mem_cgroup_get(memcg); + } else + ret = res_counter_set_limit(&memcg->kmem, val); +out: + mutex_unlock(&set_limit_mutex); + cgroup_unlock(); + + /* + * We are by now familiar with the fact that we can't inc the static + * branch inside cgroup_lock. See disarm functions for details. A + * worker here is overkill, but also wrong: After the limit is set, we + * must start accounting right away. Since this operation can't fail, + * we can safely defer it to here - no rollback will be needed. + * + * The boolean used to control this is also safe, because + * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be + * able to set it to true; + */ + if (must_inc_static_branch) { + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * setting the active bit after the inc will guarantee no one + * starts accounting before all call sites are patched + */ + memcg_kmem_set_active(memcg); + } + +#endif + return ret; +} + +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + int ret = 0; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (!parent) + goto out; + + memcg->kmem_account_flags = parent->kmem_account_flags; +#ifdef CONFIG_MEMCG_KMEM + /* + * When that happen, we need to disable the static branch only on those + * memcgs that enabled it. To achieve this, we would be forced to + * complicate the code by keeping track of which memcgs were the ones + * that actually enabled limits, and which ones got it from its + * parents. + * + * It is a lot simpler just to do static_key_slow_inc() on every child + * that is accounted. + */ + if (!memcg_kmem_is_active(memcg)) + goto out; + + /* + * destroy(), called if we fail, will issue static_key_slow_inc() and + * mem_cgroup_put() if kmem is enabled. We have to either call them + * unconditionally, or clear the KMEM_ACTIVE flag. I personally find + * this more consistent, since it always leads to the same destroy path + */ + mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + + mutex_lock(&set_limit_mutex); + ret = memcg_update_cache_sizes(memcg); + mutex_unlock(&set_limit_mutex); +#endif +out: + return ret; +} + /* * The user of this function is... * RES_LIMIT. 
@@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + enum res_type type; + int name; unsigned long long val; int ret; @@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, break; if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); - else + else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); + else if (type == _KMEM) + ret = memcg_update_kmem_limit(cont, val); + else + return -EINVAL; break; case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buffer, &val); @@ -4054,7 +5097,8 @@ out: static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + int name; + enum res_type type; type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); @@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) case RES_MAX_USAGE: if (type == _MEM) res_counter_reset_max(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_max(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_max(&memcg->kmem); + else + return -EINVAL; break; case RES_FAILCNT: if (type == _MEM) res_counter_reset_failcnt(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_failcnt(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_failcnt(&memcg->kmem); + else + return -EINVAL; break; } @@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *event; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); @@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + int ret; + + memcg->kmemcg_id = -1; + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; + return mem_cgroup_sockets_init(memcg, ss); }; static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); + + memcg_kmem_mark_dead(memcg); + + if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) + return; + + /* + * Charges already down to 0, undo mem_cgroup_get() done in the charge + * path here, being careful not to race with 
memcg_uncharge_kmem: it is + * possible that the charges went down to 0 between mark_dead and the + * res_counter read, so in that case, we don't need the put + */ + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -4749,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = { .read = mem_cgroup_read, }, #endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read = mem_cgroup_read, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "kmem.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, +#ifdef CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .read_seq_string = mem_cgroup_slabinfo_read, + }, +#endif +#endif { }, /* terminate */ }; @@ -4816,16 +5920,29 @@ out_free: } /* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. + * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. + * + * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void free_work(struct work_struct *work) + +static void __mem_cgroup_free(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + int node; int size = sizeof(struct mem_cgroup); - memcg = container_of(work, struct mem_cgroup, work_freeing); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); + + for_each_node(node) + free_mem_cgroup_per_zone_info(memcg, node); + + free_percpu(memcg->stat); + /* * We need to make sure that (at least for now), the jump label * destruction code runs outside of the cgroup lock. This is because @@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work) * to move this code around, and make sure it is outside * the cgroup_lock. */ - disarm_sock_keys(memcg); + disarm_static_keys(memcg); if (size < PAGE_SIZE) kfree(memcg); else vfree(memcg); } -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} /* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, + * but in process context. The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. 
*/ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void free_work(struct work_struct *work) { - int node; + struct mem_cgroup *memcg; - mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); + memcg = container_of(work, struct mem_cgroup, work_freeing); + __mem_cgroup_free(memcg); +} - for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); +static void free_rcu(struct rcu_head *rcu_head) +{ + struct mem_cgroup *memcg; - free_percpu(memcg->stat); - call_rcu(&memcg->rcu_freeing, free_rcu); + memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); + INIT_WORK(&memcg->work_freeing, free_work); + schedule_work(&memcg->work_freeing); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { if (atomic_sub_and_test(count, &memcg->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); if (parent) mem_cgroup_put(parent); } @@ -4984,7 +6090,6 @@ mem_cgroup_css_alloc(struct cgroup *cont) &per_cpu(memcg_stock, cpu); INIT_WORK(&stock->work, drain_local_stock); } - hotcpu_notifier(memcg_cpu_hotplug_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); memcg->use_hierarchy = parent->use_hierarchy; @@ -4994,6 +6099,8 @@ mem_cgroup_css_alloc(struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); + /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. @@ -5004,6 +6111,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -5043,6 +6151,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); + mem_cgroup_destroy_all_caches(memcg); } static void mem_cgroup_css_free(struct cgroup *cont) @@ -5646,6 +6755,19 @@ struct cgroup_subsys mem_cgroup_subsys = { .use_id = 1, }; +/* + * The rest of init is performed during ->css_alloc() for root css which + * happens before initcalls. hotcpu_notifier() can't be done together as + * it would introduce circular locking by adding cgroup_lock -> cpu hotplug + * dependency. Do it from a subsys_initcall(). 
+ */ +static int __init mem_cgroup_init(void) +{ + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); + return 0; +} +subsys_initcall(mem_cgroup_init); + #ifdef CONFIG_MEMCG_SWAP static int __init enable_swap_account(char *s) { diff --git a/mm/memory.c b/mm/memory.c index e0a9b0ce4f1..bb1369f7b9b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -184,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb) return 1; } + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return 0; + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return 0; + tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; @@ -216,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; + tlb->batch_count = 0; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; @@ -3706,6 +3711,14 @@ retry: if (pmd_trans_huge(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; + /* + * If the pmd is splitting, return and retry the + * the fault. Alternative: wait until the split + * is done, and goto retry. + */ + if (pmd_trans_splitting(orig_pmd)) + return 0; + if (pmd_numa(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962e353aa86..d04ed87bfac 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have + * normal memory. + */ static bool can_online_high_movable(struct zone *zone) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ static bool can_online_high_movable(struct zone *zone) { return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, @@ -1112,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have + * normal memory. 
+ */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure the node has NORMAL memory if it is still online */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { @@ -1141,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) */ return present_pages == 0; } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d1b315e9862..e2df1c1fb41 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2132,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->mutex */ +/* Caller holds sp->lock */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2196,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - mutex_lock(&sp->mutex); + spin_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - mutex_unlock(&sp->mutex); + spin_unlock(&sp->lock); return pol; } @@ -2328,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) sp_free(n); } +static void sp_node_init(struct sp_node *node, unsigned long start, + unsigned long end, struct mempolicy *pol) +{ + node->start = start; + node->end = end; + node->policy = pol; +} + static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { @@ -2344,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, return NULL; } newpol->flags |= MPOL_F_SHARED; - - n->start = start; - n->end = end; - n->policy = newpol; + sp_node_init(n, start, end, newpol); return n; } @@ -2357,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { struct sp_node *n; + struct sp_node *n_new = NULL; + struct mempolicy *mpol_new = NULL; int ret = 0; - mutex_lock(&sp->mutex); +restart: + spin_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2372,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } else { /* Old policy spanning whole new range. 
*/ if (n->end > end) { - struct sp_node *new2; - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) { - ret = -ENOMEM; - goto out; - } + if (!n_new) + goto alloc_new; + + *mpol_new = *n->policy; + atomic_set(&mpol_new->refcnt, 1); + sp_node_init(n_new, n->end, end, mpol_new); + sp_insert(sp, n_new); n->end = start; - sp_insert(sp, new2); + n_new = NULL; + mpol_new = NULL; break; } else n->end = start; @@ -2390,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } if (new) sp_insert(sp, new); -out: - mutex_unlock(&sp->mutex); + spin_unlock(&sp->lock); + ret = 0; + +err_out: + if (mpol_new) + mpol_put(mpol_new); + if (n_new) + kmem_cache_free(sn_cache, n_new); + return ret; + +alloc_new: + spin_unlock(&sp->lock); + ret = -ENOMEM; + n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n_new) + goto err_out; + mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!mpol_new) + goto err_out; + goto restart; } /** @@ -2410,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - mutex_init(&sp->mutex); + spin_lock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2476,14 +2504,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - mutex_lock(&p->mutex); + spin_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - mutex_unlock(&p->mutex); + spin_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING @@ -2595,8 +2623,7 @@ void numa_default_policy(void) */ /* - * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag - * Used only for mpol_parse_str() and mpol_to_str() + * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. */ static const char * const policy_modes[] = { @@ -2610,28 +2637,20 @@ static const char * const policy_modes[] = #ifdef CONFIG_TMPFS /** - * mpol_parse_str - parse string to mempolicy + * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. - * @no_context: flag whether to "contextualize" the mempolicy * * Format of input: * <mode>[=<flags>][:<nodelist>] * - * if @no_context is true, save the input nodemask in w.user_nodemask in - * the returned mempolicy. This will be used to "clone" the mempolicy in - * a specific context [cpuset] at a later time. Used to parse tmpfs mpol - * mount option. Note that if 'static' or 'relative' mode flags were - * specified, the input nodemask will already have been saved. Saving - * it again is redundant, but safe. 
- * * On success, returns 0, else 1 */ -int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) +int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode; - unsigned short uninitialized_var(mode_flags); + unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); @@ -2719,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (IS_ERR(new)) goto out; - if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; - } else { - int ret; - NODEMASK_SCRATCH(scratch); - if (scratch) { - task_lock(current); - ret = mpol_set_nodemask(new, &nodes, scratch); - task_unlock(current); - } else - ret = -ENOMEM; - NODEMASK_SCRATCH_FREE(scratch); - if (ret) { - mpol_put(new); - goto out; - } - } + /* + * Save nodes for mpol_to_str() to show the tmpfs mount options + * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. + */ + if (mode != MPOL_PREFERRED) + new->v.nodes = nodes; + else if (nodelist) + new->v.preferred_node = first_node(nodes); + else + new->flags |= MPOL_F_LOCAL; + + /* + * Save nodes for contextualization: this will be used to "clone" + * the mempolicy in a specific context [cpuset] at a later time. + */ + new->w.user_nodemask = nodes; + err = 0; out: @@ -2756,13 +2774,12 @@ out: * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted - * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask * * Convert a mempolicy into a string. * Returns the number of characters in buffer (if positive) * or an error (negative) */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; int l; @@ -2788,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) case MPOL_PREFERRED: nodes_clear(nodes); if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; /* pseudo-policy */ + mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; @@ -2796,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - if (no_context) - nodes = pol->w.user_nodemask; - else - nodes = pol->v.nodes; + nodes = pol->v.nodes; break; default: diff --git a/mm/migrate.c b/mm/migrate.c index 3b676b0c5c3..c38778610aa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1679,9 +1679,21 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_xchg_last_nid(new_page, page_last_nid(page)); isolated = numamigrate_isolate_page(pgdat, page); - if (!isolated) { + + /* + * Failing to isolate or a GUP pin prevents migration. The expected + * page count is 2. 1 for anonymous pages without a mapping and 1 + * for the callers pin. If the page was isolated, the page will + * need to be put back on the LRU. + */ + if (!isolated || page_count(page) != 2) { count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); put_page(new_page); + if (isolated) { + putback_lru_page(page); + isolated = 0; + goto out; + } goto out_keep_locked; } diff --git a/mm/mmap.c b/mm/mmap.c index f54b235f29a..35730ee9d51 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2886,7 +2886,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. 
*/ - down_write(&anon_vma->root->rwsem); + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares diff --git a/mm/mprotect.c b/mm/mprotect.c index 3dca970367d..94722a4d6b4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -114,7 +114,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, #ifdef CONFIG_NUMA_BALANCING static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { spin_lock(&mm->page_table_lock); set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); @@ -122,15 +122,15 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, } #else static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { BUG(); } #endif /* CONFIG_NUMA_BALANCING */ -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -143,7 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -167,9 +168,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -304,7 +305,8 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); @@ -361,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -EINVAL; if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - } - else { + } else { if (vma->vm_start > start) goto out; if (unlikely(grows & PROT_GROWSUP)) { @@ -378,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + newflags = vm_flags; + newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f427122449..0713bfbf095 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -201,6 +201,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) zone_reclaimable_pages(z) - z->dirty_balance_reserve; } /* + * Unreclaimable memory (kernel memory or anonymous memory + * without swap) can bring down the dirtyable pages below + * the zone's dirty balance reserve and the above calculation + * will underflow. However we still want to add in nodes + * which are below threshold (negative values) to get a more + * accurate calculation but make sure that the total never + * underflows. + */ + if ((long)x < 0) + x = 0; + + /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure @@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - - dirty_balance_reserve; + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + x -= min(x, dirty_balance_reserve); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) * highmem zone can hold its share of dirty pages, so we don't * care about vm_highmem_is_dirtyable here. */ - return zone_page_state(zone, NR_FREE_PAGES) + - zone_reclaimable_pages(zone) - - zone->dirty_balance_reserve; + unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + + zone_reclaimable_pages(zone); + + /* don't allow this to underflow */ + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + return nr_pages; } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d037c8bc151..df2022ff0c8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -221,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -/* - * NOTE: - * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. - * Instead, use {un}set_pageblock_isolate. 
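The global_dirtyable_memory()/zone_dirtyable_memory() changes in the page-writeback hunk above replace a blind subtraction with a clamped one, x -= min(x, dirty_balance_reserve), so that an oversized reserve cannot wrap an unsigned count around to a huge value. A small standalone illustration of the difference follows; the numbers are made up for the example.

#include <stdio.h>

#define min(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long free_pages = 100, reclaimable = 50;
	unsigned long reserve = 200;	/* reserve larger than what is left */

	unsigned long naive = free_pages + reclaimable - reserve;

	unsigned long x = free_pages + reclaimable;
	x -= min(x, reserve);		/* clamps at 0 instead of wrapping */

	printf("naive: %lu  clamped: %lu\n", naive, x);
	return 0;
}

With 64-bit unsigned longs the naive version reports 18446744073709551566 "dirtyable" pages, exactly the underflow the patch guards against; the clamped version reports 0.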
- */ void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -371,8 +366,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; int bad = 0; - if (unlikely(compound_order(page) != order) || - unlikely(!PageHead(page))) { + if (unlikely(compound_order(page) != order)) { bad_page(page); bad++; } @@ -1390,14 +1384,8 @@ void split_page(struct page *page, unsigned int order) set_page_refcounted(page + i); } -/* - * Similar to the split_page family of functions except that the page - * required at the given order and being isolated now to prevent races - * with parallel allocators - */ -int capture_free_page(struct page *page, int alloc_order, int migratetype) +static int __isolate_free_page(struct page *page, unsigned int order) { - unsigned int order; unsigned long watermark; struct zone *zone; int mt; @@ -1405,7 +1393,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) BUG_ON(!PageBuddy(page)); zone = page_zone(page); - order = page_order(page); mt = get_pageblock_migratetype(page); if (mt != MIGRATE_ISOLATE) { @@ -1414,7 +1401,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; - __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); + __mod_zone_freepage_state(zone, -(1UL << order), mt); } /* Remove page from free list */ @@ -1422,11 +1409,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) zone->free_area[order].nr_free--; rmv_page_order(page); - if (alloc_order != order) - expand(zone, page, alloc_order, order, - &zone->free_area[order], migratetype); - - /* Set the pageblock if the captured page is at least a pageblock */ + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1437,7 +1420,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) } } - return 1UL << alloc_order; + return 1UL << order; } /* @@ -1455,10 +1438,9 @@ int split_free_page(struct page *page) unsigned int order; int nr_pages; - BUG_ON(!PageBuddy(page)); order = page_order(page); - nr_pages = capture_free_page(page, order, 0); + nr_pages = __isolate_free_page(page, order); if (!nr_pages) return 0; @@ -1656,20 +1638,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -#ifdef CONFIG_MEMORY_ISOLATION -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - if (unlikely(zone->nr_pageblock_isolate)) - return zone->nr_pageblock_isolate * pageblock_nr_pages; - return 0; -} -#else -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - return 0; -} -#endif - bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { @@ -1685,14 +1653,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - /* - * If the zone has MIGRATE_ISOLATE type free pages, we should consider - * it. nr_zone_isolate_freepages is never accurate so kswapd might not - * sleep although it could do so. But this is more desirable for memory - * hotplug than sleeping which can cause a livelock in the direct - * reclaim path. 
- */ - free_pages -= nr_zone_isolate_freepages(z); return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, free_pages); } @@ -2164,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page = NULL; - if (!order) return NULL; @@ -2177,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction, &page); + contended_compaction); current->flags &= ~PF_MEMALLOC; - /* If compaction captured a page, prep and use it */ - if (page) { - prep_new_page(page, order, gfp_mask); - goto got_page; - } - if (*did_some_progress != COMPACT_SKIPPED) { + struct page *page; + /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2196,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { -got_page: preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; @@ -2613,6 +2566,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2631,6 +2585,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* + * Will only have any effect when __GFP_KMEMCG is set. This is + * verified in the (always inline) callee + */ + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); @@ -2666,6 +2627,8 @@ out: if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + memcg_kmem_commit_charge(page, memcg, order); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2718,6 +2681,31 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * __free_memcg_kmem_pages and free_memcg_kmem_pages will free + * pages allocated with __GFP_KMEMCG. + * + * Those pages are accounted to a particular memcg, embedded in the + * corresponding page_cgroup. To avoid adding a hit in the allocator to search + * for that information only to find out that it is NULL for users who have no + * interest in that whatsoever, we provide these functions. + * + * The caller knows better which flags it relies on. 
+ */ +void __free_memcg_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + } +} + static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { @@ -5597,7 +5585,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else - pfn = pfn - zone->zone_start_pfn; + pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } @@ -5944,8 +5932,15 @@ done: void free_contig_range(unsigned long pfn, unsigned nr_pages) { - for (; nr_pages--; ++pfn) - __free_page(pfn_to_page(pfn)); + unsigned int count = 0; + + for (; nr_pages--; pfn++) { + struct page *page = pfn_to_page(pfn); + + count += page_count(page) != 1; + __free_page(page); + } + WARN(count != 0, "%d pages are still in use!\n", count); } #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9d2264ea460..383bdbb98b0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,28 +8,6 @@ #include <linux/memory.h> #include "internal.h" -/* called while holding zone->lock */ -static void set_pageblock_isolate(struct page *page) -{ - if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) - return; - - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - page_zone(page)->nr_pageblock_isolate++; -} - -/* called while holding zone->lock */ -static void restore_pageblock_isolate(struct page *page, int migratetype) -{ - struct zone *zone = page_zone(page); - if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) - return; - - BUG_ON(zone->nr_pageblock_isolate <= 0); - set_pageblock_migratetype(page, migratetype); - zone->nr_pageblock_isolate--; -} - int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -80,7 +58,7 @@ out: unsigned long nr_pages; int migratetype = get_pageblock_migratetype(page); - set_pageblock_isolate(page); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) goto out; nr_pages = move_freepages_block(zone, page, migratetype); __mod_zone_freepage_state(zone, nr_pages, migratetype); - restore_pageblock_isolate(page, migratetype); + set_pageblock_migratetype(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } diff --git a/mm/shmem.c b/mm/shmem.c index 5c90d84c2b0..5dd56f6efdb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ - mpol_to_str(buffer, sizeof(buffer), mpol, 1); + mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } @@ -2463,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (!gid_valid(sbinfo->gid)) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (mpol_parse_str(value, &sbinfo->mpol, 1)) + if (mpol_parse_str(value, &sbinfo->mpol)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", diff 
--git a/mm/slab.c b/mm/slab.c index 33d3363658d..e7667a3584b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -87,7 +87,6 @@ */ #include <linux/slab.h> -#include "slab.h" #include <linux/mm.h> #include <linux/poison.h> #include <linux/swap.h> @@ -128,6 +127,8 @@ #include "internal.h" +#include "slab.h" + /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). @@ -162,23 +163,6 @@ */ static bool pfmemalloc_active __read_mostly; -/* Legal flag mask for kmem_cache_create(). */ -#if DEBUG -# define CREATE_MASK (SLAB_RED_ZONE | \ - SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_STORE_USER | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) -#else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) -#endif - /* * kmem_bufctl_t: * @@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = { #undef CACHE }; -static struct arraycache_init initarray_cache __initdata = - { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; static struct kmem_cache kmem_cache_boot = { - .nodelists = kmem_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -662,6 +642,26 @@ static void init_node_lock_keys(int q) } } +static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +{ + struct kmem_list3 *l3; + l3 = cachep->nodelists[q]; + if (!l3) + return; + + slab_set_lock_classes(cachep, &on_slab_l3_key, + &on_slab_alc_key, q); +} + +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ + int node; + + VM_BUG_ON(OFF_SLAB(cachep)); + for_each_node(node) + on_slab_lock_classes_node(cachep, node); +} + static inline void init_lock_keys(void) { int node; @@ -678,6 +678,14 @@ static inline void init_lock_keys(void) { } +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +} + +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) { } @@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu) free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) slab_set_debugobj_lock_classes_node(cachep, node); + else if (!OFF_SLAB(cachep) && + !(cachep->flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -1577,28 +1588,33 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) } /* + * The memory after the last cpu cache pointer is used for the + * the nodelists pointer. + */ +static void setup_nodelists_pointer(struct kmem_cache *cachep) +{ + cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; +} + +/* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). 
*/ void __init kmem_cache_init(void) { - size_t left_over; struct cache_sizes *sizes; struct cache_names *names; int i; - int order; - int node; kmem_cache = &kmem_cache_boot; + setup_nodelists_pointer(kmem_cache); if (num_possible_nodes() == 1) use_alien_caches = 0; - for (i = 0; i < NUM_INIT_LISTS; i++) { + for (i = 0; i < NUM_INIT_LISTS; i++) kmem_list3_init(&initkmem_list3[i]); - if (i < MAX_NUMNODES) - kmem_cache->nodelists[i] = NULL; - } + set_up_list3s(kmem_cache, CACHE_CACHE); /* @@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ - node = numa_mem_id(); - /* 1) create the kmem_cache */ - INIT_LIST_HEAD(&slab_caches); - list_add(&kmem_cache->list, &slab_caches); - kmem_cache->colour_off = cache_line_size(); - kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; - kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + - nr_node_ids * sizeof(struct kmem_list3 *); - kmem_cache->object_size = kmem_cache->size; - kmem_cache->size = ALIGN(kmem_cache->object_size, - cache_line_size()); - kmem_cache->reciprocal_buffer_size = - reciprocal_value(kmem_cache->size); - - for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, kmem_cache->size, - cache_line_size(), 0, &left_over, &kmem_cache->num); - if (kmem_cache->num) - break; - } - BUG_ON(!kmem_cache->num); - kmem_cache->gfporder = order; - kmem_cache->colour = left_over / kmem_cache->colour_off; - kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + - sizeof(struct slab), cache_line_size()); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_list3 *), + SLAB_HWCACHE_ALIGN); + list_add(&kmem_cache->list, &slab_caches); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; @@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; - sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; - sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; - sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); - - if (INDEX_AC != INDEX_L3) { - sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; - sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; - sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; - sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); - } + sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, + sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS); + + if (INDEX_AC != INDEX_L3) + sizes[INDEX_L3].cs_cachep = + create_kmalloc_cache(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS); slab_early_init = 0; @@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. 
*/ - if (!sizes->cs_cachep) { - sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes->cs_cachep->name = names->name; - sizes->cs_cachep->size = sizes->cs_size; - sizes->cs_cachep->object_size = sizes->cs_size; - sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); - list_add(&sizes->cs_cachep->list, &slab_caches); - } + if (!sizes->cs_cachep) + sizes->cs_cachep = create_kmalloc_cache(names->name, + sizes->cs_size, ARCH_KMALLOC_FLAGS); + #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - sizes->cs_dmacachep->name = names->name_dma; - sizes->cs_dmacachep->size = sizes->cs_size; - sizes->cs_dmacachep->object_size = sizes->cs_size; - sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; - __kmem_cache_create(sizes->cs_dmacachep, - ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); - list_add(&sizes->cs_dmacachep->list, &slab_caches); + sizes->cs_dmacachep = create_kmalloc_cache( + names->name_dma, sizes->cs_size, + SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS); #endif sizes++; names++; @@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); /* @@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (page->pfmemalloc) SetPageSlabPfmemalloc(page + i); } + memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) __ClearPageSlab(page); page++; } + + memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_pages((unsigned long)addr, cachep->gfporder); + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) if (slab_state == DOWN) { /* - * Note: the first kmem_cache_create must create the cache + * Note: Creation of first cache (kmem_cache). + * The setup_list3s is taken care + * of by the caller of __kmem_cache_create + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + slab_state = PARTIAL; + } else if (slab_state == PARTIAL) { + /* + * Note: the second kmem_cache_create must create the cache * that's used by kmalloc(24), otherwise the creation of * further caches will BUG(). */ @@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) /* * If the cache that's used by kmalloc(sizeof(kmem_list3)) is - * the first cache, then we need to set up all its list3s, + * the second cache, then we need to set up all its list3s, * otherwise the creation of further caches will BUG(). */ set_up_list3s(cachep, SIZE_AC); @@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) else slab_state = PARTIAL_ARRAYCACHE; } else { + /* Remaining boot caches */ cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init), gfp); @@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) /** * __kmem_cache_create - Create a cache. 
- * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. + * @cachep: cache management descriptor * @flags: SLAB flags - * @ctor: A constructor for the objects. * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. @@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif - /* - * Always checks flags, a caller might be expecting debug support which - * isn't available. - */ - BUG_ON(flags & ~CREATE_MASK); /* * Check that size is in terms of words. This is needed to avoid @@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(BYTES_PER_WORD - 1); } - /* calculate the final buffer alignment: */ - - /* 1) arch recommendation: can be overridden for debug */ - if (flags & SLAB_HWCACHE_ALIGN) { - /* - * Default alignment: as specified by the arch code. Except if - * an object is really small, then squeeze multiple objects into - * one cacheline. - */ - ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - } else { - ralign = BYTES_PER_WORD; - } - /* * Redzoning and user store require word alignment or possibly larger. * Note this will be overridden by architecture or caller mandated @@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(REDZONE_ALIGN - 1); } - /* 2) arch mandated alignment */ - if (ralign < ARCH_SLAB_MINALIGN) { - ralign = ARCH_SLAB_MINALIGN; - } /* 3) caller mandated alignment */ if (ralign < cachep->align) { ralign = cachep->align; @@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else gfp = GFP_NOWAIT; - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; + setup_nodelists_pointer(cachep); #if DEBUG /* @@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); slab_set_debugobj_lock_classes(cachep); - } + } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes(cachep); return 0; } @@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); @@ -3969,12 +3935,6 @@ void kfree(const void *objp) } EXPORT_SYMBOL(kfree); -unsigned int kmem_cache_size(struct kmem_cache *cachep) -{ - return cachep->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - /* * This initializes kmem_list3 or resizes various caches for all nodes. 
*/ @@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info) } /* Always called with the slab_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; @@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return alloc_kmemlist(cachep, gfp); } +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + int ret; + struct kmem_cache *c = NULL; + int i = 0; + + ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + + if (slab_state < FULL) + return ret; + + if ((ret < 0) || !is_root_cache(cachep)) + return ret; + + VM_BUG_ON(!mutex_is_locked(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = cache_from_memcg(cachep, i); + if (c) + /* return value determined by the parent cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); + } + + return ret; +} + /* Called with slab_mutex held always */ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; - int limit, shared; + int limit = 0; + int shared = 0; + int batchcount = 0; + + if (!is_root_cache(cachep)) { + struct kmem_cache *root = memcg_root_cache(cachep); + limit = root->limit; + shared = root->shared; + batchcount = root->batchcount; + } + if (limit && shared && batchcount) + goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); + batchcount = (limit + 1) / 2; +skip_setup: + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); @@ -4276,54 +4275,8 @@ out: } #ifdef CONFIG_SLABINFO - -static void print_slabinfo_header(struct seq_file *m) -{ - /* - * Output format version, so at least we can change it - * without _too_ many complaints. 
- */ -#if STATS - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); -#else - seq_puts(m, "slabinfo - version: 2.1\n"); -#endif - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " - "<objperslab> <pagesperslab>"); - seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); - seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); -#if STATS - seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " - "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); - seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); -#endif - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) +void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - -static int s_show(struct seq_file *m, void *p) -{ - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p) if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->size, - cachep->num, (1 << cachep->gfporder)); - seq_printf(m, " : tunables %4u %4u %4u", - cachep->limit, cachep->batchcount, cachep->shared); - seq_printf(m, " : slabdata %6lu %6lu %6lu", - active_slabs, num_slabs, shared_avail); + sinfo->active_objs = active_objs; + sinfo->num_objs = num_objs; + sinfo->active_slabs = active_slabs; + sinfo->num_slabs = num_slabs; + sinfo->shared_avail = shared_avail; + sinfo->limit = cachep->limit; + sinfo->batchcount = cachep->batchcount; + sinfo->shared = cachep->shared; + sinfo->objects_per_slab = cachep->num; + sinfo->cache_order = cachep->gfporder; +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) +{ #if STATS { /* list3 stats */ unsigned long high = cachep->high_mark; @@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p) allochit, allocmiss, freehit, freemiss); } #endif - seq_putc(m, '\n'); - return 0; } -/* - * slabinfo_op - iterator that generates /proc/slabinfo - * - * Output layout: - * cache-name - * num-active-objs - * total-objs - * object size - * num-active-slabs - * total-slabs - * num-pages-per-slab - * + further values on SMP and with statistics enabled - */ - -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - #define MAX_SLABINFO_WRITE 128 /** * slabinfo_write - Tuning for the slab allocator @@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -static ssize_t slabinfo_write(struct file *file, const char __user *buffer, +ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; @@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, return res; } -static int slabinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slabinfo_op); -} - -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - 
.read = seq_read, - .write = slabinfo_write, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_DEBUG_SLAB_LEAK static void *leaks_start(struct seq_file *m, loff_t *pos) @@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p) return 0; } +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + static const struct seq_operations slabstats_op = { .start = leaks_start, .next = s_next, @@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/mm/slab.h b/mm/slab.h index 7deeb449a30..34a98d64219 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -32,19 +32,201 @@ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size); + /* Functions provided by the slab allocators */ extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); +extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, + unsigned long flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + size_t size, unsigned long flags); + +struct mem_cgroup; #ifdef CONFIG_SLUB -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); #else -static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +static inline struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_NOTRACK) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + int __kmem_cache_shutdown(struct kmem_cache *); +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void 
get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params || s->memcg_params->is_root_cache; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return (is_root_cache(cachep) && !memcg) || + (cachep->memcg_params->memcg == memcg); +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ + if (!is_root_cache(s)) + atomic_add(1 << order, &s->memcg_params->nr_pages); +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ + if (is_root_cache(s)) + return; + + if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) + mem_cgroup_destroy_cache(s); +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return (p == s) || + (s->memcg_params && (p == s->memcg_params->root_cache)); +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + return s->memcg_params->root_cache->name; + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return s->memcg_params->memcg_caches[idx]; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params->root_cache; +} +#else +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return true; +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return NULL; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} +#endif + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. 
%s but object is from %s\n", + __FUNCTION__, cachep->name, s->name); + WARN_ON_ONCE(1); + return s; +} #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 069a24e6440..3f3cd97d3fd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -13,9 +13,12 @@ #include <linux/module.h> #include <linux/cpu.h> #include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/page.h> +#include <linux/memcontrol.h> #include "slab.h" @@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, + size_t size) { struct kmem_cache *s = NULL; @@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) continue; } - if (!strcmp(s->name, name)) { + /* + * For simplicity, we won't check this in the list of memcg + * caches. We have control over memcg naming, and if there + * aren't duplicates in the global list, there won't be any + * duplicates in the memcg lists as well. + */ + if (!memcg && !strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size) return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, + const char *name, size_t size) { return 0; } #endif +#ifdef CONFIG_MEMCG_KMEM +int memcg_update_all_caches(int num_memcgs) +{ + struct kmem_cache *s; + int ret = 0; + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + ret = memcg_update_cache_size(s, num_memcgs); + /* + * See comment in memcontrol.c, memcg_update_cache_size: + * Instead of freeing the memory, we'll just leave the caches + * up to this point in an updated state. + */ + if (ret) + goto out; + } + + memcg_update_array_size(num_memcgs); +out: + mutex_unlock(&slab_mutex); + return ret; +} +#endif + +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + + /* * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) * as davem. 
*/ -struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *), + struct kmem_cache *parent_cache) { struct kmem_cache *s = NULL; int err = 0; @@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align get_online_cpus(); mutex_lock(&slab_mutex); - if (!kmem_cache_sanity_check(name, size) == 0) + if (!kmem_cache_sanity_check(memcg, name, size) == 0) goto out_locked; + /* + * Some allocators will constraint the set of valid flags to a subset + * of all flags. We expect them to define CACHE_CREATE_MASK in this + * case, and we'll just provide them with a sanitized version of the + * passed flags. + */ + flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(name, size, align, flags, ctor); + s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); if (s) goto out_locked; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (s) { s->object_size = s->size = size; - s->align = align; + s->align = calculate_alignment(flags, align, size); s->ctor = ctor; + + if (memcg_register_cache(memcg, s, parent_cache)) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; + } + s->name = kstrdup(name, GFP_KERNEL); if (!s->name) { kmem_cache_free(kmem_cache, s); @@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align err = __kmem_cache_create(s, flags); if (!err) { - s->refcount = 1; list_add(&s->list, &slab_caches); - + memcg_cache_list_add(memcg, s); } else { kfree(s->name); kmem_cache_free(kmem_cache, s); @@ -157,10 +239,20 @@ out_locked: return s; } + +struct kmem_cache * +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); +} EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *s) { + /* Destroy all the children caches if we aren't a memcg cache */ + kmem_cache_destroy_memcg_children(s); + get_online_cpus(); mutex_lock(&slab_mutex); s->refcount--; @@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); + memcg_release_cache(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } else { @@ -192,3 +285,182 @@ int slab_is_available(void) { return slab_state >= UP; } + +#ifndef CONFIG_SLOB +/* Create a cache during boot when no slab services are available yet */ +void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, + unsigned long flags) +{ + int err; + + s->name = name; + s->size = s->object_size = size; + s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + err = __kmem_cache_create(s, flags); + + if (err) + panic("Creation of kmalloc slab %s size=%zd failed. 
Reason %d\n", + name, size, err); + + s->refcount = -1; /* Exempt from merging for now */ +} + +struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, + unsigned long flags) +{ + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + + if (!s) + panic("Out of memory when creating slab %s\n", name); + + create_boot_cache(s, name, size, flags); + list_add(&s->list, &slab_caches); + s->refcount = 1; + return s; +} + +#endif /* !CONFIG_SLOB */ + + +#ifdef CONFIG_SLABINFO +void print_slabinfo_header(struct seq_file *m) +{ + /* + * Output format version, so at least we can change it + * without _too_ many complaints. + */ +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); +#else + seq_puts(m, "slabinfo - version: 2.1\n"); +#endif + seq_puts(m, "# name <active_objs> <num_objs> <objsize> " + "<objperslab> <pagesperslab>"); + seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); + seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " + "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); + seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); +#endif + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&slab_mutex); + if (!n) + print_slabinfo_header(m); + + return seq_list_start(&slab_caches, *pos); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + +static void +memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) +{ + struct kmem_cache *c; + struct slabinfo sinfo; + int i; + + if (!is_root_cache(s)) + return; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg(s, i); + if (!c) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + + info->active_slabs += sinfo.active_slabs; + info->num_slabs += sinfo.num_slabs; + info->shared_avail += sinfo.shared_avail; + info->active_objs += sinfo.active_objs; + info->num_objs += sinfo.num_objs; + } +} + +int cache_show(struct kmem_cache *s, struct seq_file *m) +{ + struct slabinfo sinfo; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(s, &sinfo); + + memcg_accumulate_slabinfo(s, &sinfo); + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, + sinfo.objects_per_slab, (1 << sinfo.cache_order)); + + seq_printf(m, " : tunables %4u %4u %4u", + sinfo.limit, sinfo.batchcount, sinfo.shared); + seq_printf(m, " : slabdata %6lu %6lu %6lu", + sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); + slabinfo_show_stats(m, s); + seq_putc(m, '\n'); + return 0; +} + +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (!is_root_cache(s)) + return 0; + return cache_show(s, m); +} + +/* + * slabinfo_op - iterator that generates /proc/slabinfo + * + * Output layout: + * cache-name + * num-active-objs + * total-objs + * object size + * num-active-slabs + * total-slabs + * num-pages-per-slab + * + further values on SMP and with statistics enabled + */ +static const struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return 
seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .write = slabinfo_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); +#endif /* CONFIG_SLABINFO */ diff --git a/mm/slob.c b/mm/slob.c index 1e921c5e957..a99fdf7a090 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -28,9 +28,8 @@ * from kmalloc are prepended with a 4-byte header with the kmalloc size. * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls * alloc_pages() directly, allocating compound pages so the page order - * does not have to be separately tracked, and also stores the exact - * allocation size in page->private so that it can be used to accurately - * provide ksize(). These objects are detected in kfree() because slob_page() + * does not have to be separately tracked. + * These objects are detected in kfree() because PageSlab() * is false for them. * * SLAB is emulated on top of SLOB by simply calling constructors and @@ -59,7 +58,6 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include "slab.h" #include <linux/mm.h> #include <linux/swap.h> /* struct reclaim_state */ @@ -74,6 +72,7 @@ #include <linux/atomic.h> +#include "slab.h" /* * slob_block has a field 'units', which indicates size of block if +ve, * or offset of next block if -ve (in SLOB_UNITs). @@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp) #define SLOB_UNIT sizeof(slob_t) #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) -#define SLOB_ALIGN L1_CACHE_BYTES /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which @@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) if (likely(order)) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - if (ret) { - struct page *page; - page = virt_to_page(ret); - page->private = size; - } trace_kmalloc_node(caller, ret, size, PAGE_SIZE << order, gfp, node); @@ -506,7 +499,7 @@ void kfree(const void *block) unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else - put_page(sp); + __free_pages(sp, compound_order(sp)); } EXPORT_SYMBOL(kfree); @@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree); size_t ksize(const void *block) { struct page *sp; + int align; + unsigned int *m; BUG_ON(!block); if (unlikely(block == ZERO_SIZE_PTR)) return 0; sp = virt_to_page(block); - if (PageSlab(sp)) { - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); - unsigned int *m = (unsigned int *)(block - align); - return SLOB_UNITS(*m) * SLOB_UNIT; - } else - return sp->private; + if (unlikely(!PageSlab(sp))) + return PAGE_SIZE << compound_order(sp); + + align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; } EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - size_t align = c->size; - if (flags & SLAB_DESTROY_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } c->flags = flags; - /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_HWCACHE_ALIGN) ? 
SLOB_ALIGN : 0; - if (c->align < ARCH_SLAB_MINALIGN) - c->align = ARCH_SLAB_MINALIGN; - if (c->align < align) - c->align = align; - return 0; } @@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, PAGE_SIZE << get_order(c->size), flags, node); } @@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); -unsigned int kmem_cache_size(struct kmem_cache *c) -{ - return c->size; -} -EXPORT_SYMBOL(kmem_cache_size); - int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/mm/slub.c b/mm/slub.c index 487f0bdd53c..ba2ca53f6c3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -31,6 +31,7 @@ #include <linux/fault-inject.h> #include <linux/stacktrace.h> #include <linux/prefetch.h> +#include <linux/memcontrol.h> #include <trace/events/kmem.h> @@ -112,9 +113,6 @@ * the fast path and disables lockless freelists. */ -#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) - static inline int kmem_cache_debug(struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG @@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define __OBJECT_POISON 0x80000000UL /* Poison object */ #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ -static int kmem_size = sizeof(struct kmem_cache); - #ifdef CONFIG_SMP static struct notifier_block slab_notifier; #endif @@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { } +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing( if (!check_object(s, page, object, SLUB_RED_ACTIVE)) goto out; - if (unlikely(s != page->slab)) { + if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); - } else if (!page->slab) { + } else if (!page->slab_cache) { printk(KERN_ERR "SLUB <none>: no slab for object 0x%p.\n", object); @@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; + order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - page->slab = s; + memcg_bind_pages(s, order); + page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); @@ -1365,7 +1365,7 @@ static 
struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; for_each_object(p, s, start, page->objects) { @@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); + + memcg_release_pages(s, order); reset_page_mapcount(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); + __free_memcg_kmem_pages(page, order); } #define need_reserve_slab_rcu \ @@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h) else page = container_of((struct list_head *)h, struct page, lru); - __free_slab(page->slab, page); + __free_slab(page->slab_cache, page); } static void free_slab(struct kmem_cache *s, struct page *page) @@ -1872,12 +1874,14 @@ redo: /* * Unfreeze all the cpu partial slabs. * - * This function must be called with interrupt disabled. + * This function must be called with interrupts disabled + * for the cpu using c (or some other guarantee must be there + * to guarantee no concurrent accesses). */ -static void unfreeze_partials(struct kmem_cache *s) +static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct kmem_cache_node *n = NULL, *n2 = NULL; - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); struct page *page, *discard_page = NULL; while ((page = c->partial)) { @@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) * set to the per node partial list. */ local_irq_save(flags); - unfreeze_partials(s); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); local_irq_restore(flags); oldpage = NULL; pobjects = 0; @@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) if (c->page) flush_slab(s, c); - unfreeze_partials(s); + unfreeze_partials(s, c); } } @@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, if (slab_pre_alloc_hook(s, gfpflags)) return NULL; + s = memcg_kmem_get_cache(s, gfpflags); redo: /* @@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, void *prior; void **object = (void *)x; int was_frozen; - int inuse; struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; @@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; do { + if (unlikely(n)) { + spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } prior = page->freelist; counters = page->counters; set_freepointer(s, object, prior); new.counters = counters; was_frozen = new.frozen; new.inuse--; - if ((!new.inuse || !prior) && !was_frozen && !n) { + if ((!new.inuse || !prior) && !was_frozen) { if (!kmem_cache_debug(s) && !prior) @@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } } - inuse = new.inuse; } while (!cmpxchg_double_slab(s, page, prior, counters, @@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; } + if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + goto slab_empty; + /* - * was_frozen may have been set after we acquired the list_lock in - * an earlier loop. So we need to check it here again. + * Objects left in the slab. If it was not on the partial list before + * then add it. 
*/ - if (was_frozen) - stat(s, FREE_FROZEN); - else { - if (unlikely(!inuse && n->nr_partial > s->min_partial)) - goto slab_empty; - - /* - * Objects left in the slab. If it was not on the partial list before - * then add it. - */ - if (unlikely(!prior)) { - remove_full(s, page); - add_partial(n, page, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } + if (kmem_cache_debug(s) && unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); return; @@ -2619,19 +2618,10 @@ redo: void kmem_cache_free(struct kmem_cache *s, void *x) { - struct page *page; - - page = virt_to_head_page(x); - - if (kmem_cache_debug(s) && page->slab != s) { - pr_err("kmem_cache_free: Wrong slab cache. %s but object" - " is from %s\n", page->slab->name, s->name); - WARN_ON_ONCE(1); + s = cache_from_obj(s, x); + if (!s) return; - } - - slab_free(s, page, x, _RET_IP_); - + slab_free(s, virt_to_head_page(x), x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved) return -ENOSYS; } -/* - * Figure out what the alignment of the objects will be. - */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static void init_kmem_cache_node(struct kmem_cache_node *n) { @@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->object_size; - unsigned long align = s->align; int order; /* @@ -3000,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) #endif /* - * Determine the alignment based on various parameters that the - * user specified and the dynamic determination of cache line size - * on bootup. - */ - align = calculate_alignment(flags, align, s->object_size); - s->align = align; - - /* * SLUB stores one object immediately after another beginning from * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. */ - size = ALIGN(size, align); + size = ALIGN(size, s->align); s->size = size; if (forced_order >= 0) order = forced_order; @@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->max = s->oo; return !!oo_objects(s->oo); - } static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) @@ -3127,15 +3081,6 @@ error: return -EINVAL; } -/* - * Determine the size of a slab object - */ -unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->object_size; -} -EXPORT_SYMBOL(kmem_cache_size); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { @@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) { int rc = kmem_cache_close(s); - if (!rc) + if (!rc) { + /* + * We do the same lock strategy around sysfs_slab_add, see + * __kmem_cache_create. 
Because this is pretty much the last + * operation we do and the lock will be released shortly after + * that in slab_common.c, we could just move sysfs_slab_remove + * to a later point in common code. We should do that when we + * have a common sysfs framework for all allocators. + */ + mutex_unlock(&slab_mutex); sysfs_slab_remove(s); + mutex_lock(&slab_mutex); + } return rc; } @@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *__init create_kmalloc_cache(const char *name, - int size, unsigned int flags) -{ - struct kmem_cache *s; - - s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - - s->name = name; - s->size = s->object_size = size; - s->align = ARCH_KMALLOC_MINALIGN; - - /* - * This function is called with IRQs disabled during early-boot on - * single CPU so there's no need to take slab_mutex here. - */ - if (kmem_cache_open(s, flags)) - goto panic; - - list_add(&s->list, &slab_caches); - return s; - -panic: - panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); - return NULL; -} - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3424,7 +3354,7 @@ size_t ksize(const void *object) return PAGE_SIZE << compound_order(page); } - return slab_ksize(page->slab); + return slab_ksize(page->slab_cache); } EXPORT_SYMBOL(ksize); @@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x) } slab_lock(page); - if (on_freelist(page->slab, page, object)) { - object_err(page->slab, page, object, "Object is on free-list"); + if (on_freelist(page->slab_cache, page, object)) { + object_err(page->slab_cache, page, object, "Object is on free-list"); rv = false; } else { rv = true; @@ -3478,10 +3408,10 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kmemleak_free(x); - __free_pages(page, compound_order(page)); + __free_memcg_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self, /* * Used for early kmem_cache structures that were allocated using - * the page allocator + * the page allocator. Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. 
*/ -static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - list_add(&s->list, &slab_caches); - s->refcount = -1; + memcpy(s, static_cache, kmem_cache->object_size); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) if (n) { list_for_each_entry(p, &n->partial, lru) - p->slab = s; + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG list_for_each_entry(p, &n->full, lru) - p->slab = s; + p->slab_cache = s; #endif } } + list_add(&s->list, &slab_caches); + return s; } void __init kmem_cache_init(void) { + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; int i; - int caches = 0; - struct kmem_cache *temp_kmem_cache; - int order; - struct kmem_cache *temp_kmem_cache_node; - unsigned long kmalloc_size; + int caches = 2; if (debug_guardpage_minorder()) slub_max_order = 0; - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); - - /* Allocate two kmem_caches from the page allocator */ - kmalloc_size = ALIGN(kmem_size, cache_line_size()); - order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); - - /* - * Must first have the slab cache available for the allocations of the - * struct kmem_cache_node's. There is special bootstrap code in - * kmem_cache_open for slab_state == DOWN. - */ - kmem_cache_node = (void *)kmem_cache + kmalloc_size; + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; - kmem_cache_node->name = "kmem_cache_node"; - kmem_cache_node->size = kmem_cache_node->object_size = - sizeof(struct kmem_cache_node); - kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; - temp_kmem_cache = kmem_cache; - kmem_cache->name = "kmem_cache"; - kmem_cache->size = kmem_cache->object_size = kmem_size; - kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); - kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache, temp_kmem_cache, kmem_size); + kmem_cache = bootstrap(&boot_kmem_cache); /* * Allocate kmem_cache_node properly from the kmem_cache slab. * kmem_cache_node is separately allocated so no need to * update any list pointers. 
*/ - temp_kmem_cache_node = kmem_cache_node; - - kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); - - kmem_cache_bootstrap_fixup(kmem_cache_node); - - caches++; - kmem_cache_bootstrap_fixup(kmem_cache); - caches++; - /* Free temporary boot structure */ - free_pages((unsigned long)temp_kmem_cache, order); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ @@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, +static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size, if (s->size - size >= sizeof(void *)) continue; + if (!cache_match_memcg(s, memcg)) + continue; + return s; } return NULL; } -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(size, align, flags, name, ctor); + s = find_mergeable(memcg, size, align, flags, name, ctor); if (s) { s->refcount++; /* @@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) if (err) return err; + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + + memcg_propagate_slab_attrs(s); mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); mutex_lock(&slab_mutex); @@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg(s, i); + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } +static void memcg_propagate_slab_attrs(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + + if (!is_root_cache(s)) + return; + + /* + * This mean this cache had no attribute written. 
Therefore, no point + * in copying default values around + */ + if (!s->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. + */ + if (buffer) + buf = buffer; + else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(s->memcg_params->root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s) if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; - int unmergeable; - - if (slab_state < FULL) - /* Defer until later */ - return 0; + int unmergeable = slab_unmergeable(s); - unmergeable = slab_unmergeable(s); if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) -{ - seq_puts(m, "slabinfo - version: 2.1\n"); - seq_puts(m, "# name <active_objs> <num_objs> <object_size> " - "<objperslab> <pagesperslab>"); - seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); - seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - -static int s_show(struct seq_file *m, void *p) +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_partials = 0; unsigned long nr_slabs = 0; - unsigned long nr_inuse = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; - struct kmem_cache *s; int node; - s = list_entry(p, struct kmem_cache, list); - for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p) nr_free += count_partial(n, count_free); } - nr_inuse = nr_objs - nr_free; - - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, oo_objects(s->oo), - (1 << oo_order(s->oo))); - seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); - seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, - 0UL); - seq_putc(m, '\n'); - return 0; + sinfo->active_objs = nr_objs - nr_free; + 
sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); } -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static int slabinfo_open(struct inode *inode, struct file *file) +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) { - return seq_open(file, &slabinfo_op); } -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init slab_proc_init(void) +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { - proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); - return 0; + return -EIO; } -module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ diff --git a/mm/truncate.c b/mm/truncate.c index d51ce92d6e8..c75b736e54b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -577,29 +577,6 @@ void truncate_setsize(struct inode *inode, loff_t newsize) EXPORT_SYMBOL(truncate_setsize); /** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @newsize: file offset to start truncating - * - * This function is deprecated and truncate_setsize or truncate_pagecache - * should be used instead, together with filesystem specific block truncation. - */ -int vmtruncate(struct inode *inode, loff_t newsize) -{ - int error; - - error = inode_newsize_ok(inode, newsize); - if (error) - return error; - - truncate_setsize(inode, newsize); - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -} -EXPORT_SYMBOL(vmtruncate); - -/** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole diff --git a/mm/vmscan.c b/mm/vmscan.c index 7f3096137b8..196709f5ee5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) } /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get resheduled. When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct zone *zone, int file, struct scan_control *sc) @@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, isolated = zone_page_state(zone, NR_ISOLATED_ANON); } + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + return isolated > inactive; } @@ -2440,12 +2452,16 @@ static bool zone_balanced(struct zone *zone, int order, } /* - * pgdat_balanced is used when checking if a node is balanced for high-order - * allocations. Only zones that meet watermarks and are in a zone allowed - * by the callers classzone_idx are added to balanced_pages. The total of - * balanced pages must be at least 25% of the zones allowed by classzone_idx - * for the node to be considered balanced. 
Forcing all zones to be balanced - * for high orders can cause excessive reclaim when there are imbalanced zones. + * pgdat_balanced() is used when checking if a node is balanced. + * + * For order-0, all zones must be balanced! + * + * For high-order allocations only zones that meet watermarks and are in a + * zone allowed by the callers classzone_idx are added to balanced_pages. The + * total of balanced pages must be at least 25% of the zones allowed by + * classzone_idx for the node to be considered balanced. Forcing all zones to + * be balanced for high orders can cause excessive reclaim when there are + * imbalanced zones. * The choice of 25% is due to * o a 16M DMA zone that is balanced will not balance a zone on any * reasonable sized machine @@ -2455,17 +2471,43 @@ static bool zone_balanced(struct zone *zone, int order, * Similarly, on x86-64 the Normal zone would need to be at least 1G * to balance a node on its own. These seemed like reasonable ratios. */ -static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, - int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { unsigned long present_pages = 0; + unsigned long balanced_pages = 0; int i; - for (i = 0; i <= classzone_idx; i++) - present_pages += pgdat->node_zones[i].present_pages; + /* Check the watermark levels */ + for (i = 0; i <= classzone_idx; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; - /* A special case here: if zone has no page, we think it's balanced */ - return balanced_pages >= (present_pages >> 2); + present_pages += zone->present_pages; + + /* + * A special case here: + * + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. Effectively, it considers them balanced so + * they must be considered balanced here as well! + */ + if (zone->all_unreclaimable) { + balanced_pages += zone->present_pages; + continue; + } + + if (zone_balanced(zone, order, 0, i)) + balanced_pages += zone->present_pages; + else if (!order) + return false; + } + + if (order) + return balanced_pages >= (present_pages >> 2); + else + return true; } /* @@ -2477,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, int classzone_idx) { - int i; - unsigned long balanced = 0; - bool all_zones_ok = true; - /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) return false; @@ -2499,39 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, return false; } - /* Check the watermark levels */ - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* - * balance_pgdat() skips over all_unreclaimable after - * DEF_PRIORITY. Effectively, it considers them balanced so - * they must be considered balanced here as well if kswapd - * is to sleep - */ - if (zone->all_unreclaimable) { - balanced += zone->present_pages; - continue; - } - - if (!zone_balanced(zone, order, 0, i)) - all_zones_ok = false; - else - balanced += zone->present_pages; - } - - /* - * For high-order requests, the balanced zones must contain at least - * 25% of the nodes pages for kswapd to sleep. 
For order-0, all zones - * must be balanced - */ - if (order) - return pgdat_balanced(pgdat, balanced, classzone_idx); - else - return all_zones_ok; + return pgdat_balanced(pgdat, order, classzone_idx); } /* @@ -2558,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - int all_zones_ok; - unsigned long balanced; + struct zone *unbalanced_zone; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; @@ -2592,8 +2597,7 @@ loop_again: unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; - all_zones_ok = 1; - balanced = 0; + unbalanced_zone = NULL; /* * Scan in the highmem->dma direction for the highest @@ -2731,7 +2735,7 @@ loop_again: } if (!zone_balanced(zone, testorder, 0, end_zone)) { - all_zones_ok = 0; + unbalanced_zone = zone; /* * We are still under min water mark. This * means that we have a GFP_ATOMIC allocation @@ -2749,8 +2753,6 @@ loop_again: * speculatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= *classzone_idx) - balanced += zone->present_pages; } } @@ -2764,7 +2766,7 @@ loop_again: pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) + if (pgdat_balanced(pgdat, order, *classzone_idx)) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2773,8 +2775,8 @@ loop_again: if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { if (has_under_min_watermark_zone) count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); - else - congestion_wait(BLK_RW_ASYNC, HZ/10); + else if (unbalanced_zone) + wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); } /* @@ -2788,12 +2790,7 @@ loop_again: } while (--sc.priority >= 0); out: - /* - * order-0: All zones must meet high watermark for a balanced node - * high-order: Balanced zones must make up at least 25% of the node - * for the node to be balanced - */ - if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { + if (!pgdat_balanced(pgdat, order, *classzone_idx)) { cond_resched(); try_to_freeze(); @@ -3125,8 +3122,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { int nid; |
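
To make the acceptance rule behind the new cache_from_obj()/slab_equal_or_root() helpers in the mm/slab.h hunk easier to follow, here is a hedged standalone sketch. struct toy_cache, lookup names and the memcg-style cache names are invented stand-ins for kmem_cache and virt_to_head_page(); only the "same cache or a per-memcg child of it" acceptance rule is taken from the patch.

/*
 * Toy model of the cache_from_obj() check: on free, the cache is looked
 * up from the object's page and accepted when it is either the cache the
 * caller passed in or a per-memcg child of that cache.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_cache {
	const char *name;
	struct toy_cache *root_cache;	/* NULL for a root cache */
};

static bool equal_or_root(const struct toy_cache *passed,
			  const struct toy_cache *actual)
{
	/* Mirrors slab_equal_or_root(actual, passed) in the patch. */
	return actual == passed || actual->root_cache == passed;
}

int main(void)
{
	struct toy_cache root  = { "dentry", NULL };
	struct toy_cache child = { "dentry-memcg-child", &root };
	struct toy_cache other = { "inode_cache", NULL };

	/* Freeing through the root cache an object owned by a child: accepted. */
	printf("%d\n", equal_or_root(&root, &child));	/* prints 1 */
	/* Freeing through an unrelated cache: rejected, like the pr_err path. */
	printf("%d\n", equal_or_root(&root, &other));	/* prints 0 */
	return 0;
}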
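
The calculate_alignment() helper that mm/slab_common.c now shares between the allocators is small enough to model in userspace. This is a sketch under stated assumptions: CACHE_LINE_SIZE, SLAB_MINALIGN and the flag value are illustrative constants standing in for cache_line_size(), ARCH_SLAB_MINALIGN and SLAB_HWCACHE_ALIGN; only the rounding logic follows the moved function.

/* Standalone model of the shared calculate_alignment() rounding. */
#include <stdio.h>

#define CACHE_LINE_SIZE		64UL			/* stand-in for cache_line_size() */
#define SLAB_MINALIGN		sizeof(unsigned long long)	/* stand-in for ARCH_SLAB_MINALIGN */
#define HWCACHE_ALIGN_FLAG	0x00002000UL		/* only the bit matters here */
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

static unsigned long model_alignment(unsigned long flags,
				     unsigned long align, unsigned long size)
{
	if (flags & HWCACHE_ALIGN_FLAG) {
		unsigned long ralign = CACHE_LINE_SIZE;

		/* Shrink the alignment while the object still fits twice per line. */
		while (size <= ralign / 2)
			ralign /= 2;
		if (ralign > align)
			align = ralign;
	}

	if (align < SLAB_MINALIGN)
		align = SLAB_MINALIGN;

	return ALIGN_UP(align, sizeof(void *));
}

int main(void)
{
	/* A 20-byte object with hardware-cache alignment ends up 32-byte aligned. */
	printf("%lu\n", model_alignment(HWCACHE_ALIGN_FLAG, 0, 20));	/* 32 */
	/* A 200-byte object keeps the full 64-byte cache-line alignment. */
	printf("%lu\n", model_alignment(HWCACHE_ALIGN_FLAG, 0, 200));	/* 64 */
	return 0;
}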
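
The reworked pgdat_balanced() decision in mm/vmscan.c (order-0 needs every populated, reclaimable zone balanced; high-order needs at least 25% of the considered pages in balanced zones, with all_unreclaimable zones counted as balanced) can be sketched as follows. struct zone_sample and its fields are invented stand-ins for the zone state the real function reads, and zone_balanced() is collapsed into a precomputed watermark_ok flag; only the decision logic is reproduced.

/* Toy model of the pgdat_balanced() decision after the patch. */
#include <stdbool.h>
#include <stdio.h>

struct zone_sample {
	unsigned long present_pages;
	bool populated;
	bool all_unreclaimable;
	bool watermark_ok;	/* stand-in for zone_balanced() */
};

static bool model_pgdat_balanced(const struct zone_sample *zones, int nr,
				 int order, int classzone_idx)
{
	unsigned long present = 0, balanced = 0;
	int i;

	for (i = 0; i <= classzone_idx && i < nr; i++) {
		const struct zone_sample *z = &zones[i];

		if (!z->populated)
			continue;

		present += z->present_pages;

		/* Unreclaimable zones are treated as balanced. */
		if (z->all_unreclaimable || z->watermark_ok)
			balanced += z->present_pages;
		else if (!order)
			return false;	/* order-0: every zone must pass */
	}

	/* High-order: at least 25% of the considered pages must be balanced. */
	return order ? balanced >= (present >> 2) : true;
}

int main(void)
{
	struct zone_sample zones[] = {
		{ 4096,   true, false, true  },	/* small DMA zone, balanced */
		{ 262144, true, false, false },	/* large Normal zone, not balanced */
	};

	printf("order-0 balanced: %d\n", model_pgdat_balanced(zones, 2, 0, 1));	/* 0 */
	printf("order-3 balanced: %d\n", model_pgdat_balanced(zones, 2, 3, 1));	/* 0 */
	return 0;
}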