summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig13
-rw-r--r--mm/compaction.c31
-rw-r--r--mm/highmem.c1
-rw-r--r--mm/huge_memory.c116
-rw-r--r--mm/hugetlb.c21
-rw-r--r--mm/hugetlb_cgroup.c19
-rw-r--r--mm/internal.h7
-rw-r--r--mm/kmemleak.c3
-rw-r--r--mm/ksm.c16
-rw-r--r--mm/memcontrol.c1263
-rw-r--r--mm/memory-failure.c7
-rw-r--r--mm/memory.c207
-rw-r--r--mm/memory_hotplug.c21
-rw-r--r--mm/mempolicy.c283
-rw-r--r--mm/migrate.c337
-rw-r--r--mm/mmap.c10
-rw-r--r--mm/mprotect.c149
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page-writeback.c25
-rw-r--r--mm/page_alloc.c59
-rw-r--r--mm/pgtable-generic.c9
-rw-r--r--mm/rmap.c66
-rw-r--r--mm/shmem.c20
-rw-r--r--mm/slab.c383
-rw-r--r--mm/slab.h190
-rw-r--r--mm/slab_common.c292
-rw-r--r--mm/slob.c48
-rw-r--r--mm/slub.c447
-rw-r--r--mm/truncate.c23
-rw-r--r--mm/vmscan.c129
-rw-r--r--mm/vmstat.c16
31 files changed, 3342 insertions, 871 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 71259e052ce..278e3ab1f16 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -149,7 +149,18 @@ config MOVABLE_NODE
depends on NO_BOOTMEM
depends on X86_64
depends on NUMA
- depends on BROKEN
+ default n
+ help
+ Allow a node to have only movable memory. Pages used by the kernel,
+ such as direct mapping pages cannot be migrated. So the corresponding
+ memory device cannot be hotplugged. This option allows users to
+ online all the memory of a node as movable memory so that the whole
+ node can be hotplugged. Users who don't use the memory hotplug
+ feature are fine with this option on since they don't online memory
+ as movable.
+
+ Say Y here if you want to hotplug a whole node.
+ Say N here if you want kernel to use memory on all nodes evenly.
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
diff --git a/mm/compaction.c b/mm/compaction.c
index 12979121822..6b807e46649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -17,6 +17,21 @@
#include <linux/balloon_compaction.h>
#include "internal.h"
+#ifdef CONFIG_COMPACTION
+static inline void count_compact_event(enum vm_event_item item)
+{
+ count_vm_event(item);
+}
+
+static inline void count_compact_events(enum vm_event_item item, long delta)
+{
+ count_vm_events(item, delta);
+}
+#else
+#define count_compact_event(item) do { } while (0)
+#define count_compact_events(item, delta) do { } while (0)
+#endif
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
#define CREATE_TRACE_POINTS
@@ -303,6 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);
+ count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
+ if (total_isolated)
+ count_compact_events(COMPACTISOLATED, total_isolated);
return total_isolated;
}
@@ -609,6 +627,10 @@ next_pageblock:
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+ count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+ if (nr_isolated)
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+
return low_pfn;
}
@@ -1015,14 +1037,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+ MR_COMPACTION);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
- count_vm_event(COMPACTBLOCKS);
- count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
- if (nr_remaining)
- count_vm_events(COMPACTPAGEFAILED, nr_remaining);
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
nr_remaining);
@@ -1105,7 +1124,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
if (!order || !may_enter_fs || !may_perform_io)
return rc;
- count_vm_event(COMPACTSTALL);
+ count_compact_event(COMPACTSTALL);
#ifdef CONFIG_CMA
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
diff --git a/mm/highmem.c b/mm/highmem.c
index d999077431d..b32b70cdaed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -105,6 +105,7 @@ struct page *kmap_to_page(void *vaddr)
return virt_to_page(addr);
}
+EXPORT_SYMBOL(kmap_to_page);
static void flush_all_zero_pkmaps(void)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 827d9c81305..9e894edc781 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -19,6 +19,7 @@
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/migrate.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -573,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!*hugepage_kobj)) {
- printk(KERN_ERR "hugepage: failed kobject create\n");
+ printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
return -ENOMEM;
}
err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
if (err) {
- printk(KERN_ERR "hugepage: failed register hugeage group\n");
+ printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
goto delete_obj;
}
err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
if (err) {
- printk(KERN_ERR "hugepage: failed register hugeage group\n");
+ printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
goto remove_hp_group;
}
@@ -690,7 +691,7 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
pmd = pmd_mkwrite(pmd);
@@ -848,7 +849,8 @@ out:
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -1287,6 +1289,81 @@ out:
return page;
}
+/* NUMA hinting page fault entry point for trans huge pmds */
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int target_nid;
+ int current_nid = -1;
+ bool migrated;
+ bool page_locked = false;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ page = pmd_page(pmd);
+ get_page(page);
+ current_nid = page_to_nid(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == -1) {
+ put_page(page);
+ goto clear_pmdnuma;
+ }
+
+ /* Acquire the page lock to serialise THP migrations */
+ spin_unlock(&mm->page_table_lock);
+ lock_page(page);
+ page_locked = true;
+
+ /* Confirm the PTE did not while locked */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* Migrate the THP to the requested node */
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr,
+ page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+ else {
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ goto out_unlock;
+ }
+ goto clear_pmdnuma;
+ }
+
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+
+clear_pmdnuma:
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
+ VM_BUG_ON(pmd_numa(*pmdp));
+ update_mmu_cache_pmd(vma, addr, pmdp);
+ if (page_locked)
+ unlock_page(page);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ if (current_nid != -1)
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1375,7 +1452,7 @@ out:
}
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot)
+ unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
@@ -1383,8 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
- BUG_ON(pmd_write(entry));
+ if (!prot_numa) {
+ entry = pmd_modify(entry, newprot);
+ BUG_ON(pmd_write(entry));
+ } else {
+ struct page *page = pmd_page(*pmd);
+
+ /* only check non-shared pages */
+ if (page_mapcount(page) == 1 &&
+ !pmd_numa(*pmd)) {
+ entry = pmd_mknuma(entry);
+ }
+ }
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
ret = 1;
@@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->mutex to
+ * and it won't wait on the anon_vma->root->rwsem to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush(vma, address, pmd);
@@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
+ page_xchg_last_nid(page_tail, page_last_nid(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
BUG_ON(page_mapcount(page) != 1);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
+ if (pmd_numa(*pmd))
+ entry = pte_mknuma(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
return ret;
}
-/* must be called with anon_vma->root->mutex hold */
+/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
@@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page)
BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
goto out;
ret = 0;
@@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page)
BUG_ON(PageCompound(page));
out_unlock:
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
out:
return ret;
}
@@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
- anon_vma_lock(vma->anon_vma);
+ anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
ptl = pte_lockptr(mm, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88e7293b96b..4f3ea0b1e57 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
hugetlb_init_hstates();
-
gather_bootmem_prealloc();
-
report_hugepages();
hugetlb_sysfs_init();
-
hugetlb_register_all_nodes();
+ hugetlb_cgroup_file_init();
return 0;
}
@@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order)
h->next_nid_to_free = first_node(node_states[N_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
- /*
- * Add cgroup control files only if the huge page consists
- * of more than two normal pages. This is because we use
- * page[2].lru.next for storing cgoup details.
- */
- if (order >= HUGETLB_CGROUP_MIN_ORDER)
- hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
parsed_hstate = h;
}
@@ -3016,7 +3007,7 @@ same_page:
return i ? i : -EFAULT;
}
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
@@ -3024,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
@@ -3034,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- if (huge_pmd_unshare(mm, &address, ptep))
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ pages++;
continue;
+ }
if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
set_huge_pte_at(mm, address, ptep, pte);
+ pages++;
}
}
spin_unlock(&mm->page_table_lock);
@@ -3051,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+
+ return pages << h->order;
}
int hugetlb_reserve_pages(struct inode *inode,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index b5bde7a5c01..9cea7de22ff 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
return buf;
}
-int __init hugetlb_cgroup_file_init(int idx)
+static void __init __hugetlb_cgroup_file_init(int idx)
{
char buf[32];
struct cftype *cft;
@@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
- return 0;
+ return;
+}
+
+void __init hugetlb_cgroup_file_init(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ /*
+ * Add cgroup control files only if the huge page consists
+ * of more than two normal pages. This is because we use
+ * page[2].lru.next for storing cgroup details.
+ */
+ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
+ __hugetlb_cgroup_file_init(hstate_index(h));
+ }
}
/*
diff --git a/mm/internal.h b/mm/internal.h
index 52d1fa95719..d597f94cc20 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
unsigned long flags;
+ int nr_pages = hpage_nr_pages(page);
local_irq_save(flags);
- __dec_zone_page_state(page, NR_MLOCK);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
SetPageMlocked(newpage);
- __inc_zone_page_state(newpage, NR_MLOCK);
+ __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
local_irq_restore(flags);
}
}
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a217cc54406..752a705c77c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str)
struct kmemleak_object *object;
unsigned long addr;
- addr= simple_strtoul(str, NULL, 0);
+ if (kstrtoul(str, 0, &addr))
+ return -EINVAL;
object = find_and_get_object(addr, 0);
if (!object) {
pr_info("Unknown object at 0x%08lx\n", addr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 382d930a0bf..51573858938 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1624,7 +1624,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1648,7 +1648,7 @@ again:
if (!search_new_forks || !mapcount)
break;
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
if (!mapcount)
goto out;
}
@@ -1678,7 +1678,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1697,11 +1697,11 @@ again:
ret = try_to_unmap_one(page, vma,
rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page)) {
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
goto out;
}
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1731,7 +1731,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1749,11 +1749,11 @@ again:
ret = rmap_one(page, vma, rmap_item->address, arg);
if (ret != SWAP_AGAIN) {
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
goto out;
}
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c055929c8c..09255ec8159 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
+ * Kernel Memory Controller
+ * Copyright (C) 2012 Parallels Inc. and Google Inc.
+ * Authors: Glauber Costa and Suleiman Souhlal
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -268,6 +272,10 @@ struct mem_cgroup {
};
/*
+ * the counter to account for kernel memory usage.
+ */
+ struct res_counter kmem;
+ /*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/
@@ -282,6 +290,7 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
+ unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
bool oom_lock;
atomic_t under_oom;
@@ -332,8 +341,61 @@ struct mem_cgroup {
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
#endif
+#if defined(CONFIG_MEMCG_KMEM)
+ /* analogous to slab_common's slab_caches list. per-memcg */
+ struct list_head memcg_slab_caches;
+ /* Not a spinlock, we can take a lot of time walking the list */
+ struct mutex slab_caches_mutex;
+ /* Index in the kmem_cache->memcg_params->memcg_caches array */
+ int kmemcg_id;
+#endif
+};
+
+/* internal only representation about the status of kmem accounting. */
+enum {
+ KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
+ KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+ KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
+/* We account when limit is on, but only after call sites are patched */
+#define KMEM_ACCOUNTED_MASK \
+ ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
+{
+ set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+}
+
+static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+ return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+}
+
+static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
+{
+ set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
+static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
+{
+ clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
+static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
+{
+ if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
+ set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
+}
+
+static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
+{
+ return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
+ &memcg->kmem_account_flags);
+}
+#endif
+
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
@@ -388,9 +450,13 @@ enum charge_type {
};
/* for encoding cft->private value on file */
-#define _MEM (0)
-#define _MEMSWAP (1)
-#define _OOM_TYPE (2)
+enum res_type {
+ _MEM,
+ _MEMSWAP,
+ _OOM_TYPE,
+ _KMEM,
+};
+
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
@@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
}
#endif
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * There are two main reasons for not using the css_id for this:
+ * 1) this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
+ *
+ * 2) In order not to violate the cgroup API, we would like to do all memory
+ * allocation in ->create(). At that point, we haven't yet allocated the
+ * css_id. Having a separate index prevents us from messing with the cgroup
+ * core for this
+ *
+ * The current size of the caches array is stored in
+ * memcg_limited_groups_array_size. It will double each time we have to
+ * increase it.
+ */
+static DEFINE_IDA(kmem_limited_groups);
+int memcg_limited_groups_array_size;
+
+/*
+ * MIN_SIZE is different than 1, because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be a parameter or
+ * tunable, but that is strictly not necessary.
+ *
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * this constant directly from cgroup, but it is understandable that this is
+ * better kept as an internal representation in cgroup.c. In any case, the
+ * css_id space is not getting any smaller, and we don't have to necessarily
+ * increase ours as well if it increases.
+ */
+#define MEMCG_CACHES_MIN_SIZE 4
+#define MEMCG_CACHES_MAX_SIZE 65535
+
+/*
+ * A lot of the calls to the cache allocation functions are expected to be
+ * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * conditional to this static branch, we'll have to allow modules that does
+ * kmem_cache_alloc and the such to see this symbol as well
+ */
+struct static_key memcg_kmem_enabled_key;
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
+
+static void disarm_kmem_keys(struct mem_cgroup *memcg)
+{
+ if (memcg_kmem_is_active(memcg)) {
+ static_key_slow_dec(&memcg_kmem_enabled_key);
+ ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+ }
+ /*
+ * This check can't live in kmem destruction function,
+ * since the charges will outlive the cgroup
+ */
+ WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
+}
+#else
+static void disarm_kmem_keys(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+static void disarm_static_keys(struct mem_cgroup *memcg)
+{
+ disarm_sock_keys(memcg);
+ disarm_kmem_keys(memcg);
+}
+
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
@@ -1453,6 +1588,10 @@ done:
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
+ printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
+ res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
+ res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
+ res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
}
/*
@@ -2060,20 +2199,28 @@ struct memcg_stock_pcp {
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
-/*
- * Try to consume stocked charge on this cpu. If success, one page is consumed
- * from local stock and true is returned. If the stock is 0 or charges from a
- * cgroup which is not current target, returns false. This stock will be
- * refilled.
+/**
+ * consume_stock: Try to consume stocked charge on this cpu.
+ * @memcg: memcg to consume from.
+ * @nr_pages: how many pages to charge.
+ *
+ * The charges will only happen if @memcg matches the current cpu's memcg
+ * stock, and at least @nr_pages are available in that stock. Failure to
+ * service an allocation will refill the stock.
+ *
+ * returns true if successful, false otherwise.
*/
-static bool consume_stock(struct mem_cgroup *memcg)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
bool ret = true;
+ if (nr_pages > CHARGE_BATCH)
+ return false;
+
stock = &get_cpu_var(memcg_stock);
- if (memcg == stock->cached && stock->nr_pages)
- stock->nr_pages--;
+ if (memcg == stock->cached && stock->nr_pages >= nr_pages)
+ stock->nr_pages -= nr_pages;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
@@ -2250,7 +2397,8 @@ enum {
};
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages, bool oom_check)
+ unsigned int nr_pages, unsigned int min_pages,
+ bool oom_check)
{
unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
@@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
/*
- * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
- * of regular pages (CHARGE_BATCH), or a single regular page (1).
- *
* Never reclaim on behalf of optional batching, retry with a
* single page instead.
*/
- if (nr_pages == CHARGE_BATCH)
+ if (nr_pages > min_pages)
return CHARGE_RETRY;
if (!(gfp_mask & __GFP_WAIT))
return CHARGE_WOULDBLOCK;
+ if (gfp_mask & __GFP_NORETRY)
+ return CHARGE_NOMEM;
+
ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
return CHARGE_RETRY;
@@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
* unlikely to succeed so close to the limit, and we fall back
* to regular pages anyway in case of failure.
*/
- if (nr_pages == 1 && ret)
+ if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
return CHARGE_RETRY;
/*
@@ -2371,7 +2519,7 @@ again:
memcg = *ptr;
if (mem_cgroup_is_root(memcg))
goto done;
- if (nr_pages == 1 && consume_stock(memcg))
+ if (consume_stock(memcg, nr_pages))
goto done;
css_get(&memcg->css);
} else {
@@ -2396,7 +2544,7 @@ again:
rcu_read_unlock();
goto done;
}
- if (nr_pages == 1 && consume_stock(memcg)) {
+ if (consume_stock(memcg, nr_pages)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
@@ -2431,7 +2579,8 @@ again:
nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
+ oom_check);
switch (ret) {
case CHARGE_OK:
break;
@@ -2624,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
memcg_check_events(memcg, page);
}
+static DEFINE_MUTEX(set_limit_mutex);
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
+{
+ return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
+ (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+}
+
+/*
+ * This is a bit cumbersome, but it is rarely used and avoids a backpointer
+ * in the memcg_cache_params struct.
+ */
+static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
+{
+ struct kmem_cache *cachep;
+
+ VM_BUG_ON(p->is_root_cache);
+ cachep = p->root_cache;
+ return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+}
+
+#ifdef CONFIG_SLABINFO
+static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ struct memcg_cache_params *params;
+
+ if (!memcg_can_account_kmem(memcg))
+ return -EIO;
+
+ print_slabinfo_header(m);
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list)
+ cache_show(memcg_params_to_cache(params), m);
+ mutex_unlock(&memcg->slab_caches_mutex);
+
+ return 0;
+}
+#endif
+
+static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
+{
+ struct res_counter *fail_res;
+ struct mem_cgroup *_memcg;
+ int ret = 0;
+ bool may_oom;
+
+ ret = res_counter_charge(&memcg->kmem, size, &fail_res);
+ if (ret)
+ return ret;
+
+ /*
+ * Conditions under which we can wait for the oom_killer. Those are
+ * the same conditions tested by the core page allocator
+ */
+ may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
+
+ _memcg = memcg;
+ ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
+ &_memcg, may_oom);
+
+ if (ret == -EINTR) {
+ /*
+ * __mem_cgroup_try_charge() chosed to bypass to root due to
+ * OOM kill or fatal signal. Since our only options are to
+ * either fail the allocation or charge it to this cgroup, do
+ * it as a temporary condition. But we can't fail. From a
+ * kmem/slab perspective, the cache has already been selected,
+ * by mem_cgroup_kmem_get_cache(), so it is too late to change
+ * our minds.
+ *
+ * This condition will only trigger if the task entered
+ * memcg_charge_kmem in a sane state, but was OOM-killed during
+ * __mem_cgroup_try_charge() above. Tasks that were already
+ * dying when the allocation triggers should have been already
+ * directed to the root cgroup in memcontrol.h
+ */
+ res_counter_charge_nofail(&memcg->res, size, &fail_res);
+ if (do_swap_account)
+ res_counter_charge_nofail(&memcg->memsw, size,
+ &fail_res);
+ ret = 0;
+ } else if (ret)
+ res_counter_uncharge(&memcg->kmem, size);
+
+ return ret;
+}
+
+static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
+{
+ res_counter_uncharge(&memcg->res, size);
+ if (do_swap_account)
+ res_counter_uncharge(&memcg->memsw, size);
+
+ /* Not down to 0 */
+ if (res_counter_uncharge(&memcg->kmem, size))
+ return;
+
+ if (memcg_kmem_test_and_clear_dead(memcg))
+ mem_cgroup_put(memcg);
+}
+
+void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
+{
+ if (!memcg)
+ return;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+/*
+ * helper for acessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+int memcg_cache_id(struct mem_cgroup *memcg)
+{
+ return memcg ? memcg->kmemcg_id : -1;
+}
+
+/*
+ * This ends up being protected by the set_limit mutex, during normal
+ * operation, because that is its main call site.
+ *
+ * But when we create a new cache, we can call this as well if its parent
+ * is kmem-limited. That will have to hold set_limit_mutex as well.
+ */
+int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+{
+ int num, ret;
+
+ num = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (num < 0)
+ return num;
+ /*
+ * After this point, kmem_accounted (that we test atomically in
+ * the beginning of this conditional), is no longer 0. This
+ * guarantees only one process will set the following boolean
+ * to true. We don't need test_and_set because we're protected
+ * by the set_limit_mutex anyway.
+ */
+ memcg_kmem_set_activated(memcg);
+
+ ret = memcg_update_all_caches(num+1);
+ if (ret) {
+ ida_simple_remove(&kmem_limited_groups, num);
+ memcg_kmem_clear_activated(memcg);
+ return ret;
+ }
+
+ memcg->kmemcg_id = num;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ mutex_init(&memcg->slab_caches_mutex);
+ return 0;
+}
+
+static size_t memcg_caches_array_size(int num_groups)
+{
+ ssize_t size;
+ if (num_groups <= 0)
+ return 0;
+
+ size = 2 * num_groups;
+ if (size < MEMCG_CACHES_MIN_SIZE)
+ size = MEMCG_CACHES_MIN_SIZE;
+ else if (size > MEMCG_CACHES_MAX_SIZE)
+ size = MEMCG_CACHES_MAX_SIZE;
+
+ return size;
+}
+
+/*
+ * We should update the current array size iff all caches updates succeed. This
+ * can only be done from the slab side. The slab mutex needs to be held when
+ * calling this.
+ */
+void memcg_update_array_size(int num)
+{
+ if (num > memcg_limited_groups_array_size)
+ memcg_limited_groups_array_size = memcg_caches_array_size(num);
+}
+
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
+{
+ struct memcg_cache_params *cur_params = s->memcg_params;
+
+ VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+
+ if (num_groups > memcg_limited_groups_array_size) {
+ int i;
+ ssize_t size = memcg_caches_array_size(num_groups);
+
+ size *= sizeof(void *);
+ size += sizeof(struct memcg_cache_params);
+
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
+ if (!s->memcg_params) {
+ s->memcg_params = cur_params;
+ return -ENOMEM;
+ }
+
+ s->memcg_params->is_root_cache = true;
+
+ /*
+ * There is the chance it will be bigger than
+ * memcg_limited_groups_array_size, if we failed an allocation
+ * in a cache, in which case all caches updated before it, will
+ * have a bigger array.
+ *
+ * But if that is the case, the data after
+ * memcg_limited_groups_array_size is certainly unused
+ */
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
+ if (!cur_params->memcg_caches[i])
+ continue;
+ s->memcg_params->memcg_caches[i] =
+ cur_params->memcg_caches[i];
+ }
+
+ /*
+ * Ideally, we would wait until all caches succeed, and only
+ * then free the old one. But this is not worth the extra
+ * pointer per-cache we'd have to have for this.
+ *
+ * It is not a big deal if some caches are left with a size
+ * bigger than the others. And all updates will reset this
+ * anyway.
+ */
+ kfree(cur_params);
+ }
+ return 0;
+}
+
+int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
+{
+ size_t size = sizeof(struct memcg_cache_params);
+
+ if (!memcg_kmem_enabled())
+ return 0;
+
+ if (!memcg)
+ size += memcg_limited_groups_array_size * sizeof(void *);
+
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
+ if (!s->memcg_params)
+ return -ENOMEM;
+
+ if (memcg) {
+ s->memcg_params->memcg = memcg;
+ s->memcg_params->root_cache = root_cache;
+ }
+ return 0;
+}
+
+void memcg_release_cache(struct kmem_cache *s)
+{
+ struct kmem_cache *root;
+ struct mem_cgroup *memcg;
+ int id;
+
+ /*
+ * This happens, for instance, when a root cache goes away before we
+ * add any memcg.
+ */
+ if (!s->memcg_params)
+ return;
+
+ if (s->memcg_params->is_root_cache)
+ goto out;
+
+ memcg = s->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
+
+ root = s->memcg_params->root_cache;
+ root->memcg_params->memcg_caches[id] = NULL;
+ mem_cgroup_put(memcg);
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_del(&s->memcg_params->list);
+ mutex_unlock(&memcg->slab_caches_mutex);
+
+out:
+ kfree(s->memcg_params);
+}
+
+/*
+ * During the creation a new cache, we need to disable our accounting mechanism
+ * altogether. This is true even if we are not creating, but rather just
+ * enqueing new caches to be created.
+ *
+ * This is because that process will trigger allocations; some visible, like
+ * explicit kmallocs to auxiliary data structures, name strings and internal
+ * cache structures; some well concealed, like INIT_WORK() that can allocate
+ * objects during debug.
+ *
+ * If any allocation happens during memcg_kmem_get_cache, we will recurse back
+ * to it. This may not be a bounded recursion: since the first cache creation
+ * failed to complete (waiting on the allocation), we'll just try to create the
+ * cache again, failing at the same point.
+ *
+ * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
+ * memcg_kmem_skip_account. So we enclose anything that might allocate memory
+ * inside the following two functions.
+ */
+static inline void memcg_stop_kmem_account(void)
+{
+ VM_BUG_ON(!current->mm);
+ current->memcg_kmem_skip_account++;
+}
+
+static inline void memcg_resume_kmem_account(void)
+{
+ VM_BUG_ON(!current->mm);
+ current->memcg_kmem_skip_account--;
+}
+
+static void kmem_cache_destroy_work_func(struct work_struct *w)
+{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *p;
+
+ p = container_of(w, struct memcg_cache_params, destroy);
+
+ cachep = memcg_params_to_cache(p);
+
+ /*
+ * If we get down to 0 after shrink, we could delete right away.
+ * However, memcg_release_pages() already puts us back in the workqueue
+ * in that case. If we proceed deleting, we'll get a dangling
+ * reference, and removing the object from the workqueue in that case
+ * is unnecessary complication. We are not a fast path.
+ *
+ * Note that this case is fundamentally different from racing with
+ * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
+ * kmem_cache_shrink, not only we would be reinserting a dead cache
+ * into the queue, but doing so from inside the worker racing to
+ * destroy it.
+ *
+ * So if we aren't down to zero, we'll just schedule a worker and try
+ * again
+ */
+ if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
+ kmem_cache_shrink(cachep);
+ if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+ return;
+ } else
+ kmem_cache_destroy(cachep);
+}
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+ if (!cachep->memcg_params->dead)
+ return;
+
+ /*
+ * There are many ways in which we can get here.
+ *
+ * We can get to a memory-pressure situation while the delayed work is
+ * still pending to run. The vmscan shrinkers can then release all
+ * cache memory and get us to destruction. If this is the case, we'll
+ * be executed twice, which is a bug (the second time will execute over
+ * bogus data). In this case, cancelling the work should be fine.
+ *
+ * But we can also get here from the worker itself, if
+ * kmem_cache_shrink is enough to shake all the remaining objects and
+ * get the page count to 0. In this case, we'll deadlock if we try to
+ * cancel the work (the worker runs with an internal lock held, which
+ * is the same lock we would hold for cancel_work_sync().)
+ *
+ * Since we can't possibly know who got us here, just refrain from
+ * running if there is already work pending
+ */
+ if (work_pending(&cachep->memcg_params->destroy))
+ return;
+ /*
+ * We have to defer the actual destroying to a workqueue, because
+ * we might currently be in a context that cannot sleep.
+ */
+ schedule_work(&cachep->memcg_params->destroy);
+}
+
+static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
+{
+ char *name;
+ struct dentry *dentry;
+
+ rcu_read_lock();
+ dentry = rcu_dereference(memcg->css.cgroup->dentry);
+ rcu_read_unlock();
+
+ BUG_ON(dentry == NULL);
+
+ name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
+ memcg_cache_id(memcg), dentry->d_name.name);
+
+ return name;
+}
+
+static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
+{
+ char *name;
+ struct kmem_cache *new;
+
+ name = memcg_cache_name(memcg, s);
+ if (!name)
+ return NULL;
+
+ new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
+ (s->flags & ~SLAB_PANIC), s->ctor, s);
+
+ if (new)
+ new->allocflags |= __GFP_KMEMCG;
+
+ kfree(name);
+ return new;
+}
+
+/*
+ * This lock protects updaters, not readers. We want readers to be as fast as
+ * they can, and they will either see NULL or a valid cache value. Our model
+ * allow them to see NULL, in which case the root memcg will be selected.
+ *
+ * We need this lock because multiple allocations to the same cache from a non
+ * will span more than one worker. Only one of them can create the cache.
+ */
+static DEFINE_MUTEX(memcg_cache_mutex);
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ struct kmem_cache *new_cachep;
+ int idx;
+
+ BUG_ON(!memcg_can_account_kmem(memcg));
+
+ idx = memcg_cache_id(memcg);
+
+ mutex_lock(&memcg_cache_mutex);
+ new_cachep = cachep->memcg_params->memcg_caches[idx];
+ if (new_cachep)
+ goto out;
+
+ new_cachep = kmem_cache_dup(memcg, cachep);
+ if (new_cachep == NULL) {
+ new_cachep = cachep;
+ goto out;
+ }
+
+ mem_cgroup_get(memcg);
+ atomic_set(&new_cachep->memcg_params->nr_pages , 0);
+
+ cachep->memcg_params->memcg_caches[idx] = new_cachep;
+ /*
+ * the readers won't lock, make sure everybody sees the updated value,
+ * so they won't put stuff in the queue again for no reason
+ */
+ wmb();
+out:
+ mutex_unlock(&memcg_cache_mutex);
+ return new_cachep;
+}
+
+void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+{
+ struct kmem_cache *c;
+ int i;
+
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
+ return;
+
+ /*
+ * If the cache is being destroyed, we trust that there is no one else
+ * requesting objects from it. Even if there are, the sanity checks in
+ * kmem_cache_destroy should caught this ill-case.
+ *
+ * Still, we don't want anyone else freeing memcg_caches under our
+ * noses, which can happen if a new memcg comes to life. As usual,
+ * we'll take the set_limit_mutex to protect ourselves against this.
+ */
+ mutex_lock(&set_limit_mutex);
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
+ c = s->memcg_params->memcg_caches[i];
+ if (!c)
+ continue;
+
+ /*
+ * We will now manually delete the caches, so to avoid races
+ * we need to cancel all pending destruction workers and
+ * proceed with destruction ourselves.
+ *
+ * kmem_cache_destroy() will call kmem_cache_shrink internally,
+ * and that could spawn the workers again: it is likely that
+ * the cache still have active pages until this very moment.
+ * This would lead us back to mem_cgroup_destroy_cache.
+ *
+ * But that will not execute at all if the "dead" flag is not
+ * set, so flip it down to guarantee we are in control.
+ */
+ c->memcg_params->dead = false;
+ cancel_work_sync(&c->memcg_params->destroy);
+ kmem_cache_destroy(c);
+ }
+ mutex_unlock(&set_limit_mutex);
+}
+
+struct create_work {
+ struct mem_cgroup *memcg;
+ struct kmem_cache *cachep;
+ struct work_struct work;
+};
+
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params;
+
+ if (!memcg_kmem_is_active(memcg))
+ return;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ cachep = memcg_params_to_cache(params);
+ cachep->memcg_params->dead = true;
+ INIT_WORK(&cachep->memcg_params->destroy,
+ kmem_cache_destroy_work_func);
+ schedule_work(&cachep->memcg_params->destroy);
+ }
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+static void memcg_create_cache_work_func(struct work_struct *w)
+{
+ struct create_work *cw;
+
+ cw = container_of(w, struct create_work, work);
+ memcg_create_kmem_cache(cw->memcg, cw->cachep);
+ /* Drop the reference gotten when we enqueued. */
+ css_put(&cw->memcg->css);
+ kfree(cw);
+}
+
+/*
+ * Enqueue the creation of a per-memcg kmem_cache.
+ * Called with rcu_read_lock.
+ */
+static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ struct create_work *cw;
+
+ cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+ if (cw == NULL)
+ return;
+
+ /* The corresponding put will be done in the workqueue. */
+ if (!css_tryget(&memcg->css)) {
+ kfree(cw);
+ return;
+ }
+
+ cw->memcg = memcg;
+ cw->cachep = cachep;
+
+ INIT_WORK(&cw->work, memcg_create_cache_work_func);
+ schedule_work(&cw->work);
+}
+
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ /*
+ * We need to stop accounting when we kmalloc, because if the
+ * corresponding kmalloc cache is not yet created, the first allocation
+ * in __memcg_create_cache_enqueue will recurse.
+ *
+ * However, it is better to enclose the whole function. Depending on
+ * the debugging options enabled, INIT_WORK(), for instance, can
+ * trigger an allocation. This too, will make us recurse. Because at
+ * this point we can't allow ourselves back into memcg_kmem_get_cache,
+ * the safest choice is to do it like this, wrapping the whole function.
+ */
+ memcg_stop_kmem_account();
+ __memcg_create_cache_enqueue(memcg, cachep);
+ memcg_resume_kmem_account();
+}
+/*
+ * Return the kmem_cache we're supposed to use for a slab allocation.
+ * We try to use the current memcg's version of the cache.
+ *
+ * If the cache does not exist yet, if we are the first user of it,
+ * we either create it immediately, if possible, or create it asynchronously
+ * in a workqueue.
+ * In the latter case, we will let the current allocation go through with
+ * the original cache.
+ *
+ * Can't be called in interrupt context or from kernel threads.
+ * This function needs to be called with rcu_read_lock() held.
+ */
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
+ gfp_t gfp)
+{
+ struct mem_cgroup *memcg;
+ int idx;
+
+ VM_BUG_ON(!cachep->memcg_params);
+ VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+
+ if (!current->mm || current->memcg_kmem_skip_account)
+ return cachep;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+ rcu_read_unlock();
+
+ if (!memcg_can_account_kmem(memcg))
+ return cachep;
+
+ idx = memcg_cache_id(memcg);
+
+ /*
+ * barrier to mare sure we're always seeing the up to date value. The
+ * code updating memcg_caches will issue a write barrier to match this.
+ */
+ read_barrier_depends();
+ if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
+ /*
+ * If we are in a safe context (can wait, and not in interrupt
+ * context), we could be be predictable and return right away.
+ * This would guarantee that the allocation being performed
+ * already belongs in the new cache.
+ *
+ * However, there are some clashes that can arrive from locking.
+ * For instance, because we acquire the slab_mutex while doing
+ * kmem_cache_dup, this means no further allocation could happen
+ * with the slab_mutex held.
+ *
+ * Also, because cache creation issue get_online_cpus(), this
+ * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+ * that ends up reversed during cpu hotplug. (cpuset allocates
+ * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
+ * better to defer everything.
+ */
+ memcg_create_cache_enqueue(memcg, cachep);
+ return cachep;
+ }
+
+ return cachep->memcg_params->memcg_caches[idx];
+}
+EXPORT_SYMBOL(__memcg_kmem_get_cache);
+
+/*
+ * We need to verify if the allocation against current->mm->owner's memcg is
+ * possible for the given order. But the page is not allocated yet, so we'll
+ * need a further commit step to do the final arrangements.
+ *
+ * It is possible for the task to switch cgroups in this mean time, so at
+ * commit time, we can't rely on task conversion any longer. We'll then use
+ * the handle argument to return to the caller which cgroup we should commit
+ * against. We could also return the memcg directly and avoid the pointer
+ * passing, but a boolean return value gives better semantics considering
+ * the compiled-out case as well.
+ *
+ * Returning true means the allocation is possible.
+ */
+bool
+__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ *_memcg = NULL;
+ memcg = try_get_mem_cgroup_from_mm(current->mm);
+
+ /*
+ * very rare case described in mem_cgroup_from_task. Unfortunately there
+ * isn't much we can do without complicating this too much, and it would
+ * be gfp-dependent anyway. Just let it go
+ */
+ if (unlikely(!memcg))
+ return true;
+
+ if (!memcg_can_account_kmem(memcg)) {
+ css_put(&memcg->css);
+ return true;
+ }
+
+ ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
+ if (!ret)
+ *_memcg = memcg;
+
+ css_put(&memcg->css);
+ return (ret == 0);
+}
+
+void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
+ int order)
+{
+ struct page_cgroup *pc;
+
+ VM_BUG_ON(mem_cgroup_is_root(memcg));
+
+ /* The page allocation failed. Revert */
+ if (!page) {
+ memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+ return;
+ }
+
+ pc = lookup_page_cgroup(page);
+ lock_page_cgroup(pc);
+ pc->mem_cgroup = memcg;
+ SetPageCgroupUsed(pc);
+ unlock_page_cgroup(pc);
+}
+
+void __memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct page_cgroup *pc;
+
+
+ pc = lookup_page_cgroup(page);
+ /*
+ * Fast unlocked return. Theoretically might have changed, have to
+ * check again after locking.
+ */
+ if (!PageCgroupUsed(pc))
+ return;
+
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc)) {
+ memcg = pc->mem_cgroup;
+ ClearPageCgroupUsed(pc);
+ }
+ unlock_page_cgroup(pc);
+
+ /*
+ * We trust that only if there is a memcg associated with the page, it
+ * is a valid allocation
+ */
+ if (!memcg)
+ return;
+
+ VM_BUG_ON(mem_cgroup_is_root(memcg));
+ memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+}
+#else
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
@@ -3289,15 +4198,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
struct mem_cgroup **memcgp)
{
struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
struct page_cgroup *pc;
enum charge_type ctype;
*memcgp = NULL;
- VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return;
+ if (PageTransHuge(page))
+ nr_pages <<= compound_order(page);
+
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
@@ -3359,7 +4271,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
* charged to the res_counter since we plan on replacing the
* old one and only one page is going to be left afterwards.
*/
- __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
+ __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
}
/* remove redundant charge if migration failed*/
@@ -3483,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)
}
#endif
-static DEFINE_MUTEX(set_limit_mutex);
-
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
@@ -3769,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
{
int node, zid;
+ u64 usage;
do {
/* This is for making all *used* pages to be on LRU. */
@@ -3789,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
cond_resched();
/*
+ * Kernel memory may not necessarily be trackable to a specific
+ * process. So they are not migrated, and therefore we can't
+ * expect their value to drop to 0 here.
+ * Having res filled up with kmem only is enough.
+ *
* This is a safety check because mem_cgroup_force_empty_list
* could have raced with mem_cgroup_replace_page_cache callers
* so the lru seemed empty but the page could have been added
* right after the check. RES_USAGE should be safe as we always
* charge before adding to the LRU.
*/
- } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
+ usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
+ res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ } while (usage > 0);
}
/*
@@ -3939,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
char str[64];
u64 val;
- int type, name, len;
+ int name, len;
+ enum res_type type;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
@@ -3960,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
else
val = res_counter_read_u64(&memcg->memsw, name);
break;
+ case _KMEM:
+ val = res_counter_read_u64(&memcg->kmem, name);
+ break;
default:
BUG();
}
@@ -3967,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
+
+static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
+{
+ int ret = -EINVAL;
+#ifdef CONFIG_MEMCG_KMEM
+ bool must_inc_static_branch = false;
+
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ /*
+ * For simplicity, we won't allow this to be disabled. It also can't
+ * be changed if the cgroup has children already, or if tasks had
+ * already joined.
+ *
+ * If tasks join before we set the limit, a person looking at
+ * kmem.usage_in_bytes will have no way to determine when it took
+ * place, which makes the value quite meaningless.
+ *
+ * After it first became limited, changes in the value of the limit are
+ * of course permitted.
+ *
+ * Taking the cgroup_lock is really offensive, but it is so far the only
+ * way to guarantee that no children will appear. There are plenty of
+ * other offenders, and they should all go away. Fine grained locking
+ * is probably the way to go here. When we are fully hierarchical, we
+ * can also get rid of the use_hierarchy check.
+ */
+ cgroup_lock();
+ mutex_lock(&set_limit_mutex);
+ if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
+ if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
+ !list_empty(&cont->children))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ ret = res_counter_set_limit(&memcg->kmem, val);
+ VM_BUG_ON(ret);
+
+ ret = memcg_update_cache_sizes(memcg);
+ if (ret) {
+ res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+ goto out;
+ }
+ must_inc_static_branch = true;
+ /*
+ * kmem charges can outlive the cgroup. In the case of slab
+ * pages, for instance, a page contain objects from various
+ * processes, so it is unfeasible to migrate them away. We
+ * need to reference count the memcg because of that.
+ */
+ mem_cgroup_get(memcg);
+ } else
+ ret = res_counter_set_limit(&memcg->kmem, val);
+out:
+ mutex_unlock(&set_limit_mutex);
+ cgroup_unlock();
+
+ /*
+ * We are by now familiar with the fact that we can't inc the static
+ * branch inside cgroup_lock. See disarm functions for details. A
+ * worker here is overkill, but also wrong: After the limit is set, we
+ * must start accounting right away. Since this operation can't fail,
+ * we can safely defer it to here - no rollback will be needed.
+ *
+ * The boolean used to control this is also safe, because
+ * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
+ * able to set it to true;
+ */
+ if (must_inc_static_branch) {
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * setting the active bit after the inc will guarantee no one
+ * starts accounting before all call sites are patched
+ */
+ memcg_kmem_set_active(memcg);
+ }
+
+#endif
+ return ret;
+}
+
+static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+{
+ int ret = 0;
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ goto out;
+
+ memcg->kmem_account_flags = parent->kmem_account_flags;
+#ifdef CONFIG_MEMCG_KMEM
+ /*
+ * When that happen, we need to disable the static branch only on those
+ * memcgs that enabled it. To achieve this, we would be forced to
+ * complicate the code by keeping track of which memcgs were the ones
+ * that actually enabled limits, and which ones got it from its
+ * parents.
+ *
+ * It is a lot simpler just to do static_key_slow_inc() on every child
+ * that is accounted.
+ */
+ if (!memcg_kmem_is_active(memcg))
+ goto out;
+
+ /*
+ * destroy(), called if we fail, will issue static_key_slow_inc() and
+ * mem_cgroup_put() if kmem is enabled. We have to either call them
+ * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
+ * this more consistent, since it always leads to the same destroy path
+ */
+ mem_cgroup_get(memcg);
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+
+ mutex_lock(&set_limit_mutex);
+ ret = memcg_update_cache_sizes(memcg);
+ mutex_unlock(&set_limit_mutex);
+#endif
+out:
+ return ret;
+}
+
/*
* The user of this function is...
* RES_LIMIT.
@@ -3975,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
const char *buffer)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- int type, name;
+ enum res_type type;
+ int name;
unsigned long long val;
int ret;
@@ -3997,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
break;
if (type == _MEM)
ret = mem_cgroup_resize_limit(memcg, val);
- else
+ else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
+ else if (type == _KMEM)
+ ret = memcg_update_kmem_limit(cont, val);
+ else
+ return -EINVAL;
break;
case RES_SOFT_LIMIT:
ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4051,7 +5097,8 @@ out:
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- int type, name;
+ int name;
+ enum res_type type;
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
@@ -4063,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
case RES_MAX_USAGE:
if (type == _MEM)
res_counter_reset_max(&memcg->res);
- else
+ else if (type == _MEMSWAP)
res_counter_reset_max(&memcg->memsw);
+ else if (type == _KMEM)
+ res_counter_reset_max(&memcg->kmem);
+ else
+ return -EINVAL;
break;
case RES_FAILCNT:
if (type == _MEM)
res_counter_reset_failcnt(&memcg->res);
- else
+ else if (type == _MEMSWAP)
res_counter_reset_failcnt(&memcg->memsw);
+ else if (type == _KMEM)
+ res_counter_reset_failcnt(&memcg->kmem);
+ else
+ return -EINVAL;
break;
}
@@ -4387,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
@@ -4470,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
@@ -4548,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *event;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4573,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *ev, *tmp;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
@@ -4632,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
#ifdef CONFIG_MEMCG_KMEM
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
+ int ret;
+
+ memcg->kmemcg_id = -1;
+ ret = memcg_propagate_kmem(memcg);
+ if (ret)
+ return ret;
+
return mem_cgroup_sockets_init(memcg, ss);
};
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
mem_cgroup_sockets_destroy(memcg);
+
+ memcg_kmem_mark_dead(memcg);
+
+ if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
+ return;
+
+ /*
+ * Charges already down to 0, undo mem_cgroup_get() done in the charge
+ * path here, being careful not to race with memcg_uncharge_kmem: it is
+ * possible that the charges went down to 0 between mark_dead and the
+ * res_counter read, so in that case, we don't need the put
+ */
+ if (memcg_kmem_test_and_clear_dead(memcg))
+ mem_cgroup_put(memcg);
}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4746,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = {
.read = mem_cgroup_read,
},
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ {
+ .name = "kmem.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.failcnt",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+#ifdef CONFIG_SLABINFO
+ {
+ .name = "kmem.slabinfo",
+ .read_seq_string = mem_cgroup_slabinfo_read,
+ },
+#endif
+#endif
{ }, /* terminate */
};
@@ -4813,16 +5920,29 @@ out_free:
}
/*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context. The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
+ * At destroying mem_cgroup, references from swap_cgroup can remain.
+ * (scanning all at force_empty is too costly...)
+ *
+ * Instead of clearing all references at force_empty, we remember
+ * the number of reference from swap_cgroup and free mem_cgroup when
+ * it goes down to 0.
+ *
+ * Removal of cgroup itself succeeds regardless of refs from swap.
*/
-static void free_work(struct work_struct *work)
+
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg;
+ int node;
int size = sizeof(struct mem_cgroup);
- memcg = container_of(work, struct mem_cgroup, work_freeing);
+ mem_cgroup_remove_from_trees(memcg);
+ free_css_id(&mem_cgroup_subsys, &memcg->css);
+
+ for_each_node(node)
+ free_mem_cgroup_per_zone_info(memcg, node);
+
+ free_percpu(memcg->stat);
+
/*
* We need to make sure that (at least for now), the jump label
* destruction code runs outside of the cgroup lock. This is because
@@ -4834,45 +5954,34 @@ static void free_work(struct work_struct *work)
* to move this code around, and make sure it is outside
* the cgroup_lock.
*/
- disarm_sock_keys(memcg);
+ disarm_static_keys(memcg);
if (size < PAGE_SIZE)
kfree(memcg);
else
vfree(memcg);
}
-static void free_rcu(struct rcu_head *rcu_head)
-{
- struct mem_cgroup *memcg;
-
- memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
- INIT_WORK(&memcg->work_freeing, free_work);
- schedule_work(&memcg->work_freeing);
-}
/*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
+ * but in process context. The work_freeing structure is overlaid
+ * on the rcu_freeing structure, which itself is overlaid on memsw.
*/
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void free_work(struct work_struct *work)
{
- int node;
+ struct mem_cgroup *memcg;
- mem_cgroup_remove_from_trees(memcg);
- free_css_id(&mem_cgroup_subsys, &memcg->css);
+ memcg = container_of(work, struct mem_cgroup, work_freeing);
+ __mem_cgroup_free(memcg);
+}
- for_each_node(node)
- free_mem_cgroup_per_zone_info(memcg, node);
+static void free_rcu(struct rcu_head *rcu_head)
+{
+ struct mem_cgroup *memcg;
- free_percpu(memcg->stat);
- call_rcu(&memcg->rcu_freeing, free_rcu);
+ memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
+ INIT_WORK(&memcg->work_freeing, free_work);
+ schedule_work(&memcg->work_freeing);
}
static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4884,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
{
if (atomic_sub_and_test(count, &memcg->refcnt)) {
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- __mem_cgroup_free(memcg);
+ call_rcu(&memcg->rcu_freeing, free_rcu);
if (parent)
mem_cgroup_put(parent);
}
@@ -4981,7 +6090,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
&per_cpu(memcg_stock, cpu);
INIT_WORK(&stock->work, drain_local_stock);
}
- hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
} else {
parent = mem_cgroup_from_cont(cont->parent);
memcg->use_hierarchy = parent->use_hierarchy;
@@ -4991,6 +6099,8 @@ mem_cgroup_css_alloc(struct cgroup *cont)
if (parent && parent->use_hierarchy) {
res_counter_init(&memcg->res, &parent->res);
res_counter_init(&memcg->memsw, &parent->memsw);
+ res_counter_init(&memcg->kmem, &parent->kmem);
+
/*
* We increment refcnt of the parent to ensure that we can
* safely access it on res_counter_charge/uncharge.
@@ -5001,6 +6111,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
} else {
res_counter_init(&memcg->res, NULL);
res_counter_init(&memcg->memsw, NULL);
+ res_counter_init(&memcg->kmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -5040,6 +6151,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
mem_cgroup_reparent_charges(memcg);
+ mem_cgroup_destroy_all_caches(memcg);
}
static void mem_cgroup_css_free(struct cgroup *cont)
@@ -5643,6 +6755,19 @@ struct cgroup_subsys mem_cgroup_subsys = {
.use_id = 1,
};
+/*
+ * The rest of init is performed during ->css_alloc() for root css which
+ * happens before initcalls. hotcpu_notifier() can't be done together as
+ * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
+ * dependency. Do it from a subsys_initcall().
+ */
+static int __init mem_cgroup_init(void)
+{
+ hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+ return 0;
+}
+subsys_initcall(mem_cgroup_init);
+
#ifdef CONFIG_MEMCG_SWAP
static int __init enable_swap_account(char *s)
{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 108c52fa60f..c6e4dd3e1c0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct anon_vma *av;
pgoff_t pgoff;
- av = page_lock_anon_vma(page);
+ av = page_lock_anon_vma_read(page);
if (av == NULL) /* Not actually mapped anymore */
return;
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
}
}
read_unlock(&tasklist_lock);
- page_unlock_anon_vma(av);
+ page_unlock_anon_vma_read(av);
}
/*
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMORY_FAILURE);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index db2e9e797a0..e0a9b0ce4f1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,8 @@
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
+#include <linux/migrate.h>
+#include <linux/string.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -1503,6 +1505,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ goto no_page_table;
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
split_huge_page_pmd(vma, address, pmd);
@@ -1532,6 +1536,8 @@ split_fallthrough:
pte = *ptep;
if (!pte_present(pte))
goto no_page;
+ if ((flags & FOLL_NUMA) && pte_numa(pte))
+ goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -1683,6 +1689,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
vm_flags &= (gup_flags & FOLL_FORCE) ?
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ /*
+ * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+ * would be called on PROT_NONE ranges. We must never invoke
+ * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+ * page faults would unprotect the PROT_NONE ranges if
+ * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+ * bitflag. So to avoid that, don't set FOLL_NUMA if
+ * FOLL_FORCE is set.
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
i = 0;
do {
@@ -3412,6 +3431,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int current_nid)
+{
+ get_page(page);
+
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ int current_nid = -1;
+ int target_nid;
+ bool migrated = false;
+
+ /*
+ * The "pte" at this point cannot be used safely without
+ * validation through pte_unmap_same(). It's of NUMA type but
+ * the pfn may be screwed if the read is non atomic.
+ *
+ * ptep_modify_prot_start is not called as this is clearing
+ * the _PAGE_NUMA bit and it is not really expected that there
+ * would be concurrent hardware modifications to the PTE.
+ */
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*ptep, pte))) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out;
+ }
+
+ pte = pte_mknonnuma(pte);
+ set_pte_at(mm, addr, ptep, pte);
+ update_mmu_cache(vma, addr, ptep);
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
+ current_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+ pte_unmap_unlock(ptep, ptl);
+ if (target_nid == -1) {
+ /*
+ * Account for the fault against the current node if it not
+ * being replaced regardless of where the page is located.
+ */
+ current_nid = numa_node_id();
+ put_page(page);
+ goto out;
+ }
+
+ /* Migrate to the requested node */
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+
+out:
+ if (current_nid != -1)
+ task_numa_fault(current_nid, 1, migrated);
+ return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd;
+ pte_t *pte, *orig_pte;
+ unsigned long _addr = addr & PMD_MASK;
+ unsigned long offset;
+ spinlock_t *ptl;
+ bool numa = false;
+ int local_nid = numa_node_id();
+
+ spin_lock(&mm->page_table_lock);
+ pmd = *pmdp;
+ if (pmd_numa(pmd)) {
+ set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+ numa = true;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ if (!numa)
+ return 0;
+
+ /* we're in a page fault so some vma must be in the range */
+ BUG_ON(!vma);
+ BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+ offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+ VM_BUG_ON(offset >= PMD_SIZE);
+ orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+ pte += offset >> PAGE_SHIFT;
+ for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+ pte_t pteval = *pte;
+ struct page *page;
+ int curr_nid = local_nid;
+ int target_nid;
+ bool migrated;
+ if (!pte_present(pteval))
+ continue;
+ if (!pte_numa(pteval))
+ continue;
+ if (addr >= vma->vm_end) {
+ vma = find_vma(mm, addr);
+ /* there's a pte present so there must be a vma */
+ BUG_ON(!vma);
+ BUG_ON(addr < vma->vm_start);
+ }
+ if (pte_numa(pteval)) {
+ pteval = pte_mknonnuma(pteval);
+ set_pte_at(mm, addr, pte, pteval);
+ }
+ page = vm_normal_page(vma, addr, pteval);
+ if (unlikely(!page))
+ continue;
+ /* only check non-shared pages */
+ if (unlikely(page_mapcount(page) != 1))
+ continue;
+
+ /*
+ * Note that the NUMA fault is later accounted to either
+ * the node that is currently running or where the page is
+ * migrated to.
+ */
+ curr_nid = local_nid;
+ target_nid = numa_migrate_prep(page, vma, addr,
+ page_to_nid(page));
+ if (target_nid == -1) {
+ put_page(page);
+ continue;
+ }
+
+ /* Migrate to the requested node */
+ pte_unmap_unlock(pte, ptl);
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ curr_nid = target_nid;
+ task_numa_fault(curr_nid, 1, migrated);
+
+ pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ BUG();
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3450,6 +3633,9 @@ int handle_pte_fault(struct mm_struct *mm,
pte, pmd, flags, entry);
}
+ if (pte_numa(entry))
+ return do_numa_page(mm, vma, address, entry, pte, pmd);
+
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
@@ -3520,8 +3706,11 @@ retry:
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
- if (dirty && !pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd)) {
+ if (pmd_numa(orig_pmd))
+ return do_huge_pmd_numa_page(mm, vma, address,
+ orig_pmd, pmd);
+
+ if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
/*
@@ -3536,16 +3725,21 @@ retry:
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
}
+
return 0;
}
}
+ if (pmd_numa(*pmd))
+ return do_pmd_numa_page(mm, vma, address, pmd);
+
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -3925,15 +4119,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_KERNEL);
if (buf) {
- char *p, *s;
+ char *p;
p = d_path(&f->f_path, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
- s = strrchr(p, '/');
- if (s)
- p = s+1;
- printk("%s%s[%lx+%lx]", prefix, p,
+ printk("%s%s[%lx+%lx]", prefix, kbasename(p),
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 518baa896e8..d04ed87bfac 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
}
#ifdef CONFIG_MOVABLE_NODE
-/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */
+/*
+ * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
+ * normal memory.
+ */
static bool can_online_high_movable(struct zone *zone)
{
return true;
}
-#else /* #ifdef CONFIG_MOVABLE_NODE */
+#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
-#endif /* #ifdef CONFIG_MOVABLE_NODE */
+#endif /* CONFIG_MOVABLE_NODE */
/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
@@ -1055,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* migrate_pages returns # of failed pages.
*/
ret = migrate_pages(&source, alloc_migrate_target, 0,
- true, MIGRATE_SYNC);
+ true, MIGRATE_SYNC,
+ MR_MEMORY_HOTPLUG);
if (ret)
putback_lru_pages(&source);
}
@@ -1111,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
}
#ifdef CONFIG_MOVABLE_NODE
-/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */
+/*
+ * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
+ * normal memory.
+ */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
return true;
}
-#else /* #ifdef CONFIG_MOVABLE_NODE */
+#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
@@ -1140,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
*/
return present_pages == 0;
}
-#endif /* #ifdef CONFIG_MOVABLE_NODE */
+#endif /* CONFIG_MOVABLE_NODE */
/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aaf54566cb6..d1b315e9862 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
.flags = MPOL_F_LOCAL,
};
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+ struct mempolicy *pol = p->mempolicy;
+ int node;
+
+ if (!pol) {
+ node = numa_node_id();
+ if (node != -1)
+ pol = &preferred_node_policy[node];
+
+ /* preferred_node_policy is not initialised early in boot */
+ if (!pol->mode)
+ pol = NULL;
+ }
+
+ return pol;
+}
+
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
/*
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- return NULL; /* simply delete any existing policy */
+ return NULL;
}
VM_BUG_ON(!nodes);
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
}
+ } else if (mode == MPOL_LOCAL) {
+ if (!nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
return 0;
}
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ int nr_updated;
+ BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+
+ nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ if (nr_updated)
+ count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+ return nr_updated;
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
/*
* Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return ERR_PTR(-EFAULT);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ unsigned long endvma = vma->vm_end;
+
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
}
- if (!is_vm_hugetlb_page(vma) &&
- ((flags & MPOL_MF_STRICT) ||
+
+ if (is_vm_hugetlb_page(vma))
+ goto next;
+
+ if (flags & MPOL_MF_LAZY) {
+ change_prot_numa(vma, start, endvma);
+ goto next;
+ }
+
+ if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))) {
- unsigned long endvma = vma->vm_end;
+ vma_migratable(vma))) {
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
err = check_pgd_range(vma, start, endvma, nodes,
flags, private);
if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
break;
}
}
+next:
prev = vma;
}
return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
int err;
LIST_HEAD(pagelist);
- if (flags & ~(unsigned long)(MPOL_MF_STRICT |
- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (IS_ERR(new))
return PTR_ERR(new);
+ if (flags & MPOL_MF_LAZY)
+ new->flags |= MPOL_F_MOF;
+
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
- err = PTR_ERR(vma);
- if (!IS_ERR(vma)) {
- int nr_failed = 0;
-
+ err = PTR_ERR(vma); /* maybe ... */
+ if (!IS_ERR(vma))
err = mbind_range(mm, start, end, new);
+ if (!err) {
+ int nr_failed = 0;
+
if (!list_empty(&pagelist)) {
+ WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_lru_pages(&pagelist);
}
- if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
putback_lru_pages(&pagelist);
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
- struct mempolicy *pol = task->mempolicy;
+ struct mempolicy *pol = get_task_policy(task);
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1956,7 +2028,7 @@ retry_cpuset:
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = get_task_policy(current);
struct page *page;
unsigned int cpuset_mems_cookie;
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page - page to be checked
+ * @vma - vm area where page mapped
+ * @addr - virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Migrate the page towards the node whose CPU is referencing it */
+ if (pol->flags & MPOL_F_MORON) {
+ int last_nid;
+
+ polnid = numa_node_id();
+
+ /*
+ * Multi-stage node selection is used in conjunction
+ * with a periodic migration fault to build a temporal
+ * task<->page relation. By using a two-stage filter we
+ * remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist
+ * probability, we can equate a task's usage of a
+ * particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and
+ * getting the same result twice in a row, given these
+ * samples are fully independent, is then given by
+ * P(n)^2, provided our sample period is sufficiently
+ * short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making
+ * it less likely we act on an unlikely task<->page
+ * relation.
+ */
+ last_nid = page_xchg_last_nid(page, polnid);
+ if (last_nid != polnid)
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+out:
+ mpol_cond_put(pol);
+
+ return ret;
+}
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)
mutex_unlock(&p->mutex);
}
+#ifdef CONFIG_NUMA_BALANCING
+static bool __initdata numabalancing_override;
+
+static void __init check_numabalancing_enable(void)
+{
+ bool numabalancing_default = false;
+
+ if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+ numabalancing_default = true;
+
+ if (nr_node_ids > 1 && !numabalancing_override) {
+ printk(KERN_INFO "Enabling automatic NUMA balancing. "
+ "Configure with numa_balancing= or sysctl");
+ set_numabalancing_state(numabalancing_default);
+ }
+}
+
+static int __init setup_numabalancing(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ numabalancing_override = true;
+
+ if (!strcmp(str, "enable")) {
+ set_numabalancing_state(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_numabalancing_state(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+
+ return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
@@ -2320,6 +2545,15 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
+ for_each_node(nid) {
+ preferred_node_policy[nid] = (struct mempolicy) {
+ .refcnt = ATOMIC_INIT(1),
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .v = { .preferred_node = nid, },
+ };
+ }
+
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void)
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
+
+ check_numabalancing_enable();
}
/* Reset policy of current process to default */
@@ -2362,14 +2598,13 @@ void numa_default_policy(void)
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
*/
-#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
- [MPOL_LOCAL] = "local"
+ [MPOL_LOCAL] = "local",
};
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+ for (mode = 0; mode < MPOL_MAX; mode++) {
if (!strcmp(str, policy_modes[mode])) {
break;
}
}
- if (mode > MPOL_LOCAL)
+ if (mode >= MPOL_MAX)
goto out;
switch (mode) {
diff --git a/mm/migrate.c b/mm/migrate.c
index cae02711181..3b676b0c5c3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,6 +39,9 @@
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/migrate.h>
+
#include "internal.h"
/*
@@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
struct buffer_head *head, enum migrate_mode mode)
{
- int expected_count;
+ int expected_count = 0;
void **pslot;
if (!mapping) {
@@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
- if (PageHuge(page))
+ if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
@@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/
if (PageAnon(page)) {
/*
- * Only page_lock_anon_vma() understands the subtleties of
+ * Only page_lock_anon_vma_read() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
*/
anon_vma = page_get_anon_vma(page);
@@ -998,10 +1001,11 @@ out:
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
+ int nr_succeeded = 0;
int pass = 0;
struct page *page;
struct page *page2;
@@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from,
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ nr_succeeded++;
break;
default:
/* Permanent failure */
@@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from,
}
rc = nr_failed + retry;
out:
+ if (nr_succeeded)
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ if (nr_failed)
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
@@ -1176,7 +1187,8 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0, MIGRATE_SYNC);
+ (unsigned long)pm, 0, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
}
return err;
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks which crude
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+ int nr_migrate_pages)
+{
+ int z;
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable)
+ continue;
+
+ /* Avoid waking kswapd by allocating pages_to_migrate pages. */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone) +
+ nr_migrate_pages,
+ 0, 0))
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+ unsigned long data,
+ int **result)
+{
+ int nid = (int) data;
+ struct page *newpage;
+
+ newpage = alloc_pages_exact_node(nid,
+ (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN) &
+ ~GFP_IOFS, 0);
+ if (newpage)
+ page_xchg_last_nid(newpage, page_last_nid(page));
+
+ return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
+ * window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
+ * as it is faults that reset the window, pte updates will happen unconditionally
+ * if there has not been a fault since @pteupdate_interval_millisecs after the
+ * throttle window closed.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
+
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+ msecs_to_jiffies(pteupdate_interval_millisecs)))
+ return false;
+
+ if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+ return false;
+
+ return true;
+}
+
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+{
+ bool rate_limited = false;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ spin_lock(&pgdat->numabalancing_migrate_lock);
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies +
+ msecs_to_jiffies(migrate_interval_millisecs);
+ }
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+ rate_limited = true;
+ else
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+ return rate_limited;
+}
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+ int ret = 0;
+
+ /* Avoid migrating to a node that is nearly full */
+ if (migrate_balanced_pgdat(pgdat, 1)) {
+ int page_lru;
+
+ if (isolate_lru_page(page)) {
+ put_page(page);
+ return 0;
+ }
+
+ /* Page is isolated */
+ ret = 1;
+ page_lru = page_is_file_cache(page);
+ if (!PageTransHuge(page))
+ inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+ else
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ HPAGE_PMD_NR);
+ }
+
+ /*
+ * Page is either isolated or there is not enough space on the target
+ * node. If isolated, then it has taken a reference count and the
+ * callers reference can be safely dropped without the page
+ * disappearing underneath us during migration. Otherwise the page is
+ * not to be migrated but the callers reference should still be
+ * dropped so it does not leak.
+ */
+ put_page(page);
+
+ return ret;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ int nr_remaining;
+ LIST_HEAD(migratepages);
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1) {
+ put_page(page);
+ goto out;
+ }
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, 1)) {
+ put_page(page);
+ goto out;
+ }
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated)
+ goto out;
+
+ list_add(&page->lru, &migratepages);
+ nr_remaining = migrate_pages(&migratepages,
+ alloc_misplaced_dst_page,
+ node, false, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
+ if (nr_remaining) {
+ putback_lru_pages(&migratepages);
+ isolated = 0;
+ } else
+ count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ BUG_ON(!list_empty(&migratepages));
+out:
+ return isolated;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node)
+{
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ struct page *new_page = NULL;
+ struct mem_cgroup *memcg = NULL;
+ int page_lru = page_is_file_cache(page);
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1)
+ goto out_dropref;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
+ goto out_dropref;
+
+ new_page = alloc_pages_node(node,
+ (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+ if (!new_page) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out_dropref;
+ }
+ page_xchg_last_nid(new_page, page_last_nid(page));
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ put_page(new_page);
+ goto out_keep_locked;
+ }
+
+ /* Prepare a page as a migration target */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+
+ /* anon mapping, we can simply copy page->mapping to the new page: */
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+ migrate_page_copy(new_page, page);
+ WARN_ON(PageLRU(new_page));
+
+ /* Recheck the target PMD */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry))) {
+ spin_unlock(&mm->page_table_lock);
+
+ /* Reverse changes made by migrate_page_copy() */
+ if (TestClearPageActive(new_page))
+ SetPageActive(page);
+ if (TestClearPageUnevictable(new_page))
+ SetPageUnevictable(page);
+ mlock_migrate_page(page, new_page);
+
+ unlock_page(new_page);
+ put_page(new_page); /* Free it */
+
+ unlock_page(page);
+ putback_lru_page(page);
+
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out;
+ }
+
+ /*
+ * Traditional migration needs to prepare the memcg charge
+ * transaction early to prevent the old page from being
+ * uncharged when installing migration entries. Here we can
+ * save the potential rollback and start the charge transfer
+ * only when migration is already known to end successfully.
+ */
+ mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = pmd_mknonnuma(entry);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+
+ page_add_new_anon_rmap(new_page, vma, haddr);
+
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+ page_remove_rmap(page);
+ /*
+ * Finish the charge transaction under the page table lock to
+ * prevent split_huge_page() from dividing up the charge
+ * before it's fully transferred to the new page.
+ */
+ mem_cgroup_end_migration(memcg, page, new_page, true);
+ spin_unlock(&mm->page_table_lock);
+
+ unlock_page(new_page);
+ unlock_page(page);
+ put_page(page); /* Drop the rmap reference */
+ put_page(page); /* Drop the LRU isolation reference */
+
+ count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+out:
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ -HPAGE_PMD_NR);
+ return isolated;
+
+out_dropref:
+ put_page(page);
+out_keep_locked:
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* CONFIG_NUMA */
diff --git a/mm/mmap.c b/mm/mmap.c
index 2b7d9e78a56..f54b235f29a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (anon_vma) {
VM_BUG_ON(adjust_next && next->anon_vma &&
anon_vma != next->anon_vma);
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_pre_update_vma(next);
@@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
+ down_write(&anon_vma->root->rwsem);
/*
* We can safely modify head.next after taking the
- * anon_vma->root->mutex. If some other vma in this mm shares
+ * anon_vma->root->rwsem. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->root->mutex.
+ * anon_vma->root->rwsem.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
@@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->root->mutex.
+ * anon_vma->root->rwsem.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e8c3938db6f..94722a4d6b4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
-static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa, bool *ret_all_same_node)
{
+ struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
+ unsigned long pages = 0;
+ bool all_same_node = true;
+ int last_nid = -1;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
+ bool updated = false;
ptent = ptep_modify_prot_start(mm, addr, pte);
- ptent = pte_modify(ptent, newprot);
+ if (!prot_numa) {
+ ptent = pte_modify(ptent, newprot);
+ updated = true;
+ } else {
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page) {
+ int this_nid = page_to_nid(page);
+ if (last_nid == -1)
+ last_nid = this_nid;
+ if (last_nid != this_nid)
+ all_same_node = false;
+
+ /* only check non-shared pages */
+ if (!pte_numa(oldpte) &&
+ page_mapcount(page) == 1) {
+ ptent = pte_mknuma(ptent);
+ updated = true;
+ }
+ }
+ }
/*
* Avoid taking write faults for pages we know to be
* dirty.
*/
- if (dirty_accountable && pte_dirty(ptent))
+ if (dirty_accountable && pte_dirty(ptent)) {
ptent = pte_mkwrite(ptent);
+ updated = true;
+ }
+ if (updated)
+ pages++;
ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
set_pte_at(mm, addr, pte,
swp_entry_to_pte(entry));
}
+ pages++;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
+
+ *ret_all_same_node = all_same_node;
+ return pages;
}
-static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+#ifdef CONFIG_NUMA_BALANCING
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ spin_lock(&mm->page_table_lock);
+ set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
+ spin_unlock(&mm->page_table_lock);
+}
+#else
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr, unsigned long end,
+ pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
unsigned long next;
+ unsigned long pages = 0;
+ bool all_same_node;
pmd = pmd_offset(pud, addr);
do {
@@ -91,42 +143,60 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
- else if (change_huge_pmd(vma, pmd, addr, newprot))
+ else if (change_huge_pmd(vma, pmd, addr, newprot,
+ prot_numa)) {
+ pages += HPAGE_PMD_NR;
continue;
+ }
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
- dirty_accountable);
+ pages += change_pte_range(vma, pmd, addr, next, newprot,
+ dirty_accountable, prot_numa, &all_same_node);
+
+ /*
+ * If we are changing protections for NUMA hinting faults then
+ * set pmd_numa if the examined pages were all on the same
+ * node. This allows a regular PMD to be handled as one fault
+ * and effectively batches the taking of the PTL
+ */
+ if (prot_numa && all_same_node)
+ change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
+
+ return pages;
}
-static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+static inline unsigned long change_pud_range(struct vm_area_struct *vma,
+ pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pud_t *pud;
unsigned long next;
+ unsigned long pages = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(vma, pud, addr, next, newprot,
- dirty_accountable);
+ pages += change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pud++, addr = next, addr != end);
+
+ return pages;
}
-static void change_protection(struct vm_area_struct *vma,
+static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
+ unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(vma, pgd, addr, next, newprot,
- dirty_accountable);
+ pages += change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pgd++, addr = next, addr != end);
- flush_tlb_range(vma, start, end);
+
+ /* Only flush the TLB if we actually modified any entries: */
+ if (pages)
+ flush_tlb_range(vma, start, end);
+
+ return pages;
+}
+
+unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ int dirty_accountable, int prot_numa)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long pages;
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ if (is_vm_hugetlb_page(vma))
+ pages = hugetlb_change_protection(vma, start, end, newprot);
+ else
+ pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+
+ return pages;
}
int
@@ -213,12 +305,9 @@ success:
dirty_accountable = 1;
}
- mmu_notifier_invalidate_range_start(mm, start, end);
- if (is_vm_hugetlb_page(vma))
- hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
- else
- change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
- mmu_notifier_invalidate_range_end(mm, start, end);
+ change_protection(vma, start, end, vma->vm_page_prot,
+ dirty_accountable, 0);
+
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
perf_event_mmap(vma);
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto out;
- }
- else {
+ } else {
if (vma->vm_start > start)
goto out;
if (unlikely(grows & PROT_GROWSUP)) {
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
for (nstart = start ; ; ) {
unsigned long newflags;
- /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
+ newflags = vm_flags;
+ newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index eabb24da6c9..e1031e1f6a6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
}
if (vma->anon_vma) {
anon_vma = vma->anon_vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
}
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6f427122449..0713bfbf095 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -201,6 +201,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
zone_reclaimable_pages(z) - z->dirty_balance_reserve;
}
/*
+ * Unreclaimable memory (kernel memory or anonymous memory
+ * without swap) can bring down the dirtyable pages below
+ * the zone's dirty balance reserve and the above calculation
+ * will underflow. However we still want to add in nodes
+ * which are below threshold (negative values) to get a more
+ * accurate calculation but make sure that the total never
+ * underflows.
+ */
+ if ((long)x < 0)
+ x = 0;
+
+ /*
* Make sure that the number of highmem pages is never larger
* than the number of the total dirtyable memory. This can only
* occur in very strange VM situations but we want to make sure
@@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
- dirty_balance_reserve;
+ x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ x -= min(x, dirty_balance_reserve);
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
* highmem zone can hold its share of dirty pages, so we don't
* care about vm_highmem_is_dirtyable here.
*/
- return zone_page_state(zone, NR_FREE_PAGES) +
- zone_reclaimable_pages(zone) -
- zone->dirty_balance_reserve;
+ unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
+ zone_reclaimable_pages(zone);
+
+ /* don't allow this to underflow */
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+ return nr_pages;
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 83637dfba11..4ba5e37127f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -371,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
int nr_pages = 1 << order;
int bad = 0;
- if (unlikely(compound_order(page) != order) ||
- unlikely(!PageHead(page))) {
+ if (unlikely(compound_order(page) != order)) {
bad_page(page);
bad++;
}
@@ -611,6 +610,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
+ reset_page_last_nid(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -2612,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+ struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2630,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ /*
+ * Will only have any effect when __GFP_KMEMCG is set. This is
+ * verified in the (always inline) callee
+ */
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
@@ -2665,6 +2673,8 @@ out:
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
+ memcg_kmem_commit_charge(page, memcg, order);
+
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2717,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
+/*
+ * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
+ * pages allocated with __GFP_KMEMCG.
+ *
+ * Those pages are accounted to a particular memcg, embedded in the
+ * corresponding page_cgroup. To avoid adding a hit in the allocator to search
+ * for that information only to find out that it is NULL for users who have no
+ * interest in that whatsoever, we provide these functions.
+ *
+ * The caller knows better which flags it relies on.
+ */
+void __free_memcg_kmem_pages(struct page *page, unsigned int order)
+{
+ memcg_kmem_uncharge_pages(page, order);
+ __free_pages(page, order);
+}
+
+void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
+{
+ if (addr != 0) {
+ VM_BUG_ON(!virt_addr_valid((void *)addr));
+ __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
+ }
+}
+
static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
{
if (addr) {
@@ -3883,6 +3918,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
+ reset_page_last_nid(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -4526,6 +4562,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
int ret;
pgdat_resize_init(pgdat);
+#ifdef CONFIG_NUMA_BALANCING
+ spin_lock_init(&pgdat->numabalancing_migrate_lock);
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
@@ -5800,7 +5841,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
ret = migrate_pages(&cc->migratepages,
alloc_migrate_target,
- 0, false, MIGRATE_SYNC);
+ 0, false, MIGRATE_SYNC,
+ MR_CMA);
}
putback_movable_pages(&cc->migratepages);
@@ -5936,8 +5978,15 @@ done:
void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
- for (; nr_pages--; ++pfn)
- __free_page(pfn_to_page(pfn));
+ unsigned int count = 0;
+
+ for (; nr_pages--; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ count += page_count(page) != 1;
+ __free_page(page);
+ }
+ WARN(count != 0, "%d pages are still in use!\n", count);
}
#endif
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e642627da6b..0c8323fe6c8 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
- * Only sets the access flags (dirty, accessed, and
- * writable). Furthermore, we know it always gets set to a "more
+ * Only sets the access flags (dirty, accessed), as well as write
+ * permission. Furthermore, we know it always gets set to a "more
* permissive" setting, which allows most architectures to optimize
* this. We return whether the PTE actually changed, which in turn
* instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
- flush_tlb_page(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address);
}
return changed;
}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pte_t pte;
pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
- flush_tlb_page(vma, address);
+ if (pte_accessible(pte))
+ flush_tlb_page(vma, address);
return pte;
}
#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index face808a489..2c78f8cadc9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_mutex
- * anon_vma->mutex
+ * anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
* in arch-dependent flush_dcache_mmap_lock,
* within bdi.wb->list_lock in __sync_single_inode)
*
- * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*/
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
VM_BUG_ON(atomic_read(&anon_vma->refcount));
/*
- * Synchronize against page_lock_anon_vma() such that
+ * Synchronize against page_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
* Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by
- * mutex_trylock() from page_lock_anon_vma(). This orders:
+ * down_read_trylock() from page_lock_anon_vma_read(). This orders:
*
- * page_lock_anon_vma() VS put_anon_vma()
- * mutex_trylock() atomic_dec_and_test()
+ * page_lock_anon_vma_read() VS put_anon_vma()
+ * down_read_trylock() atomic_dec_and_test()
* LOCK MB
- * atomic_read() mutex_is_locked()
+ * atomic_read() rwsem_is_locked()
*
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
- if (mutex_is_locked(&anon_vma->root->mutex)) {
- anon_vma_lock(anon_vma);
+ if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+ anon_vma_lock_write(anon_vma);
anon_vma_unlock(anon_vma);
}
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
* and that may actually touch the spinlock even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
allocated = anon_vma;
}
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
struct anon_vma *new_root = anon_vma->root;
if (new_root != root) {
if (WARN_ON_ONCE(root))
- mutex_unlock(&root->mutex);
+ up_write(&root->rwsem);
root = new_root;
- mutex_lock(&root->mutex);
+ down_write(&root->rwsem);
}
return root;
}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
if (root)
- mutex_unlock(&root->mutex);
+ up_write(&root->rwsem);
}
/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
anon_vma_unlock(anon_vma);
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
/*
* Iterate the list once more, it now only contains empty and unlinked
* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
- * needing to acquire the anon_vma->root->mutex.
+ * needing to write-acquire the anon_vma->root->rwsem.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
- mutex_init(&anon_vma->mutex);
+ init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT;
}
@@ -442,7 +442,7 @@ out:
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = ACCESS_ONCE(anon_vma->root);
- if (mutex_trylock(&root_anon_vma->mutex)) {
+ if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
*/
if (!page_mapped(page)) {
- mutex_unlock(&root_anon_vma->mutex);
+ up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
/* we pinned the anon_vma, its safe to sleep */
rcu_read_unlock();
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
if (atomic_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
- * we'll deadlock on the anon_vma_lock() recursion.
+ * we'll deadlock on the anon_vma_lock_write() recursion.
*/
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
__put_anon_vma(anon_vma);
anon_vma = NULL;
}
@@ -504,9 +504,9 @@ out:
return anon_vma;
}
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
/*
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page,
struct anon_vma_chain *avc;
int referenced = 0;
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return referenced;
@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page,
break;
}
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
return referenced;
}
@@ -1315,7 +1315,7 @@ out_mlock:
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
+ * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
* if trylock failed, the page remain in evictable lru and later
* vmscan could retry to move the page to unevictable lru if the
* page is actually mlocked.
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return ret;
@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
break;
}
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
return ret;
}
@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
int ret = SWAP_AGAIN;
/*
- * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+ * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
* are holding mmap_sem. Users without mmap_sem are required to
* take a reference count to prevent the anon_vma disappearing
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
anon_vma = page_anon_vma(page);
if (!anon_vma)
return ret;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (ret != SWAP_AGAIN)
break;
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
return ret;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 03f9ba8fb8e..5c90d84c2b0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1719,7 +1719,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
* llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
*/
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
- pgoff_t index, pgoff_t end, int origin)
+ pgoff_t index, pgoff_t end, int whence)
{
struct page *page;
struct pagevec pvec;
@@ -1733,13 +1733,13 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
pvec.nr, pvec.pages, indices);
if (!pvec.nr) {
- if (origin == SEEK_DATA)
+ if (whence == SEEK_DATA)
index = end;
break;
}
for (i = 0; i < pvec.nr; i++, index++) {
if (index < indices[i]) {
- if (origin == SEEK_HOLE) {
+ if (whence == SEEK_HOLE) {
done = true;
break;
}
@@ -1751,8 +1751,8 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
page = NULL;
}
if (index >= end ||
- (page && origin == SEEK_DATA) ||
- (!page && origin == SEEK_HOLE)) {
+ (page && whence == SEEK_DATA) ||
+ (!page && whence == SEEK_HOLE)) {
done = true;
break;
}
@@ -1765,15 +1765,15 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
return index;
}
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
pgoff_t start, end;
loff_t new_offset;
- if (origin != SEEK_DATA && origin != SEEK_HOLE)
- return generic_file_llseek_size(file, offset, origin,
+ if (whence != SEEK_DATA && whence != SEEK_HOLE)
+ return generic_file_llseek_size(file, offset, whence,
MAX_LFS_FILESIZE, i_size_read(inode));
mutex_lock(&inode->i_mutex);
/* We're holding i_mutex so we can access i_size directly */
@@ -1785,12 +1785,12 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
else {
start = offset >> PAGE_CACHE_SHIFT;
end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- new_offset = shmem_seek_hole_data(mapping, start, end, origin);
+ new_offset = shmem_seek_hole_data(mapping, start, end, whence);
new_offset <<= PAGE_CACHE_SHIFT;
if (new_offset > offset) {
if (new_offset < inode->i_size)
offset = new_offset;
- else if (origin == SEEK_DATA)
+ else if (whence == SEEK_DATA)
offset = -ENXIO;
else
offset = inode->i_size;
diff --git a/mm/slab.c b/mm/slab.c
index 33d3363658d..e7667a3584b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,7 +87,6 @@
*/
#include <linux/slab.h>
-#include "slab.h"
#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/swap.h>
@@ -128,6 +127,8 @@
#include "internal.h"
+#include "slab.h"
+
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -162,23 +163,6 @@
*/
static bool pfmemalloc_active __read_mostly;
-/* Legal flag mask for kmem_cache_create(). */
-#if DEBUG
-# define CREATE_MASK (SLAB_RED_ZONE | \
- SLAB_POISON | SLAB_HWCACHE_ALIGN | \
- SLAB_CACHE_DMA | \
- SLAB_STORE_USER | \
- SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
- SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
-#else
-# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
- SLAB_CACHE_DMA | \
- SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
- SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
-#endif
-
/*
* kmem_bufctl_t:
*
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = {
#undef CACHE
};
-static struct arraycache_init initarray_cache __initdata =
- { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
-static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
static struct kmem_cache kmem_cache_boot = {
- .nodelists = kmem_cache_nodelists,
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q)
}
}
+static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
+{
+ struct kmem_list3 *l3;
+ l3 = cachep->nodelists[q];
+ if (!l3)
+ return;
+
+ slab_set_lock_classes(cachep, &on_slab_l3_key,
+ &on_slab_alc_key, q);
+}
+
+static inline void on_slab_lock_classes(struct kmem_cache *cachep)
+{
+ int node;
+
+ VM_BUG_ON(OFF_SLAB(cachep));
+ for_each_node(node)
+ on_slab_lock_classes_node(cachep, node);
+}
+
static inline void init_lock_keys(void)
{
int node;
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void)
{
}
+static inline void on_slab_lock_classes(struct kmem_cache *cachep)
+{
+}
+
+static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
+{
+}
+
static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
{
}
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
free_alien_cache(alien);
if (cachep->flags & SLAB_DEBUG_OBJECTS)
slab_set_debugobj_lock_classes_node(cachep, node);
+ else if (!OFF_SLAB(cachep) &&
+ !(cachep->flags & SLAB_DESTROY_BY_RCU))
+ on_slab_lock_classes_node(cachep, node);
}
init_node_lock_keys(node);
@@ -1577,28 +1588,33 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
}
/*
+ * The memory after the last cpu cache pointer is used for the
+ * the nodelists pointer.
+ */
+static void setup_nodelists_pointer(struct kmem_cache *cachep)
+{
+ cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
+}
+
+/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
*/
void __init kmem_cache_init(void)
{
- size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
int i;
- int order;
- int node;
kmem_cache = &kmem_cache_boot;
+ setup_nodelists_pointer(kmem_cache);
if (num_possible_nodes() == 1)
use_alien_caches = 0;
- for (i = 0; i < NUM_INIT_LISTS; i++) {
+ for (i = 0; i < NUM_INIT_LISTS; i++)
kmem_list3_init(&initkmem_list3[i]);
- if (i < MAX_NUMNODES)
- kmem_cache->nodelists[i] = NULL;
- }
+
set_up_list3s(kmem_cache, CACHE_CACHE);
/*
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void)
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
- node = numa_mem_id();
-
/* 1) create the kmem_cache */
- INIT_LIST_HEAD(&slab_caches);
- list_add(&kmem_cache->list, &slab_caches);
- kmem_cache->colour_off = cache_line_size();
- kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
- kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
/*
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
- kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
- nr_node_ids * sizeof(struct kmem_list3 *);
- kmem_cache->object_size = kmem_cache->size;
- kmem_cache->size = ALIGN(kmem_cache->object_size,
- cache_line_size());
- kmem_cache->reciprocal_buffer_size =
- reciprocal_value(kmem_cache->size);
-
- for (order = 0; order < MAX_ORDER; order++) {
- cache_estimate(order, kmem_cache->size,
- cache_line_size(), 0, &left_over, &kmem_cache->num);
- if (kmem_cache->num)
- break;
- }
- BUG_ON(!kmem_cache->num);
- kmem_cache->gfporder = order;
- kmem_cache->colour = left_over / kmem_cache->colour_off;
- kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
- sizeof(struct slab), cache_line_size());
+ create_boot_cache(kmem_cache, "kmem_cache",
+ offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ nr_node_ids * sizeof(struct kmem_list3 *),
+ SLAB_HWCACHE_ALIGN);
+ list_add(&kmem_cache->list, &slab_caches);
/* 2+3) create the kmalloc caches */
sizes = malloc_sizes;
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void)
* bug.
*/
- sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
- sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
- sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
- sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
- sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
- __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
- list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
-
- if (INDEX_AC != INDEX_L3) {
- sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
- sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
- sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
- sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
- sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
- __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
- list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
- }
+ sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
+ sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
+
+ if (INDEX_AC != INDEX_L3)
+ sizes[INDEX_L3].cs_cachep =
+ create_kmalloc_cache(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
slab_early_init = 0;
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void)
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches.
*/
- if (!sizes->cs_cachep) {
- sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
- sizes->cs_cachep->name = names->name;
- sizes->cs_cachep->size = sizes->cs_size;
- sizes->cs_cachep->object_size = sizes->cs_size;
- sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
- __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
- list_add(&sizes->cs_cachep->list, &slab_caches);
- }
+ if (!sizes->cs_cachep)
+ sizes->cs_cachep = create_kmalloc_cache(names->name,
+ sizes->cs_size, ARCH_KMALLOC_FLAGS);
+
#ifdef CONFIG_ZONE_DMA
- sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
- sizes->cs_dmacachep->name = names->name_dma;
- sizes->cs_dmacachep->size = sizes->cs_size;
- sizes->cs_dmacachep->object_size = sizes->cs_size;
- sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
- __kmem_cache_create(sizes->cs_dmacachep,
- ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
- list_add(&sizes->cs_dmacachep->list, &slab_caches);
+ sizes->cs_dmacachep = create_kmalloc_cache(
+ names->name_dma, sizes->cs_size,
+ SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
#endif
sizes++;
names++;
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void)
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
- BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
memcpy(ptr, cpu_cache_get(kmem_cache),
sizeof(struct arraycache_init));
/*
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
if (page->pfmemalloc)
SetPageSlabPfmemalloc(page + i);
}
+ memcg_bind_pages(cachep, cachep->gfporder);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
__ClearPageSlab(page);
page++;
}
+
+ memcg_release_pages(cachep, cachep->gfporder);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- free_pages((unsigned long)addr, cachep->gfporder);
+ free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
if (slab_state == DOWN) {
/*
- * Note: the first kmem_cache_create must create the cache
+ * Note: Creation of first cache (kmem_cache).
+ * The setup_list3s is taken care
+ * of by the caller of __kmem_cache_create
+ */
+ cachep->array[smp_processor_id()] = &initarray_generic.cache;
+ slab_state = PARTIAL;
+ } else if (slab_state == PARTIAL) {
+ /*
+ * Note: the second kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
* further caches will BUG().
*/
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
/*
* If the cache that's used by kmalloc(sizeof(kmem_list3)) is
- * the first cache, then we need to set up all its list3s,
+ * the second cache, then we need to set up all its list3s,
* otherwise the creation of further caches will BUG().
*/
set_up_list3s(cachep, SIZE_AC);
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
else
slab_state = PARTIAL_ARRAYCACHE;
} else {
+ /* Remaining boot caches */
cachep->array[smp_processor_id()] =
kmalloc(sizeof(struct arraycache_init), gfp);
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
/**
* __kmem_cache_create - Create a cache.
- * @name: A string which is used in /proc/slabinfo to identify this cache.
- * @size: The size of objects to be created in this cache.
- * @align: The required alignment for the objects.
+ * @cachep: cache management descriptor
* @flags: SLAB flags
- * @ctor: A constructor for the objects.
*
* Returns a ptr to the cache on success, NULL on failure.
* Cannot be called within a int, but can be interrupted.
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (flags & SLAB_DESTROY_BY_RCU)
BUG_ON(flags & SLAB_POISON);
#endif
- /*
- * Always checks flags, a caller might be expecting debug support which
- * isn't available.
- */
- BUG_ON(flags & ~CREATE_MASK);
/*
* Check that size is in terms of words. This is needed to avoid
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
size &= ~(BYTES_PER_WORD - 1);
}
- /* calculate the final buffer alignment: */
-
- /* 1) arch recommendation: can be overridden for debug */
- if (flags & SLAB_HWCACHE_ALIGN) {
- /*
- * Default alignment: as specified by the arch code. Except if
- * an object is really small, then squeeze multiple objects into
- * one cacheline.
- */
- ralign = cache_line_size();
- while (size <= ralign / 2)
- ralign /= 2;
- } else {
- ralign = BYTES_PER_WORD;
- }
-
/*
* Redzoning and user store require word alignment or possibly larger.
* Note this will be overridden by architecture or caller mandated
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
size &= ~(REDZONE_ALIGN - 1);
}
- /* 2) arch mandated alignment */
- if (ralign < ARCH_SLAB_MINALIGN) {
- ralign = ARCH_SLAB_MINALIGN;
- }
/* 3) caller mandated alignment */
if (ralign < cachep->align) {
ralign = cachep->align;
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
else
gfp = GFP_NOWAIT;
- cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
+ setup_nodelists_pointer(cachep);
#if DEBUG
/*
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
slab_set_debugobj_lock_classes(cachep);
- }
+ } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
+ on_slab_lock_classes(cachep);
return 0;
}
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (slab_should_failslab(cachep, flags))
return NULL;
+ cachep = memcg_kmem_get_cache(cachep, flags);
+
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
if (slab_should_failslab(cachep, flags))
return NULL;
+ cachep = memcg_kmem_get_cache(cachep, flags);
+
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc);
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
unsigned long flags;
+ cachep = cache_from_obj(cachep, objp);
+ if (!cachep)
+ return;
local_irq_save(flags);
debug_check_no_locks_freed(objp, cachep->object_size);
@@ -3969,12 +3935,6 @@ void kfree(const void *objp)
}
EXPORT_SYMBOL(kfree);
-unsigned int kmem_cache_size(struct kmem_cache *cachep)
-{
- return cachep->object_size;
-}
-EXPORT_SYMBOL(kmem_cache_size);
-
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info)
}
/* Always called with the slab_mutex held */
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
struct ccupdate_struct *new;
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
return alloc_kmemlist(cachep, gfp);
}
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared, gfp_t gfp)
+{
+ int ret;
+ struct kmem_cache *c = NULL;
+ int i = 0;
+
+ ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+
+ if (slab_state < FULL)
+ return ret;
+
+ if ((ret < 0) || !is_root_cache(cachep))
+ return ret;
+
+ VM_BUG_ON(!mutex_is_locked(&slab_mutex));
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg(cachep, i);
+ if (c)
+ /* return value determined by the parent cache only */
+ __do_tune_cpucache(c, limit, batchcount, shared, gfp);
+ }
+
+ return ret;
+}
+
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
- int limit, shared;
+ int limit = 0;
+ int shared = 0;
+ int batchcount = 0;
+
+ if (!is_root_cache(cachep)) {
+ struct kmem_cache *root = memcg_root_cache(cachep);
+ limit = root->limit;
+ shared = root->shared;
+ batchcount = root->batchcount;
+ }
+ if (limit && shared && batchcount)
+ goto skip_setup;
/*
* The head array serves three purposes:
* - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (limit > 32)
limit = 32;
#endif
- err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
+ batchcount = (limit + 1) / 2;
+skip_setup:
+ err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
cachep->name, -err);
@@ -4276,54 +4275,8 @@ out:
}
#ifdef CONFIG_SLABINFO
-
-static void print_slabinfo_header(struct seq_file *m)
-{
- /*
- * Output format version, so at least we can change it
- * without _too_ many complaints.
- */
-#if STATS
- seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
-#else
- seq_puts(m, "slabinfo - version: 2.1\n");
-#endif
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
- "<objperslab> <pagesperslab>");
- seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
- seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
-#if STATS
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
- "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
- seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
-#endif
- seq_putc(m, '\n');
-}
-
-static void *s_start(struct seq_file *m, loff_t *pos)
-{
- loff_t n = *pos;
-
- mutex_lock(&slab_mutex);
- if (!n)
- print_slabinfo_header(m);
-
- return seq_list_start(&slab_caches, *pos);
-}
-
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
- return seq_list_next(p, &slab_caches, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&slab_mutex);
-}
-
-static int s_show(struct seq_file *m, void *p)
-{
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p)
if (error)
printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
- seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
- name, active_objs, num_objs, cachep->size,
- cachep->num, (1 << cachep->gfporder));
- seq_printf(m, " : tunables %4u %4u %4u",
- cachep->limit, cachep->batchcount, cachep->shared);
- seq_printf(m, " : slabdata %6lu %6lu %6lu",
- active_slabs, num_slabs, shared_avail);
+ sinfo->active_objs = active_objs;
+ sinfo->num_objs = num_objs;
+ sinfo->active_slabs = active_slabs;
+ sinfo->num_slabs = num_slabs;
+ sinfo->shared_avail = shared_avail;
+ sinfo->limit = cachep->limit;
+ sinfo->batchcount = cachep->batchcount;
+ sinfo->shared = cachep->shared;
+ sinfo->objects_per_slab = cachep->num;
+ sinfo->cache_order = cachep->gfporder;
+}
+
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
+{
#if STATS
{ /* list3 stats */
unsigned long high = cachep->high_mark;
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p)
allochit, allocmiss, freehit, freemiss);
}
#endif
- seq_putc(m, '\n');
- return 0;
}
-/*
- * slabinfo_op - iterator that generates /proc/slabinfo
- *
- * Output layout:
- * cache-name
- * num-active-objs
- * total-objs
- * object size
- * num-active-slabs
- * total-slabs
- * num-pages-per-slab
- * + further values on SMP and with statistics enabled
- */
-
-static const struct seq_operations slabinfo_op = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
#define MAX_SLABINFO_WRITE 128
/**
* slabinfo_write - Tuning for the slab allocator
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = {
* @count: data length
* @ppos: unused
*/
-static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
return res;
}
-static int slabinfo_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &slabinfo_op);
-}
-
-static const struct file_operations proc_slabinfo_operations = {
- .open = slabinfo_open,
- .read = seq_read,
- .write = slabinfo_write,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#ifdef CONFIG_DEBUG_SLAB_LEAK
static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p)
return 0;
}
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex);
+}
+
static const struct seq_operations slabstats_op = {
.start = leaks_start,
.next = s_next,
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = {
static int __init slab_proc_init(void)
{
- proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
#ifdef CONFIG_DEBUG_SLAB_LEAK
proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
diff --git a/mm/slab.h b/mm/slab.h
index 7deeb449a30..34a98d64219 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -32,19 +32,201 @@ extern struct list_head slab_caches;
/* The slab cache that manages slab cache information */
extern struct kmem_cache *kmem_cache;
+unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size);
+
/* Functions provided by the slab allocators */
extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
+extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
+ unsigned long flags);
+extern void create_boot_cache(struct kmem_cache *, const char *name,
+ size_t size, unsigned long flags);
+
+struct mem_cgroup;
#ifdef CONFIG_SLUB
-struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *));
+struct kmem_cache *
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *));
#else
-static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+static inline struct kmem_cache *
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *))
{ return NULL; }
#endif
+/* Legal flag mask for kmem_cache_create(), for various configurations */
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
+ SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
+
+#if defined(CONFIG_DEBUG_SLAB)
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+#elif defined(CONFIG_SLUB_DEBUG)
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DEBUG_FREE)
+#else
+#define SLAB_DEBUG_FLAGS (0)
+#endif
+
+#if defined(CONFIG_SLAB)
+#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
+#elif defined(CONFIG_SLUB)
+#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_NOTRACK)
+#else
+#define SLAB_CACHE_FLAGS (0)
+#endif
+
+#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
+
int __kmem_cache_shutdown(struct kmem_cache *);
+struct seq_file;
+struct file;
+
+struct slabinfo {
+ unsigned long active_objs;
+ unsigned long num_objs;
+ unsigned long active_slabs;
+ unsigned long num_slabs;
+ unsigned long shared_avail;
+ unsigned int limit;
+ unsigned int batchcount;
+ unsigned int shared;
+ unsigned int objects_per_slab;
+ unsigned int cache_order;
+};
+
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos);
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool is_root_cache(struct kmem_cache *s)
+{
+ return !s->memcg_params || s->memcg_params->is_root_cache;
+}
+
+static inline bool cache_match_memcg(struct kmem_cache *cachep,
+ struct mem_cgroup *memcg)
+{
+ return (is_root_cache(cachep) && !memcg) ||
+ (cachep->memcg_params->memcg == memcg);
+}
+
+static inline void memcg_bind_pages(struct kmem_cache *s, int order)
+{
+ if (!is_root_cache(s))
+ atomic_add(1 << order, &s->memcg_params->nr_pages);
+}
+
+static inline void memcg_release_pages(struct kmem_cache *s, int order)
+{
+ if (is_root_cache(s))
+ return;
+
+ if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
+ mem_cgroup_destroy_cache(s);
+}
+
+static inline bool slab_equal_or_root(struct kmem_cache *s,
+ struct kmem_cache *p)
+{
+ return (p == s) ||
+ (s->memcg_params && (p == s->memcg_params->root_cache));
+}
+
+/*
+ * We use suffixes to the name in memcg because we can't have caches
+ * created in the system with the same name. But when we print them
+ * locally, better refer to them with the base name
+ */
+static inline const char *cache_name(struct kmem_cache *s)
+{
+ if (!is_root_cache(s))
+ return s->memcg_params->root_cache->name;
+ return s->name;
+}
+
+static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+{
+ return s->memcg_params->memcg_caches[idx];
+}
+
+static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+{
+ if (is_root_cache(s))
+ return s;
+ return s->memcg_params->root_cache;
+}
+#else
+static inline bool is_root_cache(struct kmem_cache *s)
+{
+ return true;
+}
+
+static inline bool cache_match_memcg(struct kmem_cache *cachep,
+ struct mem_cgroup *memcg)
+{
+ return true;
+}
+
+static inline void memcg_bind_pages(struct kmem_cache *s, int order)
+{
+}
+
+static inline void memcg_release_pages(struct kmem_cache *s, int order)
+{
+}
+
+static inline bool slab_equal_or_root(struct kmem_cache *s,
+ struct kmem_cache *p)
+{
+ return true;
+}
+
+static inline const char *cache_name(struct kmem_cache *s)
+{
+ return s->name;
+}
+
+static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+{
+ return NULL;
+}
+
+static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+{
+ return s;
+}
+#endif
+
+static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
+{
+ struct kmem_cache *cachep;
+ struct page *page;
+
+ /*
+ * When kmemcg is not being used, both assignments should return the
+ * same value. but we don't want to pay the assignment price in that
+ * case. If it is not compiled in, the compiler should be smart enough
+ * to not do even the assignment. In that case, slab_equal_or_root
+ * will also be a constant.
+ */
+ if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
+ return s;
+
+ page = virt_to_head_page(x);
+ cachep = page->slab_cache;
+ if (slab_equal_or_root(cachep, s))
+ return cachep;
+
+ pr_err("%s: Wrong slab cache. %s but object is from %s\n",
+ __FUNCTION__, cachep->name, s->name);
+ WARN_ON_ONCE(1);
+ return s;
+}
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 069a24e6440..3f3cd97d3fd 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -13,9 +13,12 @@
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
+#include <linux/memcontrol.h>
#include "slab.h"
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
#ifdef CONFIG_DEBUG_VM
-static int kmem_cache_sanity_check(const char *name, size_t size)
+static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
+ size_t size)
{
struct kmem_cache *s = NULL;
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
continue;
}
- if (!strcmp(s->name, name)) {
+ /*
+ * For simplicity, we won't check this in the list of memcg
+ * caches. We have control over memcg naming, and if there
+ * aren't duplicates in the global list, there won't be any
+ * duplicates in the memcg lists as well.
+ */
+ if (!memcg && !strcmp(s->name, name)) {
pr_err("%s (%s): Cache name already exists.\n",
__func__, name);
dump_stack();
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
return 0;
}
#else
-static inline int kmem_cache_sanity_check(const char *name, size_t size)
+static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
+ const char *name, size_t size)
{
return 0;
}
#endif
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_update_all_caches(int num_memcgs)
+{
+ struct kmem_cache *s;
+ int ret = 0;
+ mutex_lock(&slab_mutex);
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!is_root_cache(s))
+ continue;
+
+ ret = memcg_update_cache_size(s, num_memcgs);
+ /*
+ * See comment in memcontrol.c, memcg_update_cache_size:
+ * Instead of freeing the memory, we'll just leave the caches
+ * up to this point in an updated state.
+ */
+ if (ret)
+ goto out;
+ }
+
+ memcg_update_array_size(num_memcgs);
+out:
+ mutex_unlock(&slab_mutex);
+ return ret;
+}
+#endif
+
+/*
+ * Figure out what the alignment of the objects will be given a set of
+ * flags, a user specified alignment and the size of the objects.
+ */
+unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size)
+{
+ /*
+ * If the user wants hardware cache aligned objects then follow that
+ * suggestion if the object is sufficiently large.
+ *
+ * The hardware cache alignment cannot override the specified
+ * alignment though. If that is greater then use it.
+ */
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned long ralign = cache_line_size();
+ while (size <= ralign / 2)
+ ralign /= 2;
+ align = max(align, ralign);
+ }
+
+ if (align < ARCH_SLAB_MINALIGN)
+ align = ARCH_SLAB_MINALIGN;
+
+ return ALIGN(align, sizeof(void *));
+}
+
+
/*
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
* as davem.
*/
-struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+struct kmem_cache *
+kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *),
+ struct kmem_cache *parent_cache)
{
struct kmem_cache *s = NULL;
int err = 0;
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
get_online_cpus();
mutex_lock(&slab_mutex);
- if (!kmem_cache_sanity_check(name, size) == 0)
+ if (!kmem_cache_sanity_check(memcg, name, size) == 0)
goto out_locked;
+ /*
+ * Some allocators will constraint the set of valid flags to a subset
+ * of all flags. We expect them to define CACHE_CREATE_MASK in this
+ * case, and we'll just provide them with a sanitized version of the
+ * passed flags.
+ */
+ flags &= CACHE_CREATE_MASK;
- s = __kmem_cache_alias(name, size, align, flags, ctor);
+ s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
goto out_locked;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (s) {
s->object_size = s->size = size;
- s->align = align;
+ s->align = calculate_alignment(flags, align, size);
s->ctor = ctor;
+
+ if (memcg_register_cache(memcg, s, parent_cache)) {
+ kmem_cache_free(kmem_cache, s);
+ err = -ENOMEM;
+ goto out_locked;
+ }
+
s->name = kstrdup(name, GFP_KERNEL);
if (!s->name) {
kmem_cache_free(kmem_cache, s);
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
err = __kmem_cache_create(s, flags);
if (!err) {
-
s->refcount = 1;
list_add(&s->list, &slab_caches);
-
+ memcg_cache_list_add(memcg, s);
} else {
kfree(s->name);
kmem_cache_free(kmem_cache, s);
@@ -157,10 +239,20 @@ out_locked:
return s;
}
+
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
+{
+ return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
+}
EXPORT_SYMBOL(kmem_cache_create);
void kmem_cache_destroy(struct kmem_cache *s)
{
+ /* Destroy all the children caches if we aren't a memcg cache */
+ kmem_cache_destroy_memcg_children(s);
+
get_online_cpus();
mutex_lock(&slab_mutex);
s->refcount--;
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
+ memcg_release_cache(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
} else {
@@ -192,3 +285,182 @@ int slab_is_available(void)
{
return slab_state >= UP;
}
+
+#ifndef CONFIG_SLOB
+/* Create a cache during boot when no slab services are available yet */
+void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
+ unsigned long flags)
+{
+ int err;
+
+ s->name = name;
+ s->size = s->object_size = size;
+ s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+ err = __kmem_cache_create(s, flags);
+
+ if (err)
+ panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
+ name, size, err);
+
+ s->refcount = -1; /* Exempt from merging for now */
+}
+
+struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
+ unsigned long flags)
+{
+ struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+
+ if (!s)
+ panic("Out of memory when creating slab %s\n", name);
+
+ create_boot_cache(s, name, size, flags);
+ list_add(&s->list, &slab_caches);
+ s->refcount = 1;
+ return s;
+}
+
+#endif /* !CONFIG_SLOB */
+
+
+#ifdef CONFIG_SLABINFO
+void print_slabinfo_header(struct seq_file *m)
+{
+ /*
+ * Output format version, so at least we can change it
+ * without _too_ many complaints.
+ */
+#ifdef CONFIG_DEBUG_SLAB
+ seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+#else
+ seq_puts(m, "slabinfo - version: 2.1\n");
+#endif
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
+ "<objperslab> <pagesperslab>");
+ seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+#ifdef CONFIG_DEBUG_SLAB
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
+ "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+#endif
+ seq_putc(m, '\n');
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+
+ mutex_lock(&slab_mutex);
+ if (!n)
+ print_slabinfo_header(m);
+
+ return seq_list_start(&slab_caches, *pos);
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex);
+}
+
+static void
+memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
+{
+ struct kmem_cache *c;
+ struct slabinfo sinfo;
+ int i;
+
+ if (!is_root_cache(s))
+ return;
+
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg(s, i);
+ if (!c)
+ continue;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ get_slabinfo(c, &sinfo);
+
+ info->active_slabs += sinfo.active_slabs;
+ info->num_slabs += sinfo.num_slabs;
+ info->shared_avail += sinfo.shared_avail;
+ info->active_objs += sinfo.active_objs;
+ info->num_objs += sinfo.num_objs;
+ }
+}
+
+int cache_show(struct kmem_cache *s, struct seq_file *m)
+{
+ struct slabinfo sinfo;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ get_slabinfo(s, &sinfo);
+
+ memcg_accumulate_slabinfo(s, &sinfo);
+
+ seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+ cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
+ sinfo.objects_per_slab, (1 << sinfo.cache_order));
+
+ seq_printf(m, " : tunables %4u %4u %4u",
+ sinfo.limit, sinfo.batchcount, sinfo.shared);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
+ slabinfo_show_stats(m, s);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+ if (!is_root_cache(s))
+ return 0;
+ return cache_show(s, m);
+}
+
+/*
+ * slabinfo_op - iterator that generates /proc/slabinfo
+ *
+ * Output layout:
+ * cache-name
+ * num-active-objs
+ * total-objs
+ * object size
+ * num-active-slabs
+ * total-slabs
+ * num-pages-per-slab
+ * + further values on SMP and with statistics enabled
+ */
+static const struct seq_operations slabinfo_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &slabinfo_op);
+}
+
+static const struct file_operations proc_slabinfo_operations = {
+ .open = slabinfo_open,
+ .read = seq_read,
+ .write = slabinfo_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init slab_proc_init(void)
+{
+ proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
+ return 0;
+}
+module_init(slab_proc_init);
+#endif /* CONFIG_SLABINFO */
diff --git a/mm/slob.c b/mm/slob.c
index 1e921c5e957..a99fdf7a090 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -28,9 +28,8 @@
* from kmalloc are prepended with a 4-byte header with the kmalloc size.
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
* alloc_pages() directly, allocating compound pages so the page order
- * does not have to be separately tracked, and also stores the exact
- * allocation size in page->private so that it can be used to accurately
- * provide ksize(). These objects are detected in kfree() because slob_page()
+ * does not have to be separately tracked.
+ * These objects are detected in kfree() because PageSlab()
* is false for them.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
@@ -59,7 +58,6 @@
#include <linux/kernel.h>
#include <linux/slab.h>
-#include "slab.h"
#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
@@ -74,6 +72,7 @@
#include <linux/atomic.h>
+#include "slab.h"
/*
* slob_block has a field 'units', which indicates size of block if +ve,
* or offset of next block if -ve (in SLOB_UNITs).
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp)
#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
-#define SLOB_ALIGN L1_CACHE_BYTES
/*
* struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
if (likely(order))
gfp |= __GFP_COMP;
ret = slob_new_pages(gfp, order, node);
- if (ret) {
- struct page *page;
- page = virt_to_page(ret);
- page->private = size;
- }
trace_kmalloc_node(caller, ret,
size, PAGE_SIZE << order, gfp, node);
@@ -506,7 +499,7 @@ void kfree(const void *block)
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
} else
- put_page(sp);
+ __free_pages(sp, compound_order(sp));
}
EXPORT_SYMBOL(kfree);
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree);
size_t ksize(const void *block)
{
struct page *sp;
+ int align;
+ unsigned int *m;
BUG_ON(!block);
if (unlikely(block == ZERO_SIZE_PTR))
return 0;
sp = virt_to_page(block);
- if (PageSlab(sp)) {
- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
- unsigned int *m = (unsigned int *)(block - align);
- return SLOB_UNITS(*m) * SLOB_UNIT;
- } else
- return sp->private;
+ if (unlikely(!PageSlab(sp)))
+ return PAGE_SIZE << compound_order(sp);
+
+ align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ m = (unsigned int *)(block - align);
+ return SLOB_UNITS(*m) * SLOB_UNIT;
}
EXPORT_SYMBOL(ksize);
int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
{
- size_t align = c->size;
-
if (flags & SLAB_DESTROY_BY_RCU) {
/* leave room for rcu footer at the end of object */
c->size += sizeof(struct slob_rcu);
}
c->flags = flags;
- /* ignore alignment unless it's forced */
- c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
- if (c->align < ARCH_SLAB_MINALIGN)
- c->align = ARCH_SLAB_MINALIGN;
- if (c->align < align)
- c->align = align;
-
return 0;
}
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
PAGE_SIZE << get_order(c->size),
flags, node);
}
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
}
EXPORT_SYMBOL(kmem_cache_free);
-unsigned int kmem_cache_size(struct kmem_cache *c)
-{
- return c->size;
-}
-EXPORT_SYMBOL(kmem_cache_size);
-
int __kmem_cache_shutdown(struct kmem_cache *c)
{
/* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index 487f0bdd53c..ba2ca53f6c3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -31,6 +31,7 @@
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
+#include <linux/memcontrol.h>
#include <trace/events/kmem.h>
@@ -112,9 +113,6 @@
* the fast path and disables lockless freelists.
*/
-#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DEBUG_FREE)
-
static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#define __OBJECT_POISON 0x80000000UL /* Poison object */
#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
-static int kmem_size = sizeof(struct kmem_cache);
-
#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
-
+static void memcg_propagate_slab_attrs(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s) { }
+static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
goto out;
- if (unlikely(s != page->slab)) {
+ if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
slab_err(s, page, "Attempt to free object(0x%p) "
"outside of slab", object);
- } else if (!page->slab) {
+ } else if (!page->slab_cache) {
printk(KERN_ERR
"SLUB <none>: no slab for object 0x%p.\n",
object);
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
void *start;
void *last;
void *p;
+ int order;
BUG_ON(flags & GFP_SLAB_BUG_MASK);
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
goto out;
+ order = compound_order(page);
inc_slabs_node(s, page_to_nid(page), page->objects);
- page->slab = s;
+ memcg_bind_pages(s, order);
+ page->slab_cache = s;
__SetPageSlab(page);
if (page->pfmemalloc)
SetPageSlabPfmemalloc(page);
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
start = page_address(page);
if (unlikely(s->flags & SLAB_POISON))
- memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
+ memset(start, POISON_INUSE, PAGE_SIZE << order);
last = start;
for_each_object(p, s, start, page->objects) {
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
+
+ memcg_release_pages(s, order);
reset_page_mapcount(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_pages(page, order);
+ __free_memcg_kmem_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
else
page = container_of((struct list_head *)h, struct page, lru);
- __free_slab(page->slab, page);
+ __free_slab(page->slab_cache, page);
}
static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1872,12 +1874,14 @@ redo:
/*
* Unfreeze all the cpu partial slabs.
*
- * This function must be called with interrupt disabled.
+ * This function must be called with interrupts disabled
+ * for the cpu using c (or some other guarantee must be there
+ * to guarantee no concurrent accesses).
*/
-static void unfreeze_partials(struct kmem_cache *s)
+static void unfreeze_partials(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
{
struct kmem_cache_node *n = NULL, *n2 = NULL;
- struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
struct page *page, *discard_page = NULL;
while ((page = c->partial)) {
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
* set to the per node partial list.
*/
local_irq_save(flags);
- unfreeze_partials(s);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
local_irq_restore(flags);
oldpage = NULL;
pobjects = 0;
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
if (c->page)
flush_slab(s, c);
- unfreeze_partials(s);
+ unfreeze_partials(s, c);
}
}
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
+ s = memcg_kmem_get_cache(s, gfpflags);
redo:
/*
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
void *prior;
void **object = (void *)x;
int was_frozen;
- int inuse;
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
return;
do {
+ if (unlikely(n)) {
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ n = NULL;
+ }
prior = page->freelist;
counters = page->counters;
set_freepointer(s, object, prior);
new.counters = counters;
was_frozen = new.frozen;
new.inuse--;
- if ((!new.inuse || !prior) && !was_frozen && !n) {
+ if ((!new.inuse || !prior) && !was_frozen) {
if (!kmem_cache_debug(s) && !prior)
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
}
}
- inuse = new.inuse;
} while (!cmpxchg_double_slab(s, page,
prior, counters,
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
return;
}
+ if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+ goto slab_empty;
+
/*
- * was_frozen may have been set after we acquired the list_lock in
- * an earlier loop. So we need to check it here again.
+ * Objects left in the slab. If it was not on the partial list before
+ * then add it.
*/
- if (was_frozen)
- stat(s, FREE_FROZEN);
- else {
- if (unlikely(!inuse && n->nr_partial > s->min_partial))
- goto slab_empty;
-
- /*
- * Objects left in the slab. If it was not on the partial list before
- * then add it.
- */
- if (unlikely(!prior)) {
- remove_full(s, page);
- add_partial(n, page, DEACTIVATE_TO_TAIL);
- stat(s, FREE_ADD_PARTIAL);
- }
+ if (kmem_cache_debug(s) && unlikely(!prior)) {
+ remove_full(s, page);
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
return;
@@ -2619,19 +2618,10 @@ redo:
void kmem_cache_free(struct kmem_cache *s, void *x)
{
- struct page *page;
-
- page = virt_to_head_page(x);
-
- if (kmem_cache_debug(s) && page->slab != s) {
- pr_err("kmem_cache_free: Wrong slab cache. %s but object"
- " is from %s\n", page->slab->name, s->name);
- WARN_ON_ONCE(1);
+ s = cache_from_obj(s, x);
+ if (!s)
return;
- }
-
- slab_free(s, page, x, _RET_IP_);
-
+ slab_free(s, virt_to_head_page(x), x, _RET_IP_);
trace_kmem_cache_free(_RET_IP_, x);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved)
return -ENOSYS;
}
-/*
- * Figure out what the alignment of the objects will be.
- */
-static unsigned long calculate_alignment(unsigned long flags,
- unsigned long align, unsigned long size)
-{
- /*
- * If the user wants hardware cache aligned objects then follow that
- * suggestion if the object is sufficiently large.
- *
- * The hardware cache alignment cannot override the specified
- * alignment though. If that is greater then use it.
- */
- if (flags & SLAB_HWCACHE_ALIGN) {
- unsigned long ralign = cache_line_size();
- while (size <= ralign / 2)
- ralign /= 2;
- align = max(align, ralign);
- }
-
- if (align < ARCH_SLAB_MINALIGN)
- align = ARCH_SLAB_MINALIGN;
-
- return ALIGN(align, sizeof(void *));
-}
-
static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
unsigned long flags = s->flags;
unsigned long size = s->object_size;
- unsigned long align = s->align;
int order;
/*
@@ -3000,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
#endif
/*
- * Determine the alignment based on various parameters that the
- * user specified and the dynamic determination of cache line size
- * on bootup.
- */
- align = calculate_alignment(flags, align, s->object_size);
- s->align = align;
-
- /*
* SLUB stores one object immediately after another beginning from
* offset 0. In order to align the objects we have to simply size
* each object to conform to the alignment.
*/
- size = ALIGN(size, align);
+ size = ALIGN(size, s->align);
s->size = size;
if (forced_order >= 0)
order = forced_order;
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
s->max = s->oo;
return !!oo_objects(s->oo);
-
}
static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
@@ -3127,15 +3081,6 @@ error:
return -EINVAL;
}
-/*
- * Determine the size of a slab object
- */
-unsigned int kmem_cache_size(struct kmem_cache *s)
-{
- return s->object_size;
-}
-EXPORT_SYMBOL(kmem_cache_size);
-
static void list_slab_objects(struct kmem_cache *s, struct page *page,
const char *text)
{
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
{
int rc = kmem_cache_close(s);
- if (!rc)
+ if (!rc) {
+ /*
+ * We do the same lock strategy around sysfs_slab_add, see
+ * __kmem_cache_create. Because this is pretty much the last
+ * operation we do and the lock will be released shortly after
+ * that in slab_common.c, we could just move sysfs_slab_remove
+ * to a later point in common code. We should do that when we
+ * have a common sysfs framework for all allocators.
+ */
+ mutex_unlock(&slab_mutex);
sysfs_slab_remove(s);
+ mutex_lock(&slab_mutex);
+ }
return rc;
}
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str)
__setup("slub_nomerge", setup_slub_nomerge);
-static struct kmem_cache *__init create_kmalloc_cache(const char *name,
- int size, unsigned int flags)
-{
- struct kmem_cache *s;
-
- s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
-
- s->name = name;
- s->size = s->object_size = size;
- s->align = ARCH_KMALLOC_MINALIGN;
-
- /*
- * This function is called with IRQs disabled during early-boot on
- * single CPU so there's no need to take slab_mutex here.
- */
- if (kmem_cache_open(s, flags))
- goto panic;
-
- list_add(&s->list, &slab_caches);
- return s;
-
-panic:
- panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
- return NULL;
-}
-
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
struct page *page;
void *ptr = NULL;
- flags |= __GFP_COMP | __GFP_NOTRACK;
+ flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
page = alloc_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object)
return PAGE_SIZE << compound_order(page);
}
- return slab_ksize(page->slab);
+ return slab_ksize(page->slab_cache);
}
EXPORT_SYMBOL(ksize);
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x)
}
slab_lock(page);
- if (on_freelist(page->slab, page, object)) {
- object_err(page->slab, page, object, "Object is on free-list");
+ if (on_freelist(page->slab_cache, page, object)) {
+ object_err(page->slab_cache, page, object, "Object is on free-list");
rv = false;
} else {
rv = true;
@@ -3478,10 +3408,10 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kmemleak_free(x);
- __free_pages(page, compound_order(page));
+ __free_memcg_kmem_pages(page, compound_order(page));
return;
}
- slab_free(page->slab, page, object, _RET_IP_);
+ slab_free(page->slab_cache, page, object, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self,
/*
* Used for early kmem_cache structures that were allocated using
- * the page allocator
+ * the page allocator. Allocate them properly then fix up the pointers
+ * that may be pointing to the wrong kmem_cache structure.
*/
-static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
+static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
int node;
+ struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
- list_add(&s->list, &slab_caches);
- s->refcount = -1;
+ memcpy(s, static_cache, kmem_cache->object_size);
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
if (n) {
list_for_each_entry(p, &n->partial, lru)
- p->slab = s;
+ p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
list_for_each_entry(p, &n->full, lru)
- p->slab = s;
+ p->slab_cache = s;
#endif
}
}
+ list_add(&s->list, &slab_caches);
+ return s;
}
void __init kmem_cache_init(void)
{
+ static __initdata struct kmem_cache boot_kmem_cache,
+ boot_kmem_cache_node;
int i;
- int caches = 0;
- struct kmem_cache *temp_kmem_cache;
- int order;
- struct kmem_cache *temp_kmem_cache_node;
- unsigned long kmalloc_size;
+ int caches = 2;
if (debug_guardpage_minorder())
slub_max_order = 0;
- kmem_size = offsetof(struct kmem_cache, node) +
- nr_node_ids * sizeof(struct kmem_cache_node *);
-
- /* Allocate two kmem_caches from the page allocator */
- kmalloc_size = ALIGN(kmem_size, cache_line_size());
- order = get_order(2 * kmalloc_size);
- kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
-
- /*
- * Must first have the slab cache available for the allocations of the
- * struct kmem_cache_node's. There is special bootstrap code in
- * kmem_cache_open for slab_state == DOWN.
- */
- kmem_cache_node = (void *)kmem_cache + kmalloc_size;
+ kmem_cache_node = &boot_kmem_cache_node;
+ kmem_cache = &boot_kmem_cache;
- kmem_cache_node->name = "kmem_cache_node";
- kmem_cache_node->size = kmem_cache_node->object_size =
- sizeof(struct kmem_cache_node);
- kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ create_boot_cache(kmem_cache_node, "kmem_cache_node",
+ sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
- temp_kmem_cache = kmem_cache;
- kmem_cache->name = "kmem_cache";
- kmem_cache->size = kmem_cache->object_size = kmem_size;
- kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ create_boot_cache(kmem_cache, "kmem_cache",
+ offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *),
+ SLAB_HWCACHE_ALIGN);
- kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
- memcpy(kmem_cache, temp_kmem_cache, kmem_size);
+ kmem_cache = bootstrap(&boot_kmem_cache);
/*
* Allocate kmem_cache_node properly from the kmem_cache slab.
* kmem_cache_node is separately allocated so no need to
* update any list pointers.
*/
- temp_kmem_cache_node = kmem_cache_node;
-
- kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
- memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
-
- kmem_cache_bootstrap_fixup(kmem_cache_node);
-
- caches++;
- kmem_cache_bootstrap_fixup(kmem_cache);
- caches++;
- /* Free temporary boot structure */
- free_pages((unsigned long)temp_kmem_cache, order);
+ kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
return 0;
}
-static struct kmem_cache *find_mergeable(size_t size,
+static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
size_t align, unsigned long flags, const char *name,
void (*ctor)(void *))
{
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
if (s->size - size >= sizeof(void *))
continue;
+ if (!cache_match_memcg(s, memcg))
+ continue;
+
return s;
}
return NULL;
}
-struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+struct kmem_cache *
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- s = find_mergeable(size, align, flags, name, ctor);
+ s = find_mergeable(memcg, size, align, flags, name, ctor);
if (s) {
s->refcount++;
/*
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
if (err)
return err;
+ /* Mutex is not taken during early boot */
+ if (slab_state <= UP)
+ return 0;
+
+ memcg_propagate_slab_attrs(s);
mutex_unlock(&slab_mutex);
err = sysfs_slab_add(s);
mutex_lock(&slab_mutex);
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
+#ifdef CONFIG_MEMCG_KMEM
+ if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
+ int i;
+
+ mutex_lock(&slab_mutex);
+ if (s->max_attr_size < len)
+ s->max_attr_size = len;
+ /*
+ * This is a best effort propagation, so this function's return
+ * value will be determined by the parent cache only. This is
+ * basically because not all attributes will have a well
+ * defined semantics for rollbacks - most of the actions will
+ * have permanent effects.
+ *
+ * Returning the error value of any of the children that fail
+ * is not 100 % defined, in the sense that users seeing the
+ * error code won't be able to know anything about the state of
+ * the cache.
+ *
+ * Only returning the error code for the parent cache at least
+ * has well defined semantics. The cache being written to
+ * directly either failed or succeeded, in which case we loop
+ * through the descendants with best-effort propagation.
+ */
+ for_each_memcg_cache_index(i) {
+ struct kmem_cache *c = cache_from_memcg(s, i);
+ if (c)
+ attribute->store(c, buf, len);
+ }
+ mutex_unlock(&slab_mutex);
+ }
+#endif
return err;
}
+static void memcg_propagate_slab_attrs(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ int i;
+ char *buffer = NULL;
+
+ if (!is_root_cache(s))
+ return;
+
+ /*
+ * This mean this cache had no attribute written. Therefore, no point
+ * in copying default values around
+ */
+ if (!s->max_attr_size)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
+ char mbuf[64];
+ char *buf;
+ struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
+
+ if (!attr || !attr->store || !attr->show)
+ continue;
+
+ /*
+ * It is really bad that we have to allocate here, so we will
+ * do it only as a fallback. If we actually allocate, though,
+ * we can just use the allocated buffer until the end.
+ *
+ * Most of the slub attributes will tend to be very small in
+ * size, but sysfs allows buffers up to a page, so they can
+ * theoretically happen.
+ */
+ if (buffer)
+ buf = buffer;
+ else if (s->max_attr_size < ARRAY_SIZE(mbuf))
+ buf = mbuf;
+ else {
+ buffer = (char *) get_zeroed_page(GFP_KERNEL);
+ if (WARN_ON(!buffer))
+ continue;
+ buf = buffer;
+ }
+
+ attr->show(s->memcg_params->root_cache, buf);
+ attr->store(s, buf, strlen(buf));
+ }
+
+ if (buffer)
+ free_page((unsigned long)buffer);
+#endif
+}
+
static const struct sysfs_ops slab_sysfs_ops = {
.show = slab_attr_show,
.store = slab_attr_store,
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
if (p != name + 1)
*p++ = '-';
p += sprintf(p, "%07d", s->size);
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s))
+ p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
+#endif
+
BUG_ON(p > name + ID_STR_LENGTH - 1);
return name;
}
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
const char *name;
- int unmergeable;
-
- if (slab_state < FULL)
- /* Defer until later */
- return 0;
+ int unmergeable = slab_unmergeable(s);
- unmergeable = slab_unmergeable(s);
if (unmergeable) {
/*
* Slabcache can never be merged so we can use the name proper.
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init);
* The /proc/slabinfo ABI
*/
#ifdef CONFIG_SLABINFO
-static void print_slabinfo_header(struct seq_file *m)
-{
- seq_puts(m, "slabinfo - version: 2.1\n");
- seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
- "<objperslab> <pagesperslab>");
- seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
- seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
- seq_putc(m, '\n');
-}
-
-static void *s_start(struct seq_file *m, loff_t *pos)
-{
- loff_t n = *pos;
-
- mutex_lock(&slab_mutex);
- if (!n)
- print_slabinfo_header(m);
-
- return seq_list_start(&slab_caches, *pos);
-}
-
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &slab_caches, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&slab_mutex);
-}
-
-static int s_show(struct seq_file *m, void *p)
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
unsigned long nr_partials = 0;
unsigned long nr_slabs = 0;
- unsigned long nr_inuse = 0;
unsigned long nr_objs = 0;
unsigned long nr_free = 0;
- struct kmem_cache *s;
int node;
- s = list_entry(p, struct kmem_cache, list);
-
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p)
nr_free += count_partial(n, count_free);
}
- nr_inuse = nr_objs - nr_free;
-
- seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
- nr_objs, s->size, oo_objects(s->oo),
- (1 << oo_order(s->oo)));
- seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
- seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
- 0UL);
- seq_putc(m, '\n');
- return 0;
+ sinfo->active_objs = nr_objs - nr_free;
+ sinfo->num_objs = nr_objs;
+ sinfo->active_slabs = nr_slabs;
+ sinfo->num_slabs = nr_slabs;
+ sinfo->objects_per_slab = oo_objects(s->oo);
+ sinfo->cache_order = oo_order(s->oo);
}
-static const struct seq_operations slabinfo_op = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
-static int slabinfo_open(struct inode *inode, struct file *file)
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{
- return seq_open(file, &slabinfo_op);
}
-static const struct file_operations proc_slabinfo_operations = {
- .open = slabinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init slab_proc_init(void)
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
{
- proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
- return 0;
+ return -EIO;
}
-module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */
diff --git a/mm/truncate.c b/mm/truncate.c
index d51ce92d6e8..c75b736e54b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -577,29 +577,6 @@ void truncate_setsize(struct inode *inode, loff_t newsize)
EXPORT_SYMBOL(truncate_setsize);
/**
- * vmtruncate - unmap mappings "freed" by truncate() syscall
- * @inode: inode of the file used
- * @newsize: file offset to start truncating
- *
- * This function is deprecated and truncate_setsize or truncate_pagecache
- * should be used instead, together with filesystem specific block truncation.
- */
-int vmtruncate(struct inode *inode, loff_t newsize)
-{
- int error;
-
- error = inode_newsize_ok(inode, newsize);
- if (error)
- return error;
-
- truncate_setsize(inode, newsize);
- if (inode->i_op->truncate)
- inode->i_op->truncate(inode);
- return 0;
-}
-EXPORT_SYMBOL(vmtruncate);
-
-/**
* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
* @inode: inode
* @lstart: offset of beginning of hole
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7f3096137b8..16b42af393a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
}
/*
- * Are there way too many processes in the direct reclaim path already?
+ * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
+ * then get resheduled. When there are massive number of tasks doing page
+ * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
+ * the LRU list will go small and be scanned faster than necessary, leading to
+ * unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
struct scan_control *sc)
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
isolated = zone_page_state(zone, NR_ISOLATED_ANON);
}
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+ * won't get blocked by normal direct-reclaimers, forming a circular
+ * deadlock.
+ */
+ if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ inactive >>= 3;
+
return isolated > inactive;
}
@@ -2440,12 +2452,16 @@ static bool zone_balanced(struct zone *zone, int order,
}
/*
- * pgdat_balanced is used when checking if a node is balanced for high-order
- * allocations. Only zones that meet watermarks and are in a zone allowed
- * by the callers classzone_idx are added to balanced_pages. The total of
- * balanced pages must be at least 25% of the zones allowed by classzone_idx
- * for the node to be considered balanced. Forcing all zones to be balanced
- * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * pgdat_balanced() is used when checking if a node is balanced.
+ *
+ * For order-0, all zones must be balanced!
+ *
+ * For high-order allocations only zones that meet watermarks and are in a
+ * zone allowed by the callers classzone_idx are added to balanced_pages. The
+ * total of balanced pages must be at least 25% of the zones allowed by
+ * classzone_idx for the node to be considered balanced. Forcing all zones to
+ * be balanced for high orders can cause excessive reclaim when there are
+ * imbalanced zones.
* The choice of 25% is due to
* o a 16M DMA zone that is balanced will not balance a zone on any
* reasonable sized machine
@@ -2455,17 +2471,43 @@ static bool zone_balanced(struct zone *zone, int order,
* Similarly, on x86-64 the Normal zone would need to be at least 1G
* to balance a node on its own. These seemed like reasonable ratios.
*/
-static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
- int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
unsigned long present_pages = 0;
+ unsigned long balanced_pages = 0;
int i;
- for (i = 0; i <= classzone_idx; i++)
- present_pages += pgdat->node_zones[i].present_pages;
+ /* Check the watermark levels */
+ for (i = 0; i <= classzone_idx; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
- /* A special case here: if zone has no page, we think it's balanced */
- return balanced_pages >= (present_pages >> 2);
+ present_pages += zone->present_pages;
+
+ /*
+ * A special case here:
+ *
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well!
+ */
+ if (zone->all_unreclaimable) {
+ balanced_pages += zone->present_pages;
+ continue;
+ }
+
+ if (zone_balanced(zone, order, 0, i))
+ balanced_pages += zone->present_pages;
+ else if (!order)
+ return false;
+ }
+
+ if (order)
+ return balanced_pages >= (present_pages >> 2);
+ else
+ return true;
}
/*
@@ -2477,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
int classzone_idx)
{
- int i;
- unsigned long balanced = 0;
- bool all_zones_ok = true;
-
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
return false;
@@ -2499,39 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
return false;
}
- /* Check the watermark levels */
- for (i = 0; i <= classzone_idx; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- /*
- * balance_pgdat() skips over all_unreclaimable after
- * DEF_PRIORITY. Effectively, it considers them balanced so
- * they must be considered balanced here as well if kswapd
- * is to sleep
- */
- if (zone->all_unreclaimable) {
- balanced += zone->present_pages;
- continue;
- }
-
- if (!zone_balanced(zone, order, 0, i))
- all_zones_ok = false;
- else
- balanced += zone->present_pages;
- }
-
- /*
- * For high-order requests, the balanced zones must contain at least
- * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
- * must be balanced
- */
- if (order)
- return pgdat_balanced(pgdat, balanced, classzone_idx);
- else
- return all_zones_ok;
+ return pgdat_balanced(pgdat, order, classzone_idx);
}
/*
@@ -2558,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
- int all_zones_ok;
- unsigned long balanced;
+ struct zone *unbalanced_zone;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
@@ -2592,8 +2597,7 @@ loop_again:
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
- all_zones_ok = 1;
- balanced = 0;
+ unbalanced_zone = NULL;
/*
* Scan in the highmem->dma direction for the highest
@@ -2731,7 +2735,7 @@ loop_again:
}
if (!zone_balanced(zone, testorder, 0, end_zone)) {
- all_zones_ok = 0;
+ unbalanced_zone = zone;
/*
* We are still under min water mark. This
* means that we have a GFP_ATOMIC allocation
@@ -2749,8 +2753,6 @@ loop_again:
* speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
- if (i <= *classzone_idx)
- balanced += zone->present_pages;
}
}
@@ -2764,7 +2766,7 @@ loop_again:
pfmemalloc_watermark_ok(pgdat))
wake_up(&pgdat->pfmemalloc_wait);
- if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
+ if (pgdat_balanced(pgdat, order, *classzone_idx))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
@@ -2773,8 +2775,8 @@ loop_again:
if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
if (has_under_min_watermark_zone)
count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
- else
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ else if (unbalanced_zone)
+ wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
}
/*
@@ -2788,12 +2790,7 @@ loop_again:
} while (--sc.priority >= 0);
out:
- /*
- * order-0: All zones must meet high watermark for a balanced node
- * high-order: Balanced zones must make up at least 25% of the node
- * for the node to be balanced
- */
- if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
+ if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
cond_resched();
try_to_freeze();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df14808f0a3..9800306c819 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
"pgrotated",
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_pte_updates",
+ "numa_hint_faults",
+ "numa_hint_faults_local",
+ "numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+ "pgmigrate_success",
+ "pgmigrate_fail",
+#endif
#ifdef CONFIG_COMPACTION
- "compact_blocks_moved",
- "compact_pages_moved",
- "compact_pagemigrate_failed",
+ "compact_migrate_scanned",
+ "compact_free_scanned",
+ "compact_isolated",
"compact_stall",
"compact_fail",
"compact_success",