summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2011-06-20 22:29:08 -0700
committerDavid S. Miller <davem@davemloft.net>2011-06-20 22:29:08 -0700
commit9f6ec8d697c08963d83880ccd35c13c5ace716ea (patch)
treead8d93cf6fcdd09b86ade09f5fcbbc66cdb1cca2 /mm
parent4aa3a715551c93eda32d79bd52042ce500bd5383 (diff)
parent56299378726d5f2ba8d3c8cbbd13cb280ba45e4f (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
Conflicts: drivers/net/wireless/iwlwifi/iwl-agn-rxon.c drivers/net/wireless/rtlwifi/pci.c net/netfilter/ipvs/ip_vs_core.c
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c76
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/huge_memory.c5
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/memcontrol.c81
-rw-r--r--mm/memory-failure.c4
-rw-r--r--mm/memory.c8
-rw-r--r--mm/memory_hotplug.c6
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mmap.c12
-rw-r--r--mm/page_cgroup.c71
-rw-r--r--mm/rmap.c106
-rw-r--r--mm/slab.c9
-rw-r--r--mm/slub.c12
-rw-r--r--mm/thrash.c105
-rw-r--r--mm/vmscan.c20
17 files changed, 383 insertions, 154 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9..6cc604bd564 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone,
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
+ /*
+ * Initialise the free scanner. The starting point is where we last
+ * scanned from (or the end of the zone if starting). The low point
+ * is the end of the pageblock the migration scanner is using.
+ */
pfn = cc->free_pfn;
low_pfn = cc->migrate_pfn + pageblock_nr_pages;
- high_pfn = low_pfn;
+
+ /*
+ * Take care that if the migration scanner is at the end of the zone
+ * that the free scanner does not accidentally move to the next zone
+ * in the next isolation cycle.
+ */
+ high_pfn = min(low_pfn, pfn);
/*
* Isolate free pages until enough are available to migrate the
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone)
return isolated > (inactive + active) / 2;
}
+/* possible outcome of isolate_migratepages */
+typedef enum {
+ ISOLATE_ABORT, /* Abort compaction now */
+ ISOLATE_NONE, /* No pages isolated, continue scanning */
+ ISOLATE_SUCCESS, /* Pages isolated, migrate */
+} isolate_migrate_t;
+
/*
* Isolate all pages that can be migrated from the block pointed to by
* the migrate scanner within compact_control.
*/
-static unsigned long isolate_migratepages(struct zone *zone,
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
/* Do not cross the free scanner or scan within a memory hole */
if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
cc->migrate_pfn = end_pfn;
- return 0;
+ return ISOLATE_NONE;
}
/*
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone,
* delay for some time until fewer pages are isolated
*/
while (unlikely(too_many_isolated(zone))) {
+ /* async migration should just abort */
+ if (!cc->sync)
+ return ISOLATE_ABORT;
+
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
- return 0;
+ return ISOLATE_ABORT;
}
/* Time to isolate some pages for migration */
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
- return cc->nr_migratepages;
+ return ISOLATE_SUCCESS;
}
/*
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone,
if (cc->free_pfn <= cc->migrate_pfn)
return COMPACT_COMPLETE;
- /* Compaction run is not finished if the watermark is not met */
- watermark = low_wmark_pages(zone);
- watermark += (1 << cc->order);
-
- if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
- return COMPACT_CONTINUE;
-
/*
* order == -1 is expected when compacting via
* /proc/sys/vm/compact_memory
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone,
if (cc->order == -1)
return COMPACT_CONTINUE;
+ /* Compaction run is not finished if the watermark is not met */
+ watermark = low_wmark_pages(zone);
+ watermark += (1 << cc->order);
+
+ if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+ return COMPACT_CONTINUE;
+
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
/* Job done if page is free of the right migratetype */
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
unsigned long watermark;
/*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+ if (order == -1)
+ return COMPACT_CONTINUE;
+
+ /*
* Watermarks for order-0 must be met for compaction. Note the 2UL.
* This is because during migration, copies of pages need to be
* allocated and for a short time, the footprint is higher
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
return COMPACT_SKIPPED;
/*
- * order == -1 is expected when compacting via
- * /proc/sys/vm/compact_memory
- */
- if (order == -1)
- return COMPACT_CONTINUE;
-
- /*
* fragmentation index determines if allocation failures are due to
* low memory or external fragmentation
*
- * index of -1 implies allocations might succeed dependingon watermarks
+ * index of -1000 implies allocations might succeed depending on
+ * watermarks
* index towards 0 implies failure is due to lack of memory
* index towards 1000 implies failure is due to fragmentation
*
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
return COMPACT_SKIPPED;
- if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+ if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+ 0, 0))
return COMPACT_PARTIAL;
return COMPACT_CONTINUE;
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
unsigned long nr_migrate, nr_remaining;
int err;
- if (!isolate_migratepages(zone, cc))
+ switch (isolate_migratepages(zone, cc)) {
+ case ISOLATE_ABORT:
+ ret = COMPACT_PARTIAL;
+ goto out;
+ case ISOLATE_NONE:
continue;
+ case ISOLATE_SUCCESS:
+ ;
+ }
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
+out:
/* Release free pages and check accounting */
cc->nr_freepages -= release_freepages(&cc->freepages);
VM_BUG_ON(cc->nr_freepages != 0);
diff --git a/mm/filemap.c b/mm/filemap.c
index d7b10578a64..a8251a8d345 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2000,7 +2000,7 @@ int file_remove_suid(struct file *file)
error = security_inode_killpriv(dentry);
if (!error && killsuid)
error = __remove_suid(dentry, killsuid);
- if (!error)
+ if (!error && (inode->i_sb->s_flags & MS_NOSEC))
inode->i_flags |= S_NOSEC;
return error;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3c..81532f297fd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2234,11 +2234,8 @@ static void khugepaged_loop(void)
while (likely(khugepaged_enabled())) {
#ifndef CONFIG_NUMA
hpage = khugepaged_alloc_hugepage();
- if (unlikely(!hpage)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (unlikely(!hpage))
break;
- }
- count_vm_event(THP_COLLAPSE_ALLOC);
#else
if (IS_ERR(hpage)) {
khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f33bb319b73..bfcf153bc82 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1033,10 +1033,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(chg);
+ return ERR_PTR(-VM_FAULT_OOM);
if (chg)
if (hugetlb_get_quota(inode->i_mapping, chg))
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-VM_FAULT_SIGBUS);
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -1111,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
prep_new_huge_page(h, page, page_to_nid(page));
+ /*
+ * If we had gigantic hugepages allocated at boot time, we need
+ * to restore the 'stolen' pages to totalram_pages in order to
+ * fix confusing memory reports from free(1) and another
+ * side-effects, like CommitLimit going negative.
+ */
+ if (h->order > (MAX_ORDER - 1))
+ totalram_pages += 1 << h->order;
}
}
diff --git a/mm/ksm.c b/mm/ksm.c
index d708b3ef226..9a68b0cf0a1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
ksm_scan.mm_slot = slot;
spin_unlock(&ksm_mmlist_lock);
+ /*
+ * Although we tested list_empty() above, a racing __ksm_exit
+ * of the last mm on the list may have removed it since then.
+ */
+ if (slot == &ksm_mm_head)
+ return NULL;
next_mm:
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3a..cf7d027a884 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -359,7 +359,7 @@ enum charge_type {
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -735,7 +735,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
struct mem_cgroup, css);
}
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *mem = NULL;
@@ -1663,15 +1663,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
/* If memsw_is_minimum==1, swap-out is of-no-use. */
- if (root_mem->memsw_is_minimum)
+ if (!check_soft && root_mem->memsw_is_minimum)
noswap = true;
while (1) {
victim = mem_cgroup_select_victim(root_mem);
if (victim == root_mem) {
loop++;
- if (loop >= 1)
- drain_all_stock_async();
+ /*
+ * We are not draining per cpu cached charges during
+ * soft limit reclaim because global reclaim doesn't
+ * care about charges. It tries to free some memory and
+ * charges will not give any.
+ */
+ if (!check_soft && loop >= 1)
+ drain_all_stock_async(root_mem);
if (loop >= 2) {
/*
* If we have not been able to reclaim
@@ -1934,9 +1940,11 @@ struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
struct work_struct work;
+ unsigned long flags;
+#define FLUSHING_CACHED_CHARGE (0)
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
/*
* Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +1992,7 @@ static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
drain_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
/*
@@ -2008,26 +2017,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
* expects some charges will be back to res_counter later but cannot wait for
* it.
*/
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
{
- int cpu;
- /* This function is for scheduling "drain" in asynchronous way.
- * The result of "drain" is not directly handled by callers. Then,
- * if someone is calling drain, we don't have to call drain more.
- * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
- * there is a race. We just do loose check here.
+ int cpu, curcpu;
+ /*
+ * If someone calls draining, avoid adding more kworker runs.
*/
- if (atomic_read(&memcg_drain_count))
+ if (!mutex_trylock(&percpu_charge_mutex))
return;
/* Notify other cpus that system-wide "drain" is running */
- atomic_inc(&memcg_drain_count);
get_online_cpus();
+ /*
+ * Get a hint for avoiding draining charges on the current cpu,
+ * which must be exhausted by our charging. It is not required that
+ * this be a precise check, so we use raw_smp_processor_id() instead of
+ * getcpu()/putcpu().
+ */
+ curcpu = raw_smp_processor_id();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- schedule_work_on(cpu, &stock->work);
+ struct mem_cgroup *mem;
+
+ if (cpu == curcpu)
+ continue;
+
+ mem = stock->cached;
+ if (!mem)
+ continue;
+ if (mem != root_mem) {
+ if (!root_mem->use_hierarchy)
+ continue;
+ /* check whether "mem" is under tree of "root_mem" */
+ if (!css_is_ancestor(&mem->css, &root_mem->css))
+ continue;
+ }
+ if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+ schedule_work_on(cpu, &stock->work);
}
put_online_cpus();
- atomic_dec(&memcg_drain_count);
+ mutex_unlock(&percpu_charge_mutex);
/* We don't wait for flush_work */
}
@@ -2035,9 +2063,9 @@ static void drain_all_stock_async(void)
static void drain_all_stock_sync(void)
{
/* called when force_empty is called */
- atomic_inc(&memcg_drain_count);
+ mutex_lock(&percpu_charge_mutex);
schedule_on_each_cpu(drain_local_stock);
- atomic_dec(&memcg_drain_count);
+ mutex_unlock(&percpu_charge_mutex);
}
/*
@@ -4640,6 +4668,7 @@ static struct cftype mem_cgroup_files[] = {
{
.name = "numa_stat",
.open = mem_control_numa_stat_open,
+ .mode = S_IRUGO,
},
#endif
};
@@ -5414,18 +5443,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *old_cont,
struct task_struct *p)
{
- struct mm_struct *mm;
+ struct mm_struct *mm = get_task_mm(p);
- if (!mc.to)
- /* no need to move charge */
- return;
-
- mm = get_task_mm(p);
if (mm) {
- mem_cgroup_move_charge(mm);
+ if (mc.to)
+ mem_cgroup_move_charge(mm);
+ put_swap_token(mm);
mmput(mm);
}
- mem_cgroup_clear_mc();
+ if (mc.to)
+ mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928..eac0ba56149 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1468,7 +1469,8 @@ int soft_offline_page(struct page *page, int flags)
put_page(page);
if (!ret) {
LIST_HEAD(pagelist);
-
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
0, true);
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e0..87d935333f0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1112,11 +1112,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
int force_flush = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
+ pte_t *start_pte;
pte_t *pte;
again:
init_rss_vec(rss);
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = start_pte;
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
@@ -1196,7 +1198,7 @@ again:
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap_unlock(start_pte, ptl);
/*
* mmu_gather ran out of room to batch pages, we break out of
@@ -1296,7 +1298,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
+ * @tlb: address of the caller's struct mmu_gather
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9f646374e32..02159c75513 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -494,6 +494,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+ /*
+ * The node we allocated has no zone fallback lists. For avoiding
+ * to access not-initialized zonelist, build here.
+ */
+ build_all_zonelists(NULL);
+
return pgdat;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index e4a5c912983..666e4e67741 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
*/
__dec_zone_page_state(page, NR_FILE_PAGES);
__inc_zone_page_state(newpage, NR_FILE_PAGES);
- if (PageSwapBacked(page)) {
+ if (!PageSwapCache(page) && PageSwapBacked(page)) {
__dec_zone_page_state(page, NR_SHMEM);
__inc_zone_page_state(newpage, NR_SHMEM);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index bbdc9af5e11..d49736ff8a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
if (anon_vma)
return anon_vma;
try_prev:
- /*
- * It is potentially slow to have to call find_vma_prev here.
- * But it's only on the first write fault on the vma, not
- * every time, and we could devise a way to avoid it later
- * (e.g. stash info in next's anon_vma_node when assigning
- * an anon_vma, or when trying vma_merge). Another time.
- */
- BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+ near = vma->vm_prev;
if (!near)
goto none;
@@ -2044,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return -EINVAL;
/* Find the first overlapping VMA */
- vma = find_vma_prev(mm, start, &prev);
+ vma = find_vma(mm, start);
if (!vma)
return 0;
+ prev = vma->vm_prev;
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1b..53bffc6c293 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
}
#endif
-static int __meminit init_section_page_cgroup(unsigned long pfn)
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
struct page_cgroup *base, *pc;
struct mem_section *section;
unsigned long table_size;
unsigned long nr;
- int nid, index;
+ int index;
nr = pfn_to_section_nr(pfn);
section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
if (section->page_cgroup)
return 0;
- nid = page_to_nid(pfn_to_page(pfn));
table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
base = alloc_page_cgroup(table_size, nid);
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
pc = base + index;
init_page_cgroup(pc, nr);
}
-
+ /*
+ * The passed "pfn" may not be aligned to SECTION. For the calculation
+ * we need to apply a mask.
+ */
+ pfn &= PAGE_SECTION_MASK;
section->page_cgroup = base - pfn;
total_usage += table_size;
return 0;
@@ -225,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
start = start_pfn & ~(PAGES_PER_SECTION - 1);
end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+ if (nid == -1) {
+ /*
+ * In this case, "nid" already exists and contains valid memory.
+ * "start_pfn" passed to us is a pfn which is an arg for
+ * online__pages(), and start_pfn should exist.
+ */
+ nid = pfn_to_nid(start_pfn);
+ VM_BUG_ON(!node_state(nid, N_ONLINE));
+ }
+
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
if (!pfn_present(pfn))
continue;
- fail = init_section_page_cgroup(pfn);
+ fail = init_section_page_cgroup(pfn, nid);
}
if (!fail)
return 0;
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
void __init page_cgroup_init(void)
{
unsigned long pfn;
- int fail = 0;
+ int nid;
if (mem_cgroup_disabled())
return;
- for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
- if (!pfn_present(pfn))
- continue;
- fail = init_section_page_cgroup(pfn);
- }
- if (fail) {
- printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
- panic("Out of memory");
- } else {
- hotplug_memory_notifier(page_cgroup_callback, 0);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = node_start_pfn(nid);
+ end_pfn = node_end_pfn(nid);
+ /*
+ * start_pfn and end_pfn may not be aligned to SECTION and the
+ * page->flags of out of node pages are not initialized. So we
+ * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+ */
+ for (pfn = start_pfn;
+ pfn < end_pfn;
+ pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+ if (!pfn_valid(pfn))
+ continue;
+ /*
+ * Nodes's pfns can be overlapping.
+ * We know some arch can have a nodes layout such as
+ * -------------pfn-------------->
+ * N0 | N1 | N2 | N0 | N1 | N2|....
+ */
+ if (pfn_to_nid(pfn) != nid)
+ continue;
+ if (init_section_page_cgroup(pfn, nid))
+ goto oom;
+ }
}
+ hotplug_memory_notifier(page_cgroup_callback, 0);
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
- " want memory cgroups\n");
+ printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+ "don't want memory cgroups\n");
+ return;
+oom:
+ printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+ panic("Out of memory");
}
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
diff --git a/mm/rmap.c b/mm/rmap.c
index 0eb463ea88d..27dfd3b82b0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -112,9 +112,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
kmem_cache_free(anon_vma_cachep, anon_vma);
}
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
- return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +159,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated;
- avc = anon_vma_chain_alloc();
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_enomem;
@@ -200,6 +200,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
return -ENOMEM;
}
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+ struct anon_vma *new_root = anon_vma->root;
+ if (new_root != root) {
+ if (WARN_ON_ONCE(root))
+ mutex_unlock(&root->mutex);
+ root = new_root;
+ mutex_lock(&root->mutex);
+ }
+ return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+ if (root)
+ mutex_unlock(&root->mutex);
+}
+
static void anon_vma_chain_link(struct vm_area_struct *vma,
struct anon_vma_chain *avc,
struct anon_vma *anon_vma)
@@ -208,13 +234,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_lock(anon_vma);
/*
* It's critical to add new vmas to the tail of the anon_vma,
* see comment in huge_memory.c:__split_huge_page().
*/
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
- anon_vma_unlock(anon_vma);
}
/*
@@ -224,13 +248,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
+ struct anon_vma *root = NULL;
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- avc = anon_vma_chain_alloc();
- if (!avc)
- goto enomem_failure;
- anon_vma_chain_link(dst, avc, pavc->anon_vma);
+ struct anon_vma *anon_vma;
+
+ avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ if (unlikely(!avc)) {
+ unlock_anon_vma_root(root);
+ root = NULL;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+ }
+ anon_vma = pavc->anon_vma;
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_chain_link(dst, avc, anon_vma);
}
+ unlock_anon_vma_root(root);
return 0;
enomem_failure:
@@ -263,7 +298,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
- avc = anon_vma_chain_alloc();
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
@@ -280,7 +315,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_lock(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_unlock(anon_vma);
return 0;
@@ -291,36 +328,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
return -ENOMEM;
}
-static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
-{
- struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
- int empty;
-
- /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
- if (!anon_vma)
- return;
-
- anon_vma_lock(anon_vma);
- list_del(&anon_vma_chain->same_anon_vma);
-
- /* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head);
- anon_vma_unlock(anon_vma);
-
- if (empty)
- put_anon_vma(anon_vma);
-}
-
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
+ struct anon_vma *root = NULL;
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
* from newest to oldest, ensuring the root anon_vma gets freed last.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
- anon_vma_unlink(avc);
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ root = lock_anon_vma_root(root, anon_vma);
+ list_del(&avc->same_anon_vma);
+
+ /*
+ * Leave empty anon_vmas on the list - we'll need
+ * to free them outside the lock.
+ */
+ if (list_empty(&anon_vma->head))
+ continue;
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+ unlock_anon_vma_root(root);
+
+ /*
+ * Iterate the list once more, it now only contains empty and unlinked
+ * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+ * needing to acquire the anon_vma->root->mutex.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ put_anon_vma(anon_vma);
+
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
diff --git a/mm/slab.c b/mm/slab.c
index bcfa4987c8a..d96e223de77 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3604,13 +3604,14 @@ free_done:
* Release an obj back to its cache. If the obj has a constructed state, it must
* be in this state _before_ it is released. Called with disabled ints.
*/
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp,
+ void *caller)
{
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
- objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+ objp = cache_free_debugcheck(cachep, objp, caller);
kmemcheck_slab_free(cachep, objp, obj_size(cachep));
@@ -3801,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
debug_check_no_locks_freed(objp, obj_size(cachep));
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, obj_size(cachep));
- __cache_free(cachep, objp);
+ __cache_free(cachep, objp, __builtin_return_address(0));
local_irq_restore(flags);
trace_kmem_cache_free(_RET_IP_, objp);
@@ -3831,7 +3832,7 @@ void kfree(const void *objp)
c = virt_to_cache(objp);
debug_check_no_locks_freed(objp, obj_size(c));
debug_check_no_obj_freed(objp, obj_size(c));
- __cache_free(c, (void *)objp);
+ __cache_free(c, (void *)objp, __builtin_return_address(0));
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
diff --git a/mm/slub.c b/mm/slub.c
index 7be0223531b..35f351f2619 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
-#ifdef CONFIG_CMPXCHG_LOCAL
/*
- * Must align to double word boundary for the double cmpxchg instructions
- * to work.
+ * Must align to double word boundary for the double cmpxchg
+ * instructions to work; see __pcpu_double_call_return_bool().
*/
- s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
-#else
- /* Regular alignment is sufficient */
- s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
-#endif
+ s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
+ 2 * sizeof(void *));
if (!s->cpu_slab)
return 0;
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd..fabf2d0f516 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,14 +21,40 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
+#include <linux/memcontrol.h>
+
+#include <trace/events/vmscan.h>
+
+#define TOKEN_AGING_INTERVAL (0xFF)
static DEFINE_SPINLOCK(swap_token_lock);
struct mm_struct *swap_token_mm;
+struct mem_cgroup *swap_token_memcg;
static unsigned int global_faults;
+static unsigned int last_aging;
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = try_get_mem_cgroup_from_mm(mm);
+ if (memcg)
+ css_put(mem_cgroup_css(memcg));
+
+ return memcg;
+}
+#else
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+ return NULL;
+}
+#endif
void grab_swap_token(struct mm_struct *mm)
{
int current_interval;
+ unsigned int old_prio = mm->token_priority;
global_faults++;
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm)
return;
/* First come first served */
- if (swap_token_mm == NULL) {
- mm->token_priority = mm->token_priority + 2;
- swap_token_mm = mm;
- goto out;
+ if (!swap_token_mm)
+ goto replace_token;
+
+ if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
+ swap_token_mm->token_priority /= 2;
+ last_aging = global_faults;
}
- if (mm != swap_token_mm) {
- if (current_interval < mm->last_interval)
- mm->token_priority++;
- else {
- if (likely(mm->token_priority > 0))
- mm->token_priority--;
- }
- /* Check if we deserve the token */
- if (mm->token_priority > swap_token_mm->token_priority) {
- mm->token_priority += 2;
- swap_token_mm = mm;
- }
- } else {
- /* Token holder came in again! */
+ if (mm == swap_token_mm) {
mm->token_priority += 2;
+ goto update_priority;
+ }
+
+ if (current_interval < mm->last_interval)
+ mm->token_priority++;
+ else {
+ if (likely(mm->token_priority > 0))
+ mm->token_priority--;
}
+ /* Check if we deserve the token */
+ if (mm->token_priority > swap_token_mm->token_priority)
+ goto replace_token;
+
+update_priority:
+ trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
+
out:
mm->faultstamp = global_faults;
mm->last_interval = current_interval;
spin_unlock(&swap_token_lock);
+ return;
+
+replace_token:
+ mm->token_priority += 2;
+ trace_replace_swap_token(swap_token_mm, mm);
+ swap_token_mm = mm;
+ swap_token_memcg = swap_token_memcg_from_mm(mm);
+ last_aging = global_faults;
+ goto out;
}
/* Called on process exit. */
void __put_swap_token(struct mm_struct *mm)
{
spin_lock(&swap_token_lock);
- if (likely(mm == swap_token_mm))
+ if (likely(mm == swap_token_mm)) {
+ trace_put_swap_token(swap_token_mm);
swap_token_mm = NULL;
+ swap_token_memcg = NULL;
+ }
spin_unlock(&swap_token_lock);
}
+
+static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
+{
+ if (!a)
+ return true;
+ if (!b)
+ return true;
+ if (a == b)
+ return true;
+ return false;
+}
+
+void disable_swap_token(struct mem_cgroup *memcg)
+{
+ /* memcg reclaim don't disable unrelated mm token. */
+ if (match_memcg(memcg, swap_token_memcg)) {
+ spin_lock(&swap_token_lock);
+ if (match_memcg(memcg, swap_token_memcg)) {
+ trace_disable_swap_token(swap_token_mm);
+ swap_token_mm = NULL;
+ swap_token_memcg = NULL;
+ }
+ spin_unlock(&swap_token_lock);
+ }
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9c..8ff834e19c2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1124,8 +1124,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
nr_lumpy_dirty++;
scan++;
} else {
- /* the page is freed already. */
- if (!page_count(cursor_page))
+ /*
+ * Check if the page is freed already.
+ *
+ * We can't use page_count() as that
+ * requires compound_head and we don't
+ * have a pin on the page here. If a
+ * page is tail, we may or may not
+ * have isolated the head, so assume
+ * it's not free, it'd be tricky to
+ * track the head status without a
+ * page pin.
+ */
+ if (!PageTail(cursor_page) &&
+ !atomic_read(&cursor_page->_count))
continue;
break;
}
@@ -2081,7 +2093,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
if (!priority)
- disable_swap_token();
+ disable_swap_token(sc->mem_cgroup);
total_scanned += shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
@@ -2407,7 +2419,7 @@ loop_again:
/* The swap token gets in the way of swapout... */
if (!priority)
- disable_swap_token();
+ disable_swap_token(NULL);
all_zones_ok = 1;
balanced = 0;