summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorHerbert Xu <herbert@gondor.apana.org.au>2014-11-12 22:11:15 +0800
committerHerbert Xu <herbert@gondor.apana.org.au>2014-11-12 22:11:15 +0800
commit4c7912e9198b141b6cd85999d28b48cb5a037aa0 (patch)
tree901b2bc822976f6092fd06c709876e96d78bd50d /mm
parentbdcf83b7831e41f666b50ee6cd946c733bbea1ef (diff)
parent206c5f60a3d902bc4b56dab2de3e88de5eb06108 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux
Merging 3.18-rc4 in order to pick up the memzero_explicit helper.
Diffstat (limited to 'mm')
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/cma.c68
-rw-r--r--mm/compaction.c3
-rw-r--r--mm/huge_memory.c15
-rw-r--r--mm/memcontrol.c105
-rw-r--r--mm/memory.c1
-rw-r--r--mm/memory_hotplug.c5
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/oom_kill.c17
-rw-r--r--mm/page-writeback.c43
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/page_cgroup.c1
-rw-r--r--mm/rmap.c88
-rw-r--r--mm/shmem.c36
-rw-r--r--mm/slab_common.c10
-rw-r--r--mm/truncate.c61
16 files changed, 305 insertions, 166 deletions
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index b3cbe19f71b..fcad8322ef3 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -68,11 +68,13 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
* to be released by the balloon driver.
*/
if (trylock_page(page)) {
+#ifdef CONFIG_BALLOON_COMPACTION
if (!PagePrivate(page)) {
/* raced with isolation */
unlock_page(page);
continue;
}
+#endif
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
balloon_page_delete(page);
__count_vm_event(BALLOON_DEFLATE);
diff --git a/mm/cma.c b/mm/cma.c
index 963bc4add9a..fde706e1284 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -124,6 +124,7 @@ static int __init cma_activate_area(struct cma *cma)
err:
kfree(cma->bitmap);
+ cma->count = 0;
return -EINVAL;
}
@@ -217,9 +218,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
phys_addr_t highmem_start = __pa(high_memory);
int ret = 0;
- pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
- __func__, (unsigned long)size, (unsigned long)base,
- (unsigned long)limit, (unsigned long)alignment);
+ pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
+ __func__, &size, &base, &limit, &alignment);
if (cma_area_count == ARRAY_SIZE(cma_areas)) {
pr_err("Not enough slots for CMA reserved regions!\n");
@@ -244,52 +244,72 @@ int __init cma_declare_contiguous(phys_addr_t base,
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
+ if (!base)
+ fixed = false;
+
/* size should be aligned with order_per_bit */
if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
return -EINVAL;
/*
- * adjust limit to avoid crossing low/high memory boundary for
- * automatically allocated regions
+ * If allocating at a fixed base the request region must not cross the
+ * low/high memory boundary.
*/
- if (((limit == 0 || limit > memblock_end) &&
- (memblock_end - size < highmem_start &&
- memblock_end > highmem_start)) ||
- (!fixed && limit > highmem_start && limit - size < highmem_start)) {
- limit = highmem_start;
- }
-
- if (fixed && base < highmem_start && base+size > highmem_start) {
+ if (fixed && base < highmem_start && base + size > highmem_start) {
ret = -EINVAL;
- pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n",
- (unsigned long)base, (unsigned long)highmem_start);
+ pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
+ &base, &highmem_start);
goto err;
}
+ /*
+ * If the limit is unspecified or above the memblock end, its effective
+ * value will be the memblock end. Set it explicitly to simplify further
+ * checks.
+ */
+ if (limit == 0 || limit > memblock_end)
+ limit = memblock_end;
+
/* Reserve memory */
- if (base && fixed) {
+ if (fixed) {
if (memblock_is_region_reserved(base, size) ||
memblock_reserve(base, size) < 0) {
ret = -EBUSY;
goto err;
}
} else {
- phys_addr_t addr = memblock_alloc_range(size, alignment, base,
- limit);
+ phys_addr_t addr = 0;
+
+ /*
+ * All pages in the reserved area must come from the same zone.
+ * If the requested region crosses the low/high memory boundary,
+ * try allocating from high memory first and fall back to low
+ * memory in case of failure.
+ */
+ if (base < highmem_start && limit > highmem_start) {
+ addr = memblock_alloc_range(size, alignment,
+ highmem_start, limit);
+ limit = highmem_start;
+ }
+
if (!addr) {
- ret = -ENOMEM;
- goto err;
- } else {
- base = addr;
+ addr = memblock_alloc_range(size, alignment, base,
+ limit);
+ if (!addr) {
+ ret = -ENOMEM;
+ goto err;
+ }
}
+
+ base = addr;
}
ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma);
if (ret)
goto err;
- pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M,
- (unsigned long)base);
+ pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
+ &base);
return 0;
err:
diff --git a/mm/compaction.c b/mm/compaction.c
index edba18aed17..ec74cf0123e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -784,6 +784,9 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
cc->nr_migratepages = 0;
break;
}
+
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ break;
}
acct_isolated(cc->zone, cc);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 74c78aa8bc2..de984159cf0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -200,7 +200,7 @@ retry:
preempt_disable();
if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
preempt_enable();
- __free_page(zero_page);
+ __free_pages(zero_page, compound_order(zero_page));
goto retry;
}
@@ -232,7 +232,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
- __free_page(zero_page);
+ __free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
@@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- if (unlikely(khugepaged_enter(vma)))
+ if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE) &&
transparent_hugepage_use_zero_page()) {
@@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
* register it here without waiting a page fault that
* may not happen any time soon.
*/
- if (unlikely(khugepaged_enter_vma_merge(vma)))
+ if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
return -ENOMEM;
break;
case MADV_NOHUGEPAGE:
@@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm)
return 0;
}
-int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
unsigned long hstart, hend;
if (!vma->anon_vma)
@@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
+ VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
- return khugepaged_enter(vma);
+ return khugepaged_enter(vma, vm_flags);
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 23976fd885f..d6ac0e33e15 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
* start move here.
*/
-/* for quick checking without looking up memcg */
-atomic_t memcg_moving __read_mostly;
-
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
- atomic_inc(&memcg_moving);
atomic_inc(&memcg->moving_account);
synchronize_rcu();
}
@@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
* Now, mem_cgroup_clear_mc() may call this function with NULL.
* We check NULL in callee rather than caller.
*/
- if (memcg) {
- atomic_dec(&memcg_moving);
+ if (memcg)
atomic_dec(&memcg->moving_account);
- }
}
/*
@@ -2204,41 +2198,52 @@ cleanup:
return true;
}
-/*
- * Used to update mapped file or writeback or other statistics.
+/**
+ * mem_cgroup_begin_page_stat - begin a page state statistics transaction
+ * @page: page that is going to change accounted state
+ * @locked: &memcg->move_lock slowpath was taken
+ * @flags: IRQ-state flags for &memcg->move_lock
*
- * Notes: Race condition
+ * This function must mark the beginning of an accounted page state
+ * change to prevent double accounting when the page is concurrently
+ * being moved to another memcg:
*
- * Charging occurs during page instantiation, while the page is
- * unmapped and locked in page migration, or while the page table is
- * locked in THP migration. No race is possible.
+ * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ * if (TestClearPageState(page))
+ * mem_cgroup_update_page_stat(memcg, state, -1);
+ * mem_cgroup_end_page_stat(memcg, locked, flags);
*
- * Uncharge happens to pages with zero references, no race possible.
+ * The RCU lock is held throughout the transaction. The fast path can
+ * get away without acquiring the memcg->move_lock (@locked is false)
+ * because page moving starts with an RCU grace period.
*
- * Charge moving between groups is protected by checking mm->moving
- * account and taking the move_lock in the slowpath.
+ * The RCU lock also protects the memcg from being freed when the page
+ * state that is going to change is the only thing preventing the page
+ * from being uncharged. E.g. end-writeback clearing PageWriteback(),
+ * which allows migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
*/
-
-void __mem_cgroup_begin_update_page_stat(struct page *page,
- bool *locked, unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
+ bool *locked,
+ unsigned long *flags)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc;
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
pc = lookup_page_cgroup(page);
again:
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
- return;
- /*
- * If this memory cgroup is not under account moving, we don't
- * need to take move_lock_mem_cgroup(). Because we already hold
- * rcu_read_lock(), any calls to move_account will be delayed until
- * rcu_read_unlock().
- */
- VM_BUG_ON(!rcu_read_lock_held());
+ return NULL;
+
+ *locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
- return;
+ return memcg;
move_lock_mem_cgroup(memcg, flags);
if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
@@ -2246,36 +2251,40 @@ again:
goto again;
}
*locked = true;
+
+ return memcg;
}
-void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+/**
+ * mem_cgroup_end_page_stat - finish a page state statistics transaction
+ * @memcg: the memcg that was accounted against
+ * @locked: value received from mem_cgroup_begin_page_stat()
+ * @flags: value received from mem_cgroup_begin_page_stat()
+ */
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
+ unsigned long flags)
{
- struct page_cgroup *pc = lookup_page_cgroup(page);
+ if (memcg && locked)
+ move_unlock_mem_cgroup(memcg, &flags);
- /*
- * It's guaranteed that pc->mem_cgroup never changes while
- * lock is held because a routine modifies pc->mem_cgroup
- * should take move_lock_mem_cgroup().
- */
- move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+ rcu_read_unlock();
}
-void mem_cgroup_update_page_stat(struct page *page,
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx, int val)
{
- struct mem_cgroup *memcg;
- struct page_cgroup *pc = lookup_page_cgroup(page);
- unsigned long uninitialized_var(flags);
-
- if (mem_cgroup_disabled())
- return;
-
VM_BUG_ON(!rcu_read_lock_held());
- memcg = pc->mem_cgroup;
- if (unlikely(!memcg || !PageCgroupUsed(pc)))
- return;
- this_cpu_add(memcg->stat->count[idx], val);
+ if (memcg)
+ this_cpu_add(memcg->stat->count[idx], val);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 1cc6bfbd872..3e503831e04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1147,6 +1147,7 @@ again:
print_bad_pte(vma, addr, ptent, page);
if (unlikely(!__tlb_remove_page(tlb, page))) {
force_flush = 1;
+ addr += PAGE_SIZE;
break;
}
continue;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 29d8693d0c6..252e1dbbed8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1912,7 +1912,6 @@ void try_offline_node(int nid)
unsigned long start_pfn = pgdat->node_start_pfn;
unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
unsigned long pfn;
- struct page *pgdat_page = virt_to_page(pgdat);
int i;
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -1941,10 +1940,6 @@ void try_offline_node(int nid)
node_set_offline(nid);
unregister_one_node(nid);
- if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
- /* node data is allocated from boot memory */
- return;
-
/* free waittable in each zone */
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7f855206e7f..87e82b38453 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
end, prev->vm_pgoff, NULL);
if (err)
return NULL;
- khugepaged_enter_vma_merge(prev);
+ khugepaged_enter_vma_merge(prev, vm_flags);
return prev;
}
@@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
next->vm_pgoff - pglen, NULL);
if (err)
return NULL;
- khugepaged_enter_vma_merge(area);
+ khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
@@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
vma_unlock_anon_vma(vma);
- khugepaged_enter_vma_merge(vma);
+ khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(vma->vm_mm);
return error;
}
@@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
vma_unlock_anon_vma(vma);
- khugepaged_enter_vma_merge(vma);
+ khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(vma->vm_mm);
return error;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bbf405a3a18..5340f6b9131 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
dump_tasks(memcg, nodemask);
}
+/*
+ * Number of OOM killer invocations (including memcg OOM killer).
+ * Primarily used by PM freezer to check for potential races with
+ * OOM killed frozen task.
+ */
+static atomic_t oom_kills = ATOMIC_INIT(0);
+
+int oom_kills_count(void)
+{
+ return atomic_read(&oom_kills);
+}
+
+void note_oom_kill(void)
+{
+ atomic_inc(&oom_kills);
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ff24c9d8311..19ceae87522 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2116,23 +2116,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
- * Helper function for set_page_writeback family.
- *
- * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
- * while calling this function.
- * See test_set_page_writeback for example.
- *
- * NOTE: Unlike account_page_dirtied this does not rely on being atomic
- * wrt interrupts.
- */
-void account_page_writeback(struct page *page)
-{
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
- inc_zone_page_state(page, NR_WRITEBACK);
-}
-EXPORT_SYMBOL(account_page_writeback);
-
-/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
@@ -2344,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- int ret;
- bool locked;
unsigned long memcg_flags;
+ struct mem_cgroup *memcg;
+ bool locked;
+ int ret;
- mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2369,22 +2353,23 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- int ret;
- bool locked;
unsigned long memcg_flags;
+ struct mem_cgroup *memcg;
+ bool locked;
+ int ret;
- mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2410,9 +2395,11 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
} else {
ret = TestSetPageWriteback(page);
}
- if (!ret)
- account_page_writeback(page);
- mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
+ if (!ret) {
+ mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITEBACK);
+ }
+ mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 736d8e1b638..9cd36b82244 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2252,6 +2252,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
}
/*
+ * PM-freezer should be notified that there might be an OOM killer on
+ * its way to kill and wake somebody up. This is too early and we might
+ * end up not killing anything but false positives are acceptable.
+ * See freeze_processes.
+ */
+ note_oom_kill();
+
+ /*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3708264d283..5331c2bd85a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -171,6 +171,7 @@ static void free_page_cgroup(void *addr)
sizeof(struct page_cgroup) * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 116a5053415..19886fb2f13 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1042,15 +1042,46 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- bool locked;
+ struct mem_cgroup *memcg;
unsigned long flags;
+ bool locked;
- mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
+ mem_cgroup_end_page_stat(memcg, locked, flags);
+}
+
+static void page_remove_file_rmap(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+ bool locked;
+
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ goto out;
+
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page)))
+ goto out;
+
+ /*
+ * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption disabled.
+ */
+ __dec_zone_page_state(page, NR_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+out:
+ mem_cgroup_end_page_stat(memcg, locked, flags);
}
/**
@@ -1061,46 +1092,33 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
- bool anon = PageAnon(page);
- bool locked;
- unsigned long flags;
-
- /*
- * The anon case has no mem_cgroup page_stat to update; but may
- * uncharge_page() below, where the lock ordering can deadlock if
- * we hold the lock against page_stat move: so avoid it on anon.
- */
- if (!anon)
- mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page);
+ return;
+ }
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
/*
- * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
- * and not charged by memcg for now.
- *
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
- * these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- if (unlikely(PageHuge(page)))
- goto out;
- if (anon) {
- if (PageTransHuge(page))
- __dec_zone_page_state(page,
- NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
- } else {
- __dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
- }
+ if (PageTransHuge(page))
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+ -hpage_nr_pages(page));
+
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1110,10 +1128,6 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
- return;
-out:
- if (!anon)
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index cd6fc7590e5..185836ba53e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2345,6 +2345,32 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
return 0;
}
+static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+{
+ struct dentry *whiteout;
+ int error;
+
+ whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
+ if (!whiteout)
+ return -ENOMEM;
+
+ error = shmem_mknod(old_dir, whiteout,
+ S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+ dput(whiteout);
+ if (error)
+ return error;
+
+ /*
+ * Cheat and hash the whiteout while the old dentry is still in
+ * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
+ *
+ * d_lookup() will consistently find one of them at this point,
+ * not sure which one, but that isn't even important.
+ */
+ d_rehash(whiteout);
+ return 0;
+}
+
/*
* The VFS layer already does all the dentry stuff for rename,
* we just have to decrement the usage count for the target if
@@ -2356,7 +2382,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
struct inode *inode = old_dentry->d_inode;
int they_are_dirs = S_ISDIR(inode->i_mode);
- if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
@@ -2365,6 +2391,14 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
+ if (flags & RENAME_WHITEOUT) {
+ int error;
+
+ error = shmem_whiteout(old_dir, old_dentry);
+ if (error)
+ return error;
+ }
+
if (new_dentry->d_inode) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs) {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3a6e0cfdf03..406944207b6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -93,16 +93,6 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
s->object_size);
continue;
}
-
-#if !defined(CONFIG_SLUB)
- if (!strcmp(s->name, name)) {
- pr_err("%s (%s): Cache name already exists.\n",
- __func__, name);
- dump_stack();
- s = NULL;
- return -EINVAL;
- }
-#endif
}
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
diff --git a/mm/truncate.c b/mm/truncate.c
index 96d167372d8..f1e4d605236 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -20,6 +20,7 @@
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
#include <linux/cleancache.h>
+#include <linux/rmap.h>
#include "internal.h"
static void clear_exceptional_entry(struct address_space *mapping,
@@ -714,17 +715,73 @@ EXPORT_SYMBOL(truncate_pagecache);
* necessary) to @newsize. It will be typically be called from the filesystem's
* setattr function when ATTR_SIZE is passed in.
*
- * Must be called with inode_mutex held and before all filesystem specific
- * block truncation has been performed.
+ * Must be called with a lock serializing truncates and writes (generally
+ * i_mutex but e.g. xfs uses a different lock) and before all filesystem
+ * specific block truncation has been performed.
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
+ loff_t oldsize = inode->i_size;
+
i_size_write(inode, newsize);
+ if (newsize > oldsize)
+ pagecache_isize_extended(inode, oldsize, newsize);
truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);
/**
+ * pagecache_isize_extended - update pagecache after extension of i_size
+ * @inode: inode for which i_size was extended
+ * @from: original inode size
+ * @to: new inode size
+ *
+ * Handle extension of inode size either caused by extending truncate or by
+ * write starting after current i_size. We mark the page straddling current
+ * i_size RO so that page_mkwrite() is called on the nearest write access to
+ * the page. This way filesystem can be sure that page_mkwrite() is called on
+ * the page before user writes to the page via mmap after the i_size has been
+ * changed.
+ *
+ * The function must be called after i_size is updated so that page fault
+ * coming after we unlock the page will already see the new i_size.
+ * The function must be called while we still hold i_mutex - this not only
+ * makes sure i_size is stable but also that userspace cannot observe new
+ * i_size value before we are prepared to store mmap writes at new inode size.
+ */
+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
+{
+ int bsize = 1 << inode->i_blkbits;
+ loff_t rounded_from;
+ struct page *page;
+ pgoff_t index;
+
+ WARN_ON(to > inode->i_size);
+
+ if (from >= to || bsize == PAGE_CACHE_SIZE)
+ return;
+ /* Page straddling @from will not have any hole block created? */
+ rounded_from = round_up(from, bsize);
+ if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
+ return;
+
+ index = from >> PAGE_CACHE_SHIFT;
+ page = find_lock_page(inode->i_mapping, index);
+ /* Page not cached? Nothing to do */
+ if (!page)
+ return;
+ /*
+ * See clear_page_dirty_for_io() for details why set_page_dirty()
+ * is needed.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+}
+EXPORT_SYMBOL(pagecache_isize_extended);
+
+/**
* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
* @inode: inode
* @lstart: offset of beginning of hole