summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/backing-dev.c 8
-rw-r--r-- mm/hugetlb.c 2
-rw-r--r-- mm/nommu.c 2
-rw-r--r-- mm/oom_kill.c 5
-rw-r--r-- mm/page-writeback.c 23
-rw-r--r-- mm/percpu-vm.c 17
-rw-r--r-- mm/percpu.c 62
-rw-r--r-- mm/slab.c 5
-rw-r--r-- mm/slub.c 42
-rw-r--r-- mm/vmalloc.c 27
10 files changed, 114 insertions, 79 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a0860640378..71034f41a2b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -724,6 +724,14 @@ void bdi_destroy(struct backing_dev_info *bdi)
bdi_unregister(bdi);
+ /*
+ * If bdi_unregister() had already been called earlier, the
+ * wakeup_timer could still be armed because bdi_prune_sb()
+ * can race with the bdi_wakeup_thread_delayed() calls from
+ * __mark_inode_dirty().
+ */
+ del_timer_sync(&bdi->wb.wakeup_timer);
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dae27ba3be2..bb28a5f9db8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2422,6 +2422,8 @@ retry_avoidcopy:
* anon_vma prepared.
*/
if (unlikely(anon_vma_prepare(vma))) {
+ page_cache_release(new_page);
+ page_cache_release(old_page);
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 73419c55eda..b982290fd96 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -454,7 +454,7 @@ void __attribute__((weak)) vmalloc_sync_all(void)
* between processes, it syncs the pagetable across all
* processes.
*/
-struct vm_struct *alloc_vm_area(size_t size)
+struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
BUG();
return NULL;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 471dedb463a..76f2c5ae908 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -185,6 +185,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
if (!p)
return 0;
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ task_unlock(p);
+ return 0;
+ }
+
/*
* The memory controller may have a limit of 0 bytes, so avoid a divide
* by zero, if necessary.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a3278f00523..71252486bc6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -128,7 +128,6 @@ unsigned long global_dirty_limit;
*
*/
static struct prop_descriptor vm_completions;
-static struct prop_descriptor vm_dirties;
/*
* couple the period to the dirty_ratio:
@@ -154,7 +153,6 @@ static void update_completion_period(void)
{
int shift = calc_period_shift();
prop_change_shift(&vm_completions, shift);
- prop_change_shift(&vm_dirties, shift);
writeback_set_ratelimit();
}
@@ -235,11 +233,6 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL_GPL(bdi_writeout_inc);
-void task_dirty_inc(struct task_struct *tsk)
-{
- prop_inc_single(&vm_dirties, &tsk->dirties);
-}
-
/*
* Obtain an accurate fraction of the BDI's portion.
*/
@@ -1133,17 +1126,17 @@ pause:
pages_dirtied,
pause,
start_time);
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ __set_current_state(TASK_KILLABLE);
io_schedule_timeout(pause);
- dirty_thresh = hard_dirty_limit(dirty_thresh);
/*
- * max-pause area. If dirty exceeded but still within this
- * area, no need to sleep for more than 200ms: (a) 8 pages per
- * 200ms is typically more than enough to curb heavy dirtiers;
- * (b) the pause time limit makes the dirtiers more responsive.
+ * This is typically equal to (nr_dirty < dirty_thresh) and can
+ * also keep "1000+ dd on a slow USB stick" under control.
*/
- if (nr_dirty < dirty_thresh)
+ if (task_ratelimit)
+ break;
+
+ if (fatal_signal_pending(current))
break;
}
@@ -1395,7 +1388,6 @@ void __init page_writeback_init(void)
shift = calc_period_shift();
prop_descriptor_init(&vm_completions, shift);
- prop_descriptor_init(&vm_dirties, shift);
}
/**
@@ -1724,7 +1716,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
__inc_zone_page_state(page, NR_DIRTIED);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
- task_dirty_inc(current);
task_io_account_write(PAGE_CACHE_SIZE);
}
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index ea534960a04..12a48a88c0d 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -50,14 +50,13 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
if (!pages || !bitmap) {
if (may_alloc && !pages)
- pages = pcpu_mem_alloc(pages_size);
+ pages = pcpu_mem_zalloc(pages_size);
if (may_alloc && !bitmap)
- bitmap = pcpu_mem_alloc(bitmap_size);
+ bitmap = pcpu_mem_zalloc(bitmap_size);
if (!pages || !bitmap)
return NULL;
}
- memset(pages, 0, pages_size);
bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
*bitmapp = bitmap;
@@ -143,8 +142,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_cache_vunmap(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -206,8 +205,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_tlb_kernel_range(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -284,8 +283,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_cache_vmap(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
/**
diff --git a/mm/percpu.c b/mm/percpu.c
index bf80e55dbed..3bb810a7200 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
-/* cpus with the lowest and highest unit numbers */
-static unsigned int pcpu_first_unit_cpu __read_mostly;
-static unsigned int pcpu_last_unit_cpu __read_mostly;
+/* cpus with the lowest and highest unit addresses */
+static unsigned int pcpu_low_unit_cpu __read_mostly;
+static unsigned int pcpu_high_unit_cpu __read_mostly;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
@@ -273,11 +273,11 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
(rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
/**
- * pcpu_mem_alloc - allocate memory
+ * pcpu_mem_zalloc - allocate memory
* @size: bytes to allocate
*
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vmalloc() is used. The returned
+ * kzalloc() is used; otherwise, vzalloc() is used. The returned
* memory is always zeroed.
*
* CONTEXT:
@@ -286,7 +286,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
-static void *pcpu_mem_alloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size)
{
if (WARN_ON_ONCE(!slab_is_available()))
return NULL;
@@ -302,7 +302,7 @@ static void *pcpu_mem_alloc(size_t size)
* @ptr: memory to free
* @size: size of the area
*
- * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc().
+ * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
*/
static void pcpu_mem_free(void *ptr, size_t size)
{
@@ -384,7 +384,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
unsigned long flags;
- new = pcpu_mem_alloc(new_size);
+ new = pcpu_mem_zalloc(new_size);
if (!new)
return -ENOMEM;
@@ -604,11 +604,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
struct pcpu_chunk *chunk;
- chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
+ chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
if (!chunk)
return NULL;
- chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+ chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
+ sizeof(chunk->map[0]));
if (!chunk->map) {
kfree(chunk);
return NULL;
@@ -977,6 +978,17 @@ bool is_kernel_percpu_address(unsigned long addr)
* address. The caller is responsible for ensuring @addr stays valid
* until this function finishes.
*
+ * percpu allocator has special setup for the first chunk, which currently
+ * supports either embedding in linear address space or vmalloc mapping,
+ * and, from the second one, the backing allocator (currently either vm or
+ * km) provides translation.
+ *
+ * The addr can be translated simply without checking if it falls into the
+ * first chunk. But the current code reflects better how percpu allocator
+ * actually works, and the verification can discover both bugs in percpu
+ * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
+ * code.
+ *
* RETURNS:
* The physical address for @addr.
*/
@@ -984,19 +996,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
bool in_first_chunk = false;
- unsigned long first_start, first_end;
+ unsigned long first_low, first_high;
unsigned int cpu;
/*
- * The following test on first_start/end isn't strictly
+ * The following test on first_low/high isn't strictly
* necessary but will speed up lookups of addresses which
* aren't in the first chunk.
*/
- first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0);
- first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu,
- pcpu_unit_pages);
- if ((unsigned long)addr >= first_start &&
- (unsigned long)addr < first_end) {
+ first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
+ first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
+ pcpu_unit_pages);
+ if ((unsigned long)addr >= first_low &&
+ (unsigned long)addr < first_high) {
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
@@ -1233,7 +1245,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
- pcpu_first_unit_cpu = NR_CPUS;
+
+ pcpu_low_unit_cpu = NR_CPUS;
+ pcpu_high_unit_cpu = NR_CPUS;
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
const struct pcpu_group_info *gi = &ai->groups[group];
@@ -1253,9 +1267,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
unit_map[cpu] = unit + i;
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
- if (pcpu_first_unit_cpu == NR_CPUS)
- pcpu_first_unit_cpu = cpu;
- pcpu_last_unit_cpu = cpu;
+ /* determine low/high unit_cpu */
+ if (pcpu_low_unit_cpu == NR_CPUS ||
+ unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
+ pcpu_low_unit_cpu = cpu;
+ if (pcpu_high_unit_cpu == NR_CPUS ||
+ unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
+ pcpu_high_unit_cpu = cpu;
}
}
pcpu_nr_units = unit;
@@ -1889,7 +1907,7 @@ void __init percpu_init_late(void)
BUILD_BUG_ON(size > PAGE_SIZE);
- map = pcpu_mem_alloc(size);
+ map = pcpu_mem_zalloc(size);
BUG_ON(!map);
spin_lock_irqsave(&pcpu_lock, flags);
diff --git a/mm/slab.c b/mm/slab.c
index 708efe88615..83311c9aaf9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -595,6 +595,7 @@ static enum {
PARTIAL_AC,
PARTIAL_L3,
EARLY,
+ LATE,
FULL
} g_cpucache_up;
@@ -671,7 +672,7 @@ static void init_node_lock_keys(int q)
{
struct cache_sizes *s = malloc_sizes;
- if (g_cpucache_up != FULL)
+ if (g_cpucache_up < LATE)
return;
for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -1666,6 +1667,8 @@ void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
+ g_cpucache_up = LATE;
+
/* Annotate slab for lockdep -- annotate the malloc caches */
init_lock_keys();
diff --git a/mm/slub.c b/mm/slub.c
index 7d2a996c307..ed3334d9b6d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1862,7 +1862,7 @@ static void unfreeze_partials(struct kmem_cache *s)
{
struct kmem_cache_node *n = NULL;
struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
- struct page *page;
+ struct page *page, *discard_page = NULL;
while ((page = c->partial)) {
enum slab_modes { M_PARTIAL, M_FREE };
@@ -1904,7 +1904,8 @@ static void unfreeze_partials(struct kmem_cache *s)
if (l == M_PARTIAL)
remove_partial(n, page);
else
- add_partial(n, page, 1);
+ add_partial(n, page,
+ DEACTIVATE_TO_TAIL);
l = m;
}
@@ -1915,14 +1916,22 @@ static void unfreeze_partials(struct kmem_cache *s)
"unfreezing slab"));
if (m == M_FREE) {
- stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, page);
- stat(s, FREE_SLAB);
+ page->next = discard_page;
+ discard_page = page;
}
}
if (n)
spin_unlock(&n->list_lock);
+
+ while (discard_page) {
+ page = discard_page;
+ discard_page = discard_page->next;
+
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
+ }
}
/*
@@ -1969,7 +1978,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
page->pobjects = pobjects;
page->next = oldpage;
- } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+ } while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
stat(s, CPU_PARTIAL_FREE);
return pobjects;
}
@@ -4435,30 +4444,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
for_each_possible_cpu(cpu) {
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ int node = ACCESS_ONCE(c->node);
struct page *page;
- if (!c || c->node < 0)
+ if (node < 0)
continue;
-
- if (c->page) {
- if (flags & SO_TOTAL)
- x = c->page->objects;
+ page = ACCESS_ONCE(c->page);
+ if (page) {
+ if (flags & SO_TOTAL)
+ x = page->objects;
else if (flags & SO_OBJECTS)
- x = c->page->inuse;
+ x = page->inuse;
else
x = 1;
total += x;
- nodes[c->node] += x;
+ nodes[node] += x;
}
page = c->partial;
if (page) {
x = page->pobjects;
- total += x;
- nodes[c->node] += x;
+ total += x;
+ nodes[node] += x;
}
- per_cpu[c->node]++;
+ per_cpu[node]++;
}
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b669aa6f6ca..3231bf33287 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2141,23 +2141,30 @@ void __attribute__((weak)) vmalloc_sync_all(void)
static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
{
- /* apply_to_page_range() does all the hard work. */
+ pte_t ***p = data;
+
+ if (p) {
+ *(*p) = pte;
+ (*p)++;
+ }
return 0;
}
/**
* alloc_vm_area - allocate a range of kernel address space
* @size: size of the area
+ * @ptes: returns the PTEs for the address space
*
* Returns: NULL on failure, vm_struct on success
*
* This function reserves a range of kernel address space, and
* allocates pagetables to map that range. No actual mappings
- * are created. If the kernel address space is not shared
- * between processes, it syncs the pagetable across all
- * processes.
+ * are created.
+ *
+ * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
+ * allocated for the VM area are returned.
*/
-struct vm_struct *alloc_vm_area(size_t size)
+struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
struct vm_struct *area;
@@ -2171,19 +2178,11 @@ struct vm_struct *alloc_vm_area(size_t size)
* of kernel virtual address space and mapped into init_mm.
*/
if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
- area->size, f, NULL)) {
+ size, f, ptes ? &ptes : NULL)) {
free_vm_area(area);
return NULL;
}
- /*
- * If the allocated address space is passed to a hypercall
- * before being used then we cannot rely on a page fault to
- * trigger an update of the page tables. So sync all the page
- * tables here.
- */
- vmalloc_sync_all();
-
return area;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);