diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 8 | ||||
-rw-r--r-- | mm/filemap.c | 13 | ||||
-rw-r--r-- | mm/huge_memory.c | 16 | ||||
-rw-r--r-- | mm/hugetlb.c | 5 | ||||
-rw-r--r-- | mm/memcontrol.c | 103 | ||||
-rw-r--r-- | mm/mempolicy.c | 11 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 7 | ||||
-rw-r--r-- | mm/page-writeback.c | 55 | ||||
-rw-r--r-- | mm/page_alloc.c | 10 | ||||
-rw-r--r-- | mm/percpu-vm.c | 17 | ||||
-rw-r--r-- | mm/percpu.c | 68 | ||||
-rw-r--r-- | mm/slab.c | 5 | ||||
-rw-r--r-- | mm/slub.c | 42 | ||||
-rw-r--r-- | mm/vmalloc.c | 31 | ||||
-rw-r--r-- | mm/vmscan.c | 26 |
17 files changed, 292 insertions, 129 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a0860640378..71034f41a2b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -724,6 +724,14 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); + /* + * If bdi_unregister() had already been called earlier, the + * wakeup_timer could still be armed because bdi_prune_sb() + * can race with the bdi_wakeup_thread_delayed() calls from + * __mark_inode_dirty(). + */ + del_timer_sync(&bdi->wb.wakeup_timer); + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); diff --git a/mm/filemap.c b/mm/filemap.c index c0018f2d50e..5f0a3c91fda 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1828,7 +1828,7 @@ repeat: page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + err = add_to_page_cache_lru(page, mapping, index, gfp); if (unlikely(err)) { page_cache_release(page); if (err == -EEXIST) @@ -1925,10 +1925,7 @@ static struct page *wait_on_page_read(struct page *page) * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with - * any new page allocations done using the specified allocation flags. Note - * that the Radix tree operations will still use GFP_KERNEL, so you can't - * expect to do this atomically or anything like that - but you can pass in - * other page requirements. + * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. */ @@ -2407,7 +2404,6 @@ static ssize_t generic_perform_write(struct file *file, iov_iter_count(i)); again: - /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the @@ -2463,7 +2459,10 @@ again: written += copied; balance_dirty_pages_ratelimited(mapping); - + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } } while (iov_iter_count(i)); return written ? written : status; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4298abaae15..36b3d988b4e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2259,12 +2259,8 @@ static void khugepaged_do_scan(struct page **hpage) static void khugepaged_alloc_sleep(void) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } #ifndef CONFIG_NUMA @@ -2313,14 +2309,10 @@ static void khugepaged_loop(void) if (unlikely(kthread_should_stop())) break; if (khugepaged_has_work()) { - DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) continue; - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_scan_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); } else if (khugepaged_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dae27ba3be2..2316840b337 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -576,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } @@ -900,7 +901,6 @@ retry: h->resv_huge_pages += delta; ret = 0; - spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) @@ -914,6 +914,7 @@ retry: VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } + spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ free: @@ -2422,6 +2423,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { + page_cache_release(new_page); + page_cache_release(old_page); /* Caller expects lock to be held */ spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6aff93c98ac..94da8ee9e2c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -50,6 +50,8 @@ #include <linux/cpu.h> #include <linux/oom.h> #include "internal.h" +#include <net/sock.h> +#include <net/tcp_memcontrol.h> #include <asm/uaccess.h> @@ -286,6 +288,10 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; + +#ifdef CONFIG_INET + struct tcp_memcontrol tcp_mem; +#endif }; /* Stuffs for move charges at task migration. */ @@ -365,7 +371,58 @@ enum charge_type { static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); + +/* Writing them here to avoid exposing memcg's inner layout */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_INET +#include <net/sock.h> +#include <net/ip.h> + +static bool mem_cgroup_is_root(struct mem_cgroup *memcg); +void sock_update_memcg(struct sock *sk) +{ + /* A socket spends its whole life in the same cgroup */ + if (sk->sk_cgrp) { + WARN_ON(1); + return; + } + if (static_branch(&memcg_socket_limit_enabled)) { + struct mem_cgroup *memcg; + + BUG_ON(!sk->sk_prot->proto_cgroup); + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (!mem_cgroup_is_root(memcg)) { + mem_cgroup_get(memcg); + sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); + } + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) { + struct mem_cgroup *memcg; + WARN_ON(!sk->sk_cgrp->memcg); + memcg = sk->sk_cgrp->memcg; + mem_cgroup_put(memcg); + } +} + +struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) +{ + if (!memcg || mem_cgroup_is_root(memcg)) + return NULL; + + return &memcg->tcp_mem.cg_proto; +} +EXPORT_SYMBOL(tcp_proto_cgroup); +#endif /* CONFIG_INET */ +#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -745,7 +802,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return container_of(cgroup_subsys_state(cont, mem_cgroup_subsys_id), struct mem_cgroup, @@ -4612,6 +4669,36 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + /* + * Part of this would be better living in a separate allocation + * function, leaving us with just the cgroup tree population work. + * We, however, depend on state such as network's proto_list that + * is only initialized after cgroup creation. I found the less + * cumbersome way to deal with it to defer it all to populate time + */ + return mem_cgroup_sockets_init(cont, ss); +}; + +static void kmem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + mem_cgroup_sockets_destroy(cont, ss); +} +#else +static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + return 0; +} + +static void kmem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ +} +#endif + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4843,12 +4930,13 @@ static void mem_cgroup_put(struct mem_cgroup *memcg) /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { if (!memcg->res.parent) return NULL; return mem_cgroup_from_res_counter(memcg->res.parent, res); } +EXPORT_SYMBOL(parent_mem_cgroup); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) @@ -4907,9 +4995,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) int cpu; enable_swap_cgroup(); parent = NULL; - root_mem_cgroup = memcg; if (mem_cgroup_soft_limit_tree_init()) goto free_out; + root_mem_cgroup = memcg; for_each_possible_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); @@ -4948,7 +5036,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &memcg->css; free_out: __mem_cgroup_free(memcg); - root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -4965,6 +5052,8 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_destroy(ss, cont); + mem_cgroup_put(memcg); } @@ -4978,6 +5067,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, if (!ret) ret = register_memsw_files(cont, ss); + + if (!ret) + ret = register_kmem_files(cont, ss); + return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index adc39548181..c3fdbcb1765 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,6 +636,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; + pgoff_t pgoff; unsigned long vmstart; unsigned long vmend; @@ -643,13 +644,21 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (!vma || vma->vm_start > start) return -EFAULT; + if (start > vma->vm_start) + prev = vma; + for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); + if (mpol_equal(vma_policy(vma), new_pol)) + continue; + + pgoff = vma->vm_pgoff + + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma->anon_vma, vma->vm_file, pgoff, new_pol); if (prev) { vma = prev; diff --git a/mm/migrate.c b/mm/migrate.c index 578e29174fa..177aca424a0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -871,9 +871,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); -out: unlock_page(hpage); +out: if (rc != -EAGAIN) { list_del(&hpage->lru); put_page(hpage); diff --git a/mm/nommu.c b/mm/nommu.c index 73419c55eda..b982290fd96 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -454,7 +454,7 @@ void __attribute__((weak)) vmalloc_sync_all(void) * between processes, it syncs the pagetable across all * processes. */ -struct vm_struct *alloc_vm_area(size_t size) +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); return NULL; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 471dedb463a..069b64e521f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -176,7 +176,7 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, const nodemask_t *nodemask, unsigned long totalpages) { - int points; + long points; if (oom_unkillable_task(p, mem, nodemask)) return 0; @@ -185,6 +185,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, if (!p) return 0; + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + task_unlock(p); + return 0; + } + /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a3278f00523..50f08241f98 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -128,7 +128,6 @@ unsigned long global_dirty_limit; * */ static struct prop_descriptor vm_completions; -static struct prop_descriptor vm_dirties; /* * couple the period to the dirty_ratio: @@ -154,7 +153,6 @@ static void update_completion_period(void) { int shift = calc_period_shift(); prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); writeback_set_ratelimit(); } @@ -235,11 +233,6 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -void task_dirty_inc(struct task_struct *tsk) -{ - prop_inc_single(&vm_dirties, &tsk->dirties); -} - /* * Obtain an accurate fraction of the BDI's portion. */ @@ -418,8 +411,13 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) * * Returns @bdi's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. - * And the "limit" in the name is not seriously taken as hard limit in - * balance_dirty_pages(). + * + * Note that balance_dirty_pages() will only seriously take it as a hard limit + * when sleeping max_pause per page is not enough to keep the dirty pages under + * control. For example, when the device is completely stalled due to some error + * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * In the other normal situations, it acts more gently by throttling the tasks + * more (rather than completely block them) when the bdi dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices @@ -601,6 +599,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, */ if (unlikely(bdi_thresh > thresh)) bdi_thresh = thresh; + /* + * It's very possible that bdi_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. + */ bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); /* * scale global setpoint to bdi's: @@ -984,8 +989,7 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi, * * 8 serves as the safety ratio. */ - if (bdi_dirty) - t = min(t, bdi_dirty * HZ / (8 * bw + 1)); + t = min(t, bdi_dirty * HZ / (8 * bw + 1)); /* * The pause time will be settled within range (max_pause/4, max_pause). @@ -1133,17 +1137,30 @@ pause: pages_dirtied, pause, start_time); - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); - dirty_thresh = hard_dirty_limit(dirty_thresh); /* - * max-pause area. If dirty exceeded but still within this - * area, no need to sleep for more than 200ms: (a) 8 pages per - * 200ms is typically more than enough to curb heavy dirtiers; - * (b) the pause time limit makes the dirtiers more responsive. + * This is typically equal to (nr_dirty < dirty_thresh) and can + * also keep "1000+ dd on a slow USB stick" under control. */ - if (nr_dirty < dirty_thresh) + if (task_ratelimit) + break; + + /* + * In the case of an unresponding NFS server and the NFS dirty + * pages exceeds dirty_thresh, give the other good bdi's a pipe + * to go through, so that tasks on them still remain responsive. + * + * In theory 1 page is enough to keep the comsumer-producer + * pipe going: the flusher cleans 1 page => the task dirties 1 + * more page. However bdi_dirty has accounting errors. So use + * the larger and more IO friendly bdi_stat_error. + */ + if (bdi_dirty <= bdi_stat_error(bdi)) + break; + + if (fatal_signal_pending(current)) break; } @@ -1395,7 +1412,6 @@ void __init page_writeback_init(void) shift = calc_period_shift(); prop_descriptor_init(&vm_completions, shift); - prop_descriptor_init(&vm_dirties, shift); } /** @@ -1724,7 +1740,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); - task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9dd443d89d8..2b8ba3aebf6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -356,8 +356,8 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } @@ -3377,9 +3377,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) unsigned long block_migratetype; int reserve; - /* Get the start pfn, end pfn and the number of blocks to reserve */ + /* + * Get the start pfn, end pfn and the number of blocks to reserve + * We have to be careful to be aligned to pageblock_nr_pages to + * make sure that we always check pfn_valid for the first page in + * the block. + */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; + start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index ea534960a04..12a48a88c0d 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -50,14 +50,13 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, if (!pages || !bitmap) { if (may_alloc && !pages) - pages = pcpu_mem_alloc(pages_size); + pages = pcpu_mem_zalloc(pages_size); if (may_alloc && !bitmap) - bitmap = pcpu_mem_alloc(bitmap_size); + bitmap = pcpu_mem_zalloc(bitmap_size); if (!pages || !bitmap) return NULL; } - memset(pages, 0, pages_size); bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); *bitmapp = bitmap; @@ -143,8 +142,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) @@ -206,8 +205,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -284,8 +283,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** diff --git a/mm/percpu.c b/mm/percpu.c index bf80e55dbed..716eb4acf2f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; -/* cpus with the lowest and highest unit numbers */ -static unsigned int pcpu_first_unit_cpu __read_mostly; -static unsigned int pcpu_last_unit_cpu __read_mostly; +/* cpus with the lowest and highest unit addresses */ +static unsigned int pcpu_low_unit_cpu __read_mostly; +static unsigned int pcpu_high_unit_cpu __read_mostly; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; @@ -273,11 +273,11 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) /** - * pcpu_mem_alloc - allocate memory + * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, - * kzalloc() is used; otherwise, vmalloc() is used. The returned + * kzalloc() is used; otherwise, vzalloc() is used. The returned * memory is always zeroed. * * CONTEXT: @@ -286,7 +286,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_mem_alloc(size_t size) +static void *pcpu_mem_zalloc(size_t size) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; @@ -302,7 +302,7 @@ static void *pcpu_mem_alloc(size_t size) * @ptr: memory to free * @size: size of the area * - * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). + * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr, size_t size) { @@ -384,7 +384,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; - new = pcpu_mem_alloc(new_size); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -604,11 +604,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; - chunk = pcpu_mem_alloc(pcpu_chunk_struct_size); + chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; - chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * + sizeof(chunk->map[0])); if (!chunk->map) { kfree(chunk); return NULL; @@ -977,6 +978,17 @@ bool is_kernel_percpu_address(unsigned long addr) * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * + * percpu allocator has special setup for the first chunk, which currently + * supports either embedding in linear address space or vmalloc mapping, + * and, from the second one, the backing allocator (currently either vm or + * km) provides translation. + * + * The addr can be tranlated simply without checking if it falls into the + * first chunk. But the current code reflects better how percpu allocator + * actually works, and the verification can discover both bugs in percpu + * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current + * code. + * * RETURNS: * The physical address for @addr. */ @@ -984,19 +996,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; - unsigned long first_start, first_end; + unsigned long first_low, first_high; unsigned int cpu; /* - * The following test on first_start/end isn't strictly + * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. */ - first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); - first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, - pcpu_unit_pages); - if ((unsigned long)addr >= first_start && - (unsigned long)addr < first_end) { + first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); + first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, + pcpu_unit_pages); + if ((unsigned long)addr >= first_low && + (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); @@ -1011,9 +1023,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) if (!is_vmalloc_addr(addr)) return __pa(addr); else - return page_to_phys(vmalloc_to_page(addr)); + return page_to_phys(vmalloc_to_page(addr)) + + offset_in_page(addr); } else - return page_to_phys(pcpu_addr_to_page(addr)); + return page_to_phys(pcpu_addr_to_page(addr)) + + offset_in_page(addr); } /** @@ -1233,7 +1247,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; - pcpu_first_unit_cpu = NR_CPUS; + + pcpu_low_unit_cpu = NR_CPUS; + pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; @@ -1253,9 +1269,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; - if (pcpu_first_unit_cpu == NR_CPUS) - pcpu_first_unit_cpu = cpu; - pcpu_last_unit_cpu = cpu; + /* determine low/high unit_cpu */ + if (pcpu_low_unit_cpu == NR_CPUS || + unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) + pcpu_low_unit_cpu = cpu; + if (pcpu_high_unit_cpu == NR_CPUS || + unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) + pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; @@ -1889,7 +1909,7 @@ void __init percpu_init_late(void) BUILD_BUG_ON(size > PAGE_SIZE); - map = pcpu_mem_alloc(size); + map = pcpu_mem_zalloc(size); BUG_ON(!map); spin_lock_irqsave(&pcpu_lock, flags); diff --git a/mm/slab.c b/mm/slab.c index 708efe88615..83311c9aaf9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -595,6 +595,7 @@ static enum { PARTIAL_AC, PARTIAL_L3, EARLY, + LATE, FULL } g_cpucache_up; @@ -671,7 +672,7 @@ static void init_node_lock_keys(int q) { struct cache_sizes *s = malloc_sizes; - if (g_cpucache_up != FULL) + if (g_cpucache_up < LATE) return; for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { @@ -1666,6 +1667,8 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; + g_cpucache_up = LATE; + /* Annotate slab for lockdep -- annotate the malloc caches */ init_lock_keys(); diff --git a/mm/slub.c b/mm/slub.c index 7d2a996c307..ed3334d9b6d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1862,7 +1862,7 @@ static void unfreeze_partials(struct kmem_cache *s) { struct kmem_cache_node *n = NULL; struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - struct page *page; + struct page *page, *discard_page = NULL; while ((page = c->partial)) { enum slab_modes { M_PARTIAL, M_FREE }; @@ -1904,7 +1904,8 @@ static void unfreeze_partials(struct kmem_cache *s) if (l == M_PARTIAL) remove_partial(n, page); else - add_partial(n, page, 1); + add_partial(n, page, + DEACTIVATE_TO_TAIL); l = m; } @@ -1915,14 +1916,22 @@ static void unfreeze_partials(struct kmem_cache *s) "unfreezing slab")); if (m == M_FREE) { - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, page); - stat(s, FREE_SLAB); + page->next = discard_page; + discard_page = page; } } if (n) spin_unlock(&n->list_lock); + + while (discard_page) { + page = discard_page; + discard_page = discard_page->next; + + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } } /* @@ -1969,7 +1978,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page->pobjects = pobjects; page->next = oldpage; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + } while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); stat(s, CPU_PARTIAL_FREE); return pobjects; } @@ -4435,30 +4444,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + int node = ACCESS_ONCE(c->node); struct page *page; - if (!c || c->node < 0) + if (node < 0) continue; - - if (c->page) { - if (flags & SO_TOTAL) - x = c->page->objects; + page = ACCESS_ONCE(c->page); + if (page) { + if (flags & SO_TOTAL) + x = page->objects; else if (flags & SO_OBJECTS) - x = c->page->inuse; + x = page->inuse; else x = 1; total += x; - nodes[c->node] += x; + nodes[node] += x; } page = c->partial; if (page) { x = page->pobjects; - total += x; - nodes[c->node] += x; + total += x; + nodes[node] += x; } - per_cpu[c->node]++; + per_cpu[node]++; } } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b669aa6f6ca..27be2f0d4cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1290,7 +1290,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) { - static struct vmap_area *va; + struct vmap_area *va; struct vm_struct *area; BUG_ON(in_interrupt()); @@ -1633,6 +1633,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + if (!addr) + return NULL; /* * In this function, newly allocated vm_struct is not added @@ -2141,23 +2143,30 @@ void __attribute__((weak)) vmalloc_sync_all(void) static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) { - /* apply_to_page_range() does all the hard work. */ + pte_t ***p = data; + + if (p) { + *(*p) = pte; + (*p)++; + } return 0; } /** * alloc_vm_area - allocate a range of kernel address space * @size: size of the area + * @ptes: returns the PTEs for the address space * * Returns: NULL on failure, vm_struct on success * * This function reserves a range of kernel address space, and * allocates pagetables to map that range. No actual mappings - * are created. If the kernel address space is not shared - * between processes, it syncs the pagetable across all - * processes. + * are created. + * + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) + * allocated for the VM area are returned. */ -struct vm_struct *alloc_vm_area(size_t size) +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { struct vm_struct *area; @@ -2171,19 +2180,11 @@ struct vm_struct *alloc_vm_area(size_t size) * of kernel virtual address space and mapped into init_mm. */ if (apply_to_page_range(&init_mm, (unsigned long)area->addr, - area->size, f, NULL)) { + size, f, ptes ? &ptes : NULL)) { free_vm_area(area); return NULL; } - /* - * If the allocated address space is passed to a hypercall - * before being used then we cannot rely on a page fault to - * trigger an update of the page tables. So sync all the page - * tables here. - */ - vmalloc_sync_all(); - return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); diff --git a/mm/vmscan.c b/mm/vmscan.c index a1893c05079..f54a05b7a61 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, */ void register_shrinker(struct shrinker *shrinker) { - shrinker->nr = 0; + atomic_long_set(&shrinker->nr_in_batch, 0); down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); @@ -247,25 +247,26 @@ unsigned long shrink_slab(struct shrink_control *shrink, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; - unsigned long total_scan; - unsigned long max_pass; + long total_scan; + long max_pass; int shrink_ret = 0; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + max_pass = do_shrinker_shrink(shrinker, shrink, 0); + if (max_pass <= 0) + continue; + /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - do { - nr = shrinker->nr; - } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); total_scan = nr; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); @@ -325,12 +326,11 @@ unsigned long shrink_slab(struct shrink_control *shrink, * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - do { - nr = shrinker->nr; - new_nr = total_scan + nr; - if (total_scan <= 0) - break; - } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_in_batch); + else + new_nr = atomic_long_read(&shrinker->nr_in_batch); trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } |