From bebeb3d68b24bb4132d452c5707fe321208bcbcd Mon Sep 17 00:00:00 2001
From: Michel Lespinasse
Date: Fri, 22 Feb 2013 16:32:37 -0800
Subject: mm: introduce mm_populate() for populating new vmas

When creating new mappings using the MAP_POPULATE / MAP_LOCKED flags
(or with MCL_FUTURE in effect), we want to populate the pages within
the newly created vmas.  This may take a while as we may have to read
pages from disk, so ideally we want to do this outside of the
write-locked mmap_sem region.

This change introduces mm_populate(), which is used to defer populating
such mappings until after the mmap_sem write lock has been released.
This is implemented as a generalization of the former do_mlock_pages(),
which accomplished the same task but was only used during mlock() /
mlockall().

Signed-off-by: Michel Lespinasse
Reported-by: Andy Lutomirski
Acked-by: Rik van Riel
Tested-by: Andy Lutomirski
Cc: Greg Ungerer
Cc: David Howells
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/util.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
(limited to 'mm/util.c')

diff --git a/mm/util.c b/mm/util.c
index c55e26b17d9..13467e043e9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -355,12 +355,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 {
 	unsigned long ret;
 	struct mm_struct *mm = current->mm;
+	bool populate;
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
 		down_write(&mm->mmap_sem);
-		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
+		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
+				    &populate);
 		up_write(&mm->mmap_sem);
+		if (!IS_ERR_VALUE(ret) && populate)
+			mm_populate(ret, len);
 	}
 	return ret;
 }
--
cgit v1.2.3-70-g09d2
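For readers following along, a minimal userspace sketch of the behavior this
patch affects (this program is illustrative, not part of the patch; it assumes
Linux with glibc). MAP_POPULATE asks the kernel to prefault the whole mapping,
which is exactly the work mm_populate() now performs after mmap_sem is dropped:

```c
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;

	/* MAP_POPULATE prefaults every page of the mapping up front;
	 * without it, each page would be faulted in lazily on first
	 * access.  Reading pages in from disk here is what makes
	 * population slow enough to want it outside mmap_sem. */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}
	p[0] = 1;	/* already resident: no page fault taken here */
	munmap(p, len);
	return 0;
}
```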
From 41badc15cbad0350de34408c1b0c690f9df76d4b Mon Sep 17 00:00:00 2001
From: Michel Lespinasse
Date: Fri, 22 Feb 2013 16:32:47 -0800
Subject: mm: make do_mmap_pgoff return populate as a size in bytes, not as a bool

do_mmap_pgoff() rounds up the desired size to the next PAGE_SIZE
multiple, but there was no equivalent code in mm_populate(), which
caused issues.

This could be fixed by introducing the same rounding in mm_populate();
however, I think it's preferable to make do_mmap_pgoff() return
populate as a size rather than as a boolean, so we don't have to
duplicate the size rounding logic in mm_populate().

Signed-off-by: Michel Lespinasse
Acked-by: Rik van Riel
Tested-by: Andy Lutomirski
Cc: Greg Ungerer
Cc: David Howells
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/aio.c           | 5 ++---
 include/linux/mm.h | 2 +-
 ipc/shm.c          | 4 ++--
 mm/mmap.c          | 6 +++---
 mm/nommu.c         | 4 ++--
 mm/util.c          | 6 +++---
 6 files changed, 13 insertions(+), 14 deletions(-)
(limited to 'mm/util.c')

diff --git a/fs/aio.c b/fs/aio.c
index 82eec7c7b4b..064bfbe3756 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -101,9 +101,8 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct aio_ring *ring;
 	struct aio_ring_info *info = &ctx->ring_info;
 	unsigned nr_events = ctx->max_reqs;
-	unsigned long size;
+	unsigned long size, populate;
 	int nr_pages;
-	bool populate;
 
 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */
@@ -150,7 +149,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 		return -EAGAIN;
 	}
 	if (populate)
-		mm_populate(info->mmap_base, info->mmap_size);
+		mm_populate(info->mmap_base, populate);
 
 	ctx->user_id = info->mmap_base;

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9a5fcdeaa3a..95db68e34b1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1475,7 +1475,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot, unsigned long flags,
-	unsigned long pgoff, bool *populate);
+	unsigned long pgoff, unsigned long *populate);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
 #ifdef CONFIG_MMU

diff --git a/ipc/shm.c b/ipc/shm.c
index 9f047ba69e6..be3ec9ae454 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -971,7 +971,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 	struct shm_file_data *sfd;
 	struct path path;
 	fmode_t f_mode;
-	bool populate = false;
+	unsigned long populate = 0;
 
 	err = -EINVAL;
 	if (shmid < 0)
@@ -1078,7 +1078,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 invalid:
 	up_write(&current->mm->mmap_sem);
 	if (populate)
-		mm_populate(addr, size);
+		mm_populate(addr, populate);
 
 out_fput:
 	fput(file);

diff --git a/mm/mmap.c b/mm/mmap.c
index 39a3944e165..44bb4d86988 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1163,13 +1163,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long prot,
 			unsigned long flags, unsigned long pgoff,
-			bool *populate)
+			unsigned long *populate)
 {
 	struct mm_struct * mm = current->mm;
 	struct inode *inode;
 	vm_flags_t vm_flags;
 
-	*populate = false;
+	*populate = 0;
 
 	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1307,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 
 	addr = mmap_region(file, addr, len, vm_flags, pgoff);
 	if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
-		*populate = true;
+		*populate = len;
 	return addr;
 }

diff --git a/mm/nommu.c b/mm/nommu.c
index 7296a5a280e..18c1b932e2c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1251,7 +1251,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 			    unsigned long prot,
 			    unsigned long flags,
 			    unsigned long pgoff,
-			    bool *populate)
+			    unsigned long *populate)
 {
 	struct vm_area_struct *vma;
 	struct vm_region *region;
@@ -1261,7 +1261,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 
 	kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
 
-	*populate = false;
+	*populate = 0;
 
 	/* decide whether we should attempt the mapping, and if so what sort of
 	 * mapping */

diff --git a/mm/util.c b/mm/util.c
index 13467e043e9..3704bf1bef9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -355,7 +355,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 {
 	unsigned long ret;
 	struct mm_struct *mm = current->mm;
-	bool populate;
+	unsigned long populate;
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
@@ -363,8 +363,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
 				    &populate);
 		up_write(&mm->mmap_sem);
-		if (!IS_ERR_VALUE(ret) && populate)
-			mm_populate(ret, len);
+		if (populate)
+			mm_populate(ret, populate);
 	}
 	return ret;
 }
--
cgit v1.2.3-70-g09d2
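A small illustrative sketch of the rounding issue this commit sidesteps
(assuming 4 KiB pages; the PAGE_ALIGN macro below is a simplified stand-in for
the kernel's). do_mmap_pgoff() maps whole pages, so returning the rounded
length through *populate keeps mm_populate() in sync without duplicating the
rounding logic:

```c
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long len = 100;	/* caller's unrounded request */

	/* The bool version made callers pass their own (unrounded)
	 * len to mm_populate(); the size version hands back the
	 * page-rounded extent of the vma that was actually created. */
	printf("requested %lu bytes, vma covers %lu bytes\n",
	       len, PAGE_ALIGN(len));
	return 0;
}
```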
From 9800339b5e0f0e24ab3dac349e0de80d2018832e Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 22 Feb 2013 16:34:35 -0800
Subject: mm: don't inline page_mapping()

According to akpm, this saves 1/2k text and makes things simple for the
next patch.

Numbers from Minchan:

add/remove: 1/0 grow/shrink: 6/22 up/down: 92/-516 (-424)
function                        old     new   delta
page_mapping                      -      48     +48
do_task_stat                   2292    2308     +16
page_remove_rmap                240     248      +8
load_elf_binary                4500    4508      +8
update_queue                    532     536      +4
scsi_probe_and_add_lun         2892    2896      +4
lookup_fast                     644     648      +4
vcs_read                       1040    1036      -4
__ip_route_output_key          1904    1900      -4
ip_route_input_noref           2508    2500      -8
shmem_file_aio_read             784     772     -12
__isolate_lru_page              272     256     -16
shmem_replace_page              708     688     -20
mark_buffer_dirty               228     208     -20
__set_page_dirty_buffers        240     220     -20
__remove_mapping                276     256     -20
update_mmu_cache                500     476     -24
set_page_dirty_balance           92      68     -24
set_page_dirty                  172     148     -24
page_evictable                   88      64     -24
page_cache_pipe_buf_steal       248     224     -24
clear_page_dirty_for_io         340     316     -24
test_set_page_writeback         400     372     -28
test_clear_page_writeback       516     488     -28
invalidate_inode_page           156     128     -28
page_mkclean                    432     400     -32
flush_dcache_page               360     328     -32
__set_page_dirty_nobuffers      324     280     -44
shrink_page_list               2412    2356     -56

Signed-off-by: Shaohua Li
Suggested-by: Andrew Morton
Cc: Hugh Dickins
Acked-by: Rik van Riel
Cc: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 13 +------------
 mm/util.c          | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 12 deletions(-)
(limited to 'mm/util.c')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 473abbda942..1d4122bf6f2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -809,18 +809,7 @@ void page_address_init(void);
 #define PAGE_MAPPING_KSM	2
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
 
-extern struct address_space swapper_space;
-static inline struct address_space *page_mapping(struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-
-	VM_BUG_ON(PageSlab(page));
-	if (unlikely(PageSwapCache(page)))
-		mapping = &swapper_space;
-	else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
-		mapping = NULL;
-	return mapping;
-}
+extern struct address_space *page_mapping(struct page *page);
 
 /* Neutral page->mapping pointer to address_space or anon_vma or other */
 static inline void *page_rmapping(struct page *page)

diff --git a/mm/util.c b/mm/util.c
index 3704bf1bef9..16a73195a37 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,6 +5,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/security.h>
+#include <linux/swap.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -382,6 +383,21 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_mmap);
 
+struct address_space *page_mapping(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	VM_BUG_ON(PageSlab(page));
+#ifdef CONFIG_SWAP
+	if (unlikely(PageSwapCache(page)))
+		mapping = &swapper_space;
+	else
+#endif
+	if ((unsigned long)mapping & PAGE_MAPPING_ANON)
+		mapping = NULL;
+	return mapping;
+}
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
--
cgit v1.2.3-70-g09d2
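As background for the checks inside page_mapping(), a minimal userspace model
of the page->mapping pointer encoding (the flag values come from the header
shown above, where PAGE_MAPPING_ANON is implicitly 1; everything else here is
an illustrative stand-in, not kernel code). Anon pages store an anon_vma
pointer with the low bit set, so page_mapping() must return NULL for them:

```c
#include <assert.h>
#include <stdint.h>

#define PAGE_MAPPING_ANON	1UL	/* per include/linux/mm.h */

static void *decode_mapping(uintptr_t mapping)
{
	/* Low bit set means "this is an anon_vma, not an
	 * address_space", so no file mapping exists. */
	if (mapping & PAGE_MAPPING_ANON)
		return NULL;
	return (void *)mapping;
}

int main(void)
{
	static int fake_address_space;	/* stand-in, suitably aligned */
	uintptr_t file_backed = (uintptr_t)&fake_address_space;
	uintptr_t anon = file_backed | PAGE_MAPPING_ANON;

	assert(decode_mapping(file_backed) == &fake_address_space);
	assert(decode_mapping(anon) == NULL);
	return 0;
}
```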
From 33806f06da654092182410d974b6d3c5396ea3eb Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 22 Feb 2013 16:34:37 -0800
Subject: swap: make each swap partition have one address_space

When I use several fast SSDs for swap, swapper_space.tree_lock is
heavily contended.  This makes each swap partition have one
address_space to reduce the lock contention.  There is an array of
address_spaces for swap; the swap entry type is the index into the
array.

In my test with 3 SSDs, this increases the swapout throughput 20%.

[akpm@linux-foundation.org: revert unneeded change to __add_to_swap_cache]
Signed-off-by: Shaohua Li
Cc: Hugh Dickins
Acked-by: Rik van Riel
Acked-by: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/meminfo.c    |  4 ++--
 include/linux/swap.h |  9 +++++----
 mm/memcontrol.c      |  4 ++--
 mm/mincore.c         |  5 +++--
 mm/swap.c            |  9 +++++++--
 mm/swap_state.c      | 55 ++++++++++++++++++++++++++++++++++----------------
 mm/swapfile.c        |  5 +++--
 mm/util.c            | 10 +++++++---
 8 files changed, 67 insertions(+), 34 deletions(-)
(limited to 'mm/util.c')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c3dac611c3c..1efaaa19c4f 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		* sysctl_overcommit_ratio / 100) + total_swap_pages;
 
 	cached = global_page_state(NR_FILE_PAGES) -
-			total_swapcache_pages - i.bufferram;
+			total_swapcache_pages() - i.bufferram;
 	if (cached < 0)
 		cached = 0;
 
@@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeram),
 		K(i.bufferram),
 		K(cached),
-		K(total_swapcache_pages),
+		K(total_swapcache_pages()),
 		K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
 		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
 		K(pages[LRU_ACTIVE_ANON]),

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8c66486a8ca..235c039892e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -8,7 +8,7 @@
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
 #include <linux/node.h>
-
+#include <linux/fs.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 
@@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
 				sector_t *);
 
 /* linux/mm/swap_state.c */
-extern struct address_space swapper_space;
-#define total_swapcache_pages swapper_space.nrpages
+extern struct address_space swapper_spaces[];
+#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
+extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
@@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 #define nr_swap_pages				0L
 #define total_swap_pages			0L
-#define total_swapcache_pages			0UL
+#define total_swapcache_pages()			0UL
 
 #define si_swapinfo(val) \
 	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c878b1c6951..f85861531f2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6307,7 +6307,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 	 * Because lookup_swap_cache() updates some statistics counter,
 	 * we call find_get_page() with swapper_space directly.
 	 */
-	page = find_get_page(&swapper_space, ent.val);
+	page = find_get_page(swap_address_space(ent), ent.val);
 	if (do_swap_account)
 		entry->val = ent.val;
 
@@ -6348,7 +6348,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 		swp_entry_t swap = radix_to_swp_entry(page);
 		if (do_swap_account)
 			*entry = swap;
-		page = find_get_page(&swapper_space, swap.val);
+		page = find_get_page(swap_address_space(swap), swap.val);
 	}
 #endif
 	return page;

diff --git a/mm/mincore.c b/mm/mincore.c
index 936b4cee8cb..da2be56a7b8 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 	/* shmem/tmpfs may return swap: account for swapcache page too. */
 	if (radix_tree_exceptional_entry(page)) {
 		swp_entry_t swap = radix_to_swp_entry(page);
-		page = find_get_page(&swapper_space, swap.val);
+		page = find_get_page(swap_address_space(swap), swap.val);
 	}
 #endif
 	if (page) {
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		} else {
 #ifdef CONFIG_SWAP
 			pgoff = entry.val;
-			*vec = mincore_page(&swapper_space, pgoff);
+			*vec = mincore_page(swap_address_space(entry),
+				pgoff);
#else
 			WARN_ON(1);
 			*vec = 1;

diff --git a/mm/swap.c b/mm/swap.c
index 6310dc2008f..8a529a01e8f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
 void __init swap_setup(void)
 {
 	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
-
 #ifdef CONFIG_SWAP
-	bdi_init(swapper_space.backing_dev_info);
+	int i;
+
+	bdi_init(swapper_spaces[0].backing_dev_info);
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		spin_lock_init(&swapper_spaces[i].tree_lock);
+		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
+	}
 #endif
 
 	/* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0cb36fb1f61..8d6644c5d0c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 };
 
-struct address_space swapper_space = {
-	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-	.tree_lock	= __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
-	.a_ops		= &swap_aops,
-	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
-	.backing_dev_info = &swap_backing_dev_info,
+struct address_space swapper_spaces[MAX_SWAPFILES] = {
+	[0 ... MAX_SWAPFILES - 1] = {
+		.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+		.a_ops		= &swap_aops,
+		.backing_dev_info = &swap_backing_dev_info,
+	}
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
@@ -53,9 +53,19 @@ static struct {
 	unsigned long find_total;
 } swap_cache_info;
 
+unsigned long total_swapcache_pages(void)
+{
+	int i;
+	unsigned long ret = 0;
+
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		ret += swapper_spaces[i].nrpages;
+	return ret;
+}
+
 void show_swap_cache_info(void)
 {
-	printk("%lu pages in swap cache\n", total_swapcache_pages);
+	printk("%lu pages in swap cache\n", total_swapcache_pages());
 	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
 		swap_cache_info.add_total, swap_cache_info.del_total,
 		swap_cache_info.find_success, swap_cache_info.find_total);
@@ -70,6 +80,7 @@ void show_swap_cache_info(void)
 static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
 	int error;
+	struct address_space *address_space;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageSwapCache(page));
@@ -79,14 +90,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 	SetPageSwapCache(page);
 	set_page_private(page, entry.val);
 
-	spin_lock_irq(&swapper_space.tree_lock);
-	error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+	address_space = swap_address_space(entry);
+	spin_lock_irq(&address_space->tree_lock);
+	error = radix_tree_insert(&address_space->page_tree,
+					entry.val, page);
 	if (likely(!error)) {
-		total_swapcache_pages++;
+		address_space->nrpages++;
 		__inc_zone_page_state(page, NR_FILE_PAGES);
 		INC_CACHE_INFO(add_total);
 	}
-	spin_unlock_irq(&swapper_space.tree_lock);
+	spin_unlock_irq(&address_space->tree_lock);
 
 	if (unlikely(error)) {
 		/*
@@ -122,14 +135,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
  */
 void __delete_from_swap_cache(struct page *page)
 {
+	swp_entry_t entry;
+	struct address_space *address_space;
+
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapCache(page));
 	VM_BUG_ON(PageWriteback(page));
 
-	radix_tree_delete(&swapper_space.page_tree, page_private(page));
+	entry.val = page_private(page);
+	address_space = swap_address_space(entry);
+	radix_tree_delete(&address_space->page_tree, page_private(page));
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
-	total_swapcache_pages--;
+	address_space->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	INC_CACHE_INFO(del_total);
 }
@@ -195,12 +213,14 @@ int add_to_swap(struct page *page)
 void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
+	struct address_space *address_space;
 
 	entry.val = page_private(page);
 
-	spin_lock_irq(&swapper_space.tree_lock);
+	address_space = swap_address_space(entry);
+	spin_lock_irq(&address_space->tree_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock_irq(&swapper_space.tree_lock);
+	spin_unlock_irq(&address_space->tree_lock);
 
 	swapcache_free(entry, page);
 	page_cache_release(page);
@@ -263,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 {
 	struct page *page;
 
-	page = find_get_page(&swapper_space, entry.val);
+	page = find_get_page(swap_address_space(entry), entry.val);
 
 	if (page)
 		INC_CACHE_INFO(find_success);
@@ -290,7 +310,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * called after lookup_swap_cache() failed, re-calling
 		 * that would confuse statistics.
		 */
-		found_page = find_get_page(&swapper_space, entry.val);
+		found_page = find_get_page(swap_address_space(entry),
+					entry.val);
 		if (found_page)
 			break;

diff --git a/mm/swapfile.c b/mm/swapfile.c
index e97a0e5aea9..e51864e6fe8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -79,7 +79,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 	struct page *page;
 	int ret = 0;
 
-	page = find_get_page(&swapper_space, entry.val);
+	page = find_get_page(swap_address_space(entry), entry.val);
 	if (!page)
 		return 0;
 	/*
@@ -699,7 +699,8 @@ int free_swap_and_cache(swp_entry_t entry)
 	p = swap_info_get(entry);
 	if (p) {
 		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
-			page = find_get_page(&swapper_space, entry.val);
+			page = find_get_page(swap_address_space(entry),
+						entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
 				page = NULL;

diff --git a/mm/util.c b/mm/util.c
index 16a73195a37..ab1424dbe2e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/security.h>
 #include <linux/swap.h>
+#include <linux/swapops.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -389,9 +390,12 @@ struct address_space *page_mapping(struct page *page)
 
 	VM_BUG_ON(PageSlab(page));
 #ifdef CONFIG_SWAP
-	if (unlikely(PageSwapCache(page)))
-		mapping = &swapper_space;
-	else
+	if (unlikely(PageSwapCache(page))) {
+		swp_entry_t entry;
+
+		entry.val = page_private(page);
+		mapping = swap_address_space(entry);
+	} else
 #endif
 	if ((unsigned long)mapping & PAGE_MAPPING_ANON)
 		mapping = NULL;
 	return mapping;
--
cgit v1.2.3-70-g09d2
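To close, a minimal userspace model of the swap_address_space() lookup this
series introduces (everything below is a simplified sketch: the MAX_SWAPFILES
value, the struct, and the bit layout of swp_entry_t are stand-ins, not the
kernel's definitions). The point is that the swap entry's type field selects
one address_space per swap partition, so lookups against different partitions
take different tree_locks:

```c
#include <assert.h>
#include <stdio.h>

#define MAX_SWAPFILES	32	/* simplified; the kernel derives this */
#define SWP_TYPE_SHIFT	58	/* simplified stand-in for the real layout */

struct address_space { unsigned long nrpages; };
static struct address_space swapper_spaces[MAX_SWAPFILES];

/* entry.val packs (type, offset); swp_type() extracts the type,
 * which indexes swapper_spaces[]. */
typedef struct { unsigned long val; } swp_entry_t;

static unsigned swp_type(swp_entry_t entry)
{
	return entry.val >> SWP_TYPE_SHIFT;
}

static struct address_space *swap_address_space(swp_entry_t entry)
{
	return &swapper_spaces[swp_type(entry)];
}

int main(void)
{
	swp_entry_t a = { .val = (0UL << SWP_TYPE_SHIFT) | 123 };
	swp_entry_t b = { .val = (1UL << SWP_TYPE_SHIFT) | 123 };

	/* Same offset, different partitions -> different address_spaces,
	 * hence no shared tree_lock contention between them. */
	assert(swap_address_space(a) != swap_address_space(b));
	printf("entries route to spaces %td and %td\n",
	       swap_address_space(a) - swapper_spaces,
	       swap_address_space(b) - swapper_spaces);
	return 0;
}
```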