From f758eeabeb96f878c860e8f110f94ec8820822a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2011 18:19:44 -0600 Subject: writeback: split inode_wb_list_lock into bdi_writeback.list_lock Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, as it's currently the most contended lock in the system for metadata heavy workloads. It won't help for single-filesystem workloads for which we'll need the I/O-less balance_dirty_pages, but at least we can dedicate a cpu to spinning on each bdi now for larger systems. Based on earlier patches from Nick Piggin and Dave Chinner. It reduces lock contentions to 1/4 in this test case: 10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram lock_stat version 0.3 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- vanilla 2.6.39-rc3: inode_wb_list_lock: 42590 44433 0.12 147.74 144127.35 252274 886792 0.08 121.34 917211.23 ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 34 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 12893 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 10702 [] writeback_single_inode+0x16d/0x20a ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 19 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 5550 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 8511 [] writeback_sb_inodes+0x10f/0x157 2.6.39-rc3 + patch: &(&wb->list_lock)->rlock: 11383 11657 0.14 151.69 40429.51 90825 527918 0.11 145.90 556843.37 ------------------------ &(&wb->list_lock)->rlock 10 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 1493 [] writeback_inodes_wb+0x3d/0x150 &(&wb->list_lock)->rlock 3652 [] writeback_sb_inodes+0x123/0x16f &(&wb->list_lock)->rlock 1412 [] writeback_single_inode+0x17f/0x223 ------------------------ &(&wb->list_lock)->rlock 3 [] bdi_lock_two+0x46/0x4b &(&wb->list_lock)->rlock 6 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 2061 [] __mark_inode_dirty+0x173/0x1cf &(&wb->list_lock)->rlock 2629 [] writeback_sb_inodes+0x123/0x16f hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment Signed-off-by: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Wu Fengguang --- mm/filemap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index d7b10578a64..1e492c3dd6f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -81,7 +81,7 @@ * ->i_mutex * ->i_alloc_sem (various) * - * inode_wb_list_lock + * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * @@ -99,9 +99,9 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * inode_wb_list_lock (page_remove_rmap->set_page_dirty) + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * inode_wb_list_lock (zap_pte_range->set_page_dirty) + * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * -- cgit v1.2.3-70-g09d2 From 5e5358e7cf48aa079b8761a7d806ad536023745c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:23 -0700 Subject: mm: cleanup descriptions of filler arg The often-NULL data arg to read_cache_page() and read_mapping_page() functions is misdescribed as "destination for read data": no, it's the first arg to the filler function, often struct file * to ->readpage(). Satisfy checkpatch.pl on those filler prototypes, and tidy up the declarations in linux/pagemap.h. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 +++++------- mm/filemap.c | 12 ++++++------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8e38d4c140f..cfaaa6949b8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -255,26 +255,24 @@ static inline struct page *grab_cache_page(struct address_space *mapping, extern struct page * grab_cache_page_nowait(struct address_space *mapping, pgoff_t index); extern struct page * read_cache_page_async(struct address_space *mapping, - pgoff_t index, filler_t *filler, - void *data); + pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page(struct address_space *mapping, - pgoff_t index, filler_t *filler, - void *data); + pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); static inline struct page *read_mapping_page_async( - struct address_space *mapping, - pgoff_t index, void *data) + struct address_space *mapping, + pgoff_t index, void *data) { filler_t *filler = (filler_t *)mapping->a_ops->readpage; return read_cache_page_async(mapping, index, filler, data); } static inline struct page *read_mapping_page(struct address_space *mapping, - pgoff_t index, void *data) + pgoff_t index, void *data) { filler_t *filler = (filler_t *)mapping->a_ops->readpage; return read_cache_page(mapping, index, filler, data); diff --git a/mm/filemap.c b/mm/filemap.c index f820e600f1a..2780be4bd49 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1792,7 +1792,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) { @@ -1823,7 +1823,7 @@ repeat: static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) @@ -1863,7 +1863,7 @@ out: * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. @@ -1875,7 +1875,7 @@ out: */ struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -1923,7 +1923,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Read into the page cache. If a page already exists, and PageUptodate() is * not set, try to fill the page then wait for it to become unlocked. @@ -1932,7 +1932,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -- cgit v1.2.3-70-g09d2 From b85e0effd3dcbf9118b896232f59526ab1a39a74 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:25 -0700 Subject: mm: consistent truncate and invalidate loops Make the pagevec_lookup loops in truncate_inode_pages_range(), invalidate_mapping_pages() and invalidate_inode_pages2_range() more consistent with each other. They were relying upon page->index of an unlocked page, but apologizing for it: accept it, embrace it, add comments and WARN_ONs, and simplify the index handling. invalidate_inode_pages2_range() had special handling for a wrapped page->index + 1 = 0 case; but MAX_LFS_FILESIZE doesn't let us anywhere near there, and a corrupt page->index in the radix_tree could cause more trouble than that would catch. Remove that wrapped handling. invalidate_inode_pages2_range() uses min() to limit the pagevec_lookup when near the end of the range: copy that into the other two, although it's less useful than you might think (it limits the use of the buffer, rather than the indices looked up). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 ++ mm/truncate.c | 110 +++++++++++++++++++++++++--------------------------------- 2 files changed, 49 insertions(+), 63 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2780be4bd49..10a17111327 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -128,6 +128,7 @@ void __delete_from_page_cache(struct page *page) radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) @@ -483,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); diff --git a/mm/truncate.c b/mm/truncate.c index c924764e2ce..dc459014f77 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page) * The first pass will remove most pages, so the search cost of the second pass * is low. * - * When looking at page->index outside the page lock we need to be careful to - * copy it into a local to avoid races (it could change at any time). - * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. @@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; - pgoff_t next; + pgoff_t index; + pgoff_t end; int i; cleancache_flush_inode(mapping); @@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, end = (lend >> PAGE_CACHE_SHIFT); pagevec_init(&pvec, 0); - next = start; - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - if (page_index > end) { - next = page_index; + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; - } - if (page_index > next) - next = page_index; - next++; if (!trylock_page(page)) continue; + WARN_ON(page->index != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } if (partial) { @@ -264,13 +259,14 @@ void truncate_inode_pages_range(struct address_space *mapping, } } - next = start; + index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - if (next == start) + if (!pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + if (index == start) break; - next = start; + index = start; continue; } if (pvec.pages[0]->index > end) { @@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - if (page->index > end) + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; + lock_page(page); + WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); - if (page->index > next) - next = page->index; - next++; unlock_page(page); } pagevec_release(&pvec); mem_cgroup_uncharge_end(); + index++; } cleancache_flush_inode(mapping); } @@ -333,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next = start; + pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; pagevec_init(&pvec, 0); - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t index; - int lock_failed; - - lock_failed = !trylock_page(page); - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ + /* We rely upon deletion not changing page->index */ index = page->index; - if (index > next) - next = index; - next++; - if (lock_failed) - continue; + if (index > end) + break; + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); ret = invalidate_inode_page(page); unlock_page(page); /* @@ -371,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!ret) deactivate_page(page); count += ret; - if (next > end) - break; } pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } return count; } @@ -442,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next; + pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; - int wrapped = 0; cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); - next = start; - while (next <= end && !wrapped && - pagevec_lookup(&pvec, mapping, next, - min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index; + + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) + break; lock_page(page); + WARN_ON(page->index != index); if (page->mapping != mapping) { unlock_page(page); continue; } - page_index = page->index; - next = page_index + 1; - if (next == 0) - wrapped = 1; - if (page_index > end) { - unlock_page(page); - break; - } wait_on_page_writeback(page); if (page_mapped(page)) { if (!did_range_unmap) { @@ -480,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, - (loff_t)page_index<