From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- mm/backing-dev.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d3022..c91e139a652 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,17 +14,11 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); -- cgit v1.2.3-70-g09d2 From 95f28604a65b1c40b6c6cd95e58439cd7ded3add Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Mar 2011 11:13:12 +0100 Subject: fs: assign sb->s_bdi to default_backing_dev_info if the bdi is going away We don't have proper reference counting for this yet, so we run into cases where the device is pulled and we OOPS on flushing the fs data. This happens even though the dirty inodes have already been migrated to the default_backing_dev_info. Reported-by: Torsten Hilbrich Tested-by: Torsten Hilbrich Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/super.c | 2 ++ fs/sync.c | 4 ++-- mm/backing-dev.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/super.c b/fs/super.c index 7e9dd4cc2c0..0d89e93f654 100644 --- a/fs/super.c +++ b/fs/super.c @@ -71,6 +71,7 @@ static struct super_block *alloc_super(struct file_system_type *type) #else INIT_LIST_HEAD(&s->s_files); #endif + s->s_bdi = &default_backing_dev_info; INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -1003,6 +1004,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void } BUG_ON(!mnt->mnt_sb); WARN_ON(!mnt->mnt_sb->s_bdi); + WARN_ON(mnt->mnt_sb->s_bdi == &default_backing_dev_info); mnt->mnt_sb->s_flags |= MS_BORN; error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); diff --git a/fs/sync.c b/fs/sync.c index ba76b9623e7..412dc89163d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -33,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) * This should be safe, as we require bdi backing to actually * write out data in the first place */ - if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) + if (sb->s_bdi == &noop_backing_dev_info) return 0; if (sb->s_qcop && sb->s_qcop->quota_sync) @@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem); static void sync_one_sb(struct super_block *sb, void *arg) { - if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) + if (!(sb->s_flags & MS_RDONLY)) __sync_filesystem(sb, *(int *)arg); } /* diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c91e139a652..8fe9d340792 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -598,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_bdi == bdi) - sb->s_bdi = NULL; + sb->s_bdi = &default_backing_dev_info; } spin_unlock(&sb_lock); } -- cgit v1.2.3-70-g09d2 From a66979abad090b2765a6c6790c9fdeab996833f2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:41 +1100 Subject: fs: move i_wb_list out from under inode_lock Protect the inode writeback list with a new global lock inode_wb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/block_dev.c | 4 +-- fs/fs-writeback.c | 76 +++++++++++++++++++++++++++-------------------- fs/inode.c | 12 +++++--- fs/internal.h | 5 ++++ include/linux/writeback.h | 1 + mm/backing-dev.c | 8 ++--- mm/filemap.c | 8 ++--- mm/rmap.c | 4 +-- 8 files changed, 70 insertions(+), 48 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index bc39b18cf3d..2bbc0e62102 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -55,13 +55,13 @@ EXPORT_SYMBOL(I_BDEV); static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } static sector_t max_block(struct block_device *bdev) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5de56a2182b..ed800656356 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -175,6 +175,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) spin_unlock_bh(&bdi->wb_lock); } +/* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + spin_lock(&inode_wb_list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&inode_wb_list_lock); +} + + /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { /* - * Prevent speculative execution through spin_unlock(&inode_lock); + * Prevent speculative execution through + * spin_unlock(&inode_wb_list_lock); */ + smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } @@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { + assert_spin_locked(&inode_wb_list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -308,25 +324,23 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_lock. Either the - * caller has ref on the inode (either via __iget or via syscall against an fd) - * or the inode has I_WILL_FREE set (via generic_forget_inode) + * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either + * the caller has an active reference on the inode or the inode has I_WILL_FREE + * set. * * If `wait' is set, wait on the writeout. * * The whole writeout design is quite complex and fragile. We want to avoid * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. - * - * Called under inode_lock. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -368,7 +382,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); ret = do_writepages(mapping, wbc); @@ -388,12 +402,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * due to delalloc, clear dirty metadata flags right before * write_inode() */ - spin_lock(&inode_lock); spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -401,7 +413,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { @@ -543,10 +555,10 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, */ redirty_tail(inode); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); iput(inode); cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -565,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -583,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ } @@ -592,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } /* @@ -735,7 +747,7 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); @@ -743,7 +755,7 @@ static long wb_writeback(struct bdi_writeback *wb, inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } return wrote; @@ -1009,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; - bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -1033,7 +1044,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); - spin_lock(&inode_lock); spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1059,12 +1069,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (inode->i_state & I_FREEING) goto out_unlock_inode; - spin_unlock(&inode->i_lock); /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + bool wakeup_bdi = false; bdi = inode_to_bdi(inode); if (bdi_cap_writeback_dirty(bdi)) { @@ -1081,18 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = true; } + spin_unlock(&inode->i_lock); + spin_lock(&inode_wb_list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode_wb_list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; } - goto out; } out_unlock_inode: spin_unlock(&inode->i_lock); -out: - spin_unlock(&inode_lock); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -1296,9 +1308,9 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); ret = writeback_single_inode(inode, &wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1320,9 +1332,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); ret = writeback_single_inode(inode, wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/inode.c b/fs/inode.c index 785b1ab23ff..239fdc08719 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include "internal.h" /* * inode locking rules. @@ -36,6 +37,8 @@ * inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list + * inode_wb_list_lock protects: + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * * Lock ordering: * inode_lock @@ -44,6 +47,9 @@ * inode_sb_list_lock * inode->i_lock * inode_lru_lock + * + * inode_wb_list_lock + * inode->i_lock */ /* @@ -105,6 +111,7 @@ static struct hlist_head *inode_hashtable __read_mostly; DEFINE_SPINLOCK(inode_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * iprune_sem provides exclusion between the icache shrinking and the @@ -483,10 +490,7 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - spin_lock(&inode_lock); - list_del_init(&inode->i_wb_list); - spin_unlock(&inode_lock); - + inode_wb_list_del(inode); inode_sb_list_del(inode); if (op->evict_inode) { diff --git a/fs/internal.h b/fs/internal.h index 7013ae0c88c..b29c46e4e32 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -127,6 +127,11 @@ extern long do_handle_open(int mountdirfd, */ extern spinlock_t inode_sb_list_lock; +/* + * fs-writeback.c + */ +extern void inode_wb_list_del(struct inode *inode); + extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0ead399e08b..3f5fee71832 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -10,6 +10,7 @@ struct backing_dev_info; extern spinlock_t inode_lock; +extern spinlock_t inode_wb_list_lock; /* * fs/fs-writeback.c diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d3022..4b3e9f17ee2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -73,14 +73,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_wb = nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -682,11 +682,11 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } bdi_unregister(bdi); diff --git a/mm/filemap.c b/mm/filemap.c index 499e9aa9145..d8b34d1a107 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -80,8 +80,8 @@ * ->i_mutex * ->i_alloc_sem (various) * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) + * inode_wb_list_lock + * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * * ->i_mmap_lock @@ -98,9 +98,9 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) + * inode_wb_list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) + * inode_wb_list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * diff --git a/mm/rmap.c b/mm/rmap.c index 7dada045644..8da044a1db0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -31,12 +31,12 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * inode_lock (in set_page_dirty's __mark_inode_dirty) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_lock in __sync_single_inode) + * within inode_wb_list_lock in __sync_single_inode) * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock -- cgit v1.2.3-70-g09d2