From 4f5a99d64c17470a784a6c68064207d82e3e74a5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:40:25 -0800 Subject: fs: remove WB_SYNC_HOLD Remove WB_SYNC_HOLD. The primary motiviation is the design of my anti-starvation code for fsync. It requires taking an inode lock over the sync operation, so we could run into lock ordering problems with multiple inodes. It is possible to take a single global lock to solve the ordering problem, but then that would prevent a future nice implementation of "sync multiple inodes" based on lock order via inode address. Seems like a backward step to remove this, but actually it is busted anyway: we can't use the inode lists for data integrity wait: an inode can be taken off the dirty lists but still be under writeback. In order to satisfy data integrity semantics, we should wait for it to finish writeback, but if we only search the dirty lists, we'll miss it. It would be possible to have a "writeback" list, for sys_sync, I suppose. But why complicate things by prematurely optimise? For unmounting, we could avoid the "livelock avoidance" code, which would be easier, but again premature IMO. Fixing the existing data integrity problem will come next. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d0ff0b8cf30..d99601af9e4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * If we're a pdlfush thread, then implement pdflush collision avoidance * against the entire list. * - * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so - * that it can be located for waiting on in __writeback_single_inode(). - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. For other superblocks, @@ -499,10 +496,6 @@ void generic_sync_sb_inodes(struct super_block *sb, __iget(inode); pages_skipped = wbc->pages_skipped; __writeback_single_inode(inode, wbc); - if (wbc->sync_mode == WB_SYNC_HOLD) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } if (current_is_pdflush()) writeback_release(bdi); if (wbc->pages_skipped != pages_skipped) { @@ -588,8 +581,7 @@ restart: /* * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is - * used to park the written inodes on sb->s_dirty for the wait pass. + * do this in two passes - one to write, and one to wait. * * A finite limit is set on the number of pages which will be written. * To prevent infinite livelock of sys_sync(). @@ -600,7 +592,7 @@ restart: void sync_inodes_sb(struct super_block *sb, int wait) { struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; -- cgit v1.2.3-70-g09d2 From 38f21977663126fef53f5585e7f1653d8ebe55c4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:40:25 -0800 Subject: fs: sync_sb_inodes fix Fix data integrity semantics required by sys_sync, by iterating over all inodes and waiting for any writeback pages after the initial writeout. Comments explain the exact problem. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 7 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d99601af9e4..a9ee474f969 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -440,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ + int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&sb->s_io)) @@ -516,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb, if (!list_empty(&sb->s_more_io)) wbc->more_io = 1; } - spin_unlock(&inode_lock); + + if (sync) { + struct inode *inode, *old_inode = NULL; + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + struct address_space *mapping; + + if (inode->i_state & (I_FREEING|I_WILL_FREE)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); + } else + spin_unlock(&inode_lock); + return; /* Leave any unwritten inodes on s_io */ } EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); @@ -596,13 +639,16 @@ void sync_inodes_sb(struct super_block *sb, int wait) .range_start = 0, .range_end = LLONG_MAX, }; - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - wbc.nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused) + - nr_dirty + nr_unstable; - wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ + if (!wait) { + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + + wbc.nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + } else + wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ + sync_sb_inodes(sb, &wbc); } -- cgit v1.2.3-70-g09d2 From 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:40:26 -0800 Subject: fs: sys_sync fix s_syncing livelock avoidance was breaking data integrity guarantee of sys_sync, by allowing sys_sync to skip writing or waiting for superblocks if there is a concurrent sys_sync happening. This livelock avoidance is much less important now that we don't have the get_super_to_sync() call after every sb that we sync. This was replaced by __put_super_and_need_restart. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 20 +------------------- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 20 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a9ee474f969..e5eaa62fd17 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -652,18 +652,6 @@ void sync_inodes_sb(struct super_block *sb, int wait) sync_sb_inodes(sb, &wbc); } -/* - * Rather lame livelock avoidance. - */ -static void set_sb_syncing(int val) -{ - struct super_block *sb; - spin_lock(&sb_lock); - list_for_each_entry_reverse(sb, &super_blocks, s_list) - sb->s_syncing = val; - spin_unlock(&sb_lock); -} - /** * sync_inodes - writes all inodes to disk * @wait: wait for completion @@ -690,9 +678,6 @@ static void __sync_inodes(int wait) spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_syncing) - continue; - sb->s_syncing = 1; sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); @@ -710,13 +695,10 @@ restart: void sync_inodes(int wait) { - set_sb_syncing(0); __sync_inodes(0); - if (wait) { - set_sb_syncing(0); + if (wait) __sync_inodes(1); - } } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index fb59673c60b..d7eba77f666 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1133,7 +1133,6 @@ struct super_block { struct rw_semaphore s_umount; struct mutex s_lock; int s_count; - int s_syncing; int s_need_sync_fs; atomic_t s_active; #ifdef CONFIG_SECURITY -- cgit v1.2.3-70-g09d2