From 18ce3751ccd488c78d3827e9f6bf54e6322676fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Jul 2008 09:07:34 +0200 Subject: Properly notify block layer of sync writes fsync_buffers_list() and sync_dirty_buffer() both issue async writes and then immediately wait on them. Conceptually, that makes them sync writes and we should treat them as such so that the IO schedulers can handle them appropriately. This patch fixes a write starvation issue that Lin Ming reported, where xx is stuck for more than 2 minutes because of a large number of synchronous IO in the system: INFO: task kjournald:20558 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kjournald D ffff810010820978 6712 20558 2 ffff81022ddb1d10 0000000000000046 ffff81022e7baa10 ffffffff803ba6f2 ffff81022ecd0000 ffff8101e6dc9160 ffff81022ecd0348 000000008048b6cb 0000000000000086 ffff81022c4e8d30 0000000000000000 ffffffff80247537 Call Trace: [] kobject_get+0x12/0x17 [] getnstimeofday+0x2f/0x83 [] sync_buffer+0x0/0x3f [] io_schedule+0x5d/0x9f [] sync_buffer+0x3b/0x3f [] __wait_on_bit+0x40/0x6f [] sync_buffer+0x0/0x3f [] out_of_line_wait_on_bit+0x6c/0x78 [] wake_bit_function+0x0/0x23 [] sync_dirty_buffer+0x98/0xcb [] journal_commit_transaction+0x97d/0xcb6 [] lock_timer_base+0x26/0x4b [] kjournald+0xc1/0x1fb [] autoremove_wake_function+0x0/0x2e [] kjournald+0x0/0x1fb [] kthread+0x47/0x74 [] schedule_tail+0x28/0x5d [] child_rip+0xa/0x12 [] kthread+0x0/0x74 [] child_rip+0x0/0x12 Lin Ming confirms that this patch fixes the issue. I've run tests with it for the past week and no ill effects have been observed, so I'm proposing it for inclusion into 2.6.26. Signed-off-by: Jens Axboe --- fs/buffer.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f01..0f51c0f7c26 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * contents - it is a noop if I/O is still in * flight on potentially older contents. */ - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block(SWRITE_SYNC, 1, &bh); brelse(bh); spin_lock(lock); } @@ -2940,16 +2940,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE) + if (rw == SWRITE || rw == SWRITE_SYNC) lock_buffer(bh); else if (test_set_buffer_locked(bh)) continue; - if (rw == WRITE || rw == SWRITE) { + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE, bh); + if (rw == SWRITE_SYNC) + submit_bh(WRITE_SYNC, bh); + else + submit_bh(WRITE, bh); continue; } } else { @@ -2978,7 +2981,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); -- cgit v1.2.3-70-g09d2 From c7d206b3379f7d6462e778b74f475c470ee3dcaf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: Move mark_inode_dirty() from under page lock in generic_write_end() There's no need to call mark_inode_dirty() under page lock in generic_write_end(). It unnecessarily makes hold time of page lock longer and more importantly it forces locking order of page lock and transaction start for journaling filesystems. Signed-off-by: Jan Kara Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 0f51c0f7c26..f4b033237a0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2061,6 +2061,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2073,12 +2074,21 @@ int generic_write_end(struct file *file, struct address_space *mapping, */ if (pos+copied > inode->i_size) { i_size_write(inode, pos+copied); - mark_inode_dirty(inode); + i_size_changed = 1; } unlock_page(page); page_cache_release(page); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + return copied; } EXPORT_SYMBOL(generic_write_end); -- cgit v1.2.3-70-g09d2 From 29a814d2ee0e43c2980f33f91c1311ec06c0aa35 Mon Sep 17 00:00:00 2001 From: Alex Tomas Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: add hooks for ext4's delayed allocation support Export mpage_bio_submit() and __mpage_writepage() for the benefit of ext4's delayed allocation support. Also change __block_write_full_page so that if buffers that have the BH_Delay flag set it will call get_block() to get the physical block allocated, just as in the !BH_Mapped case. Signed-off-by: Alex Tomas Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 7 +++++-- fs/mpage.c | 14 +++++--------- include/linux/mpage.h | 10 ++++++++++ 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index f4b033237a0..5fa1512cd9a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; + clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); @@ -1774,7 +1776,8 @@ recover: bh = head; /* Recovery: lock and submit the mapped buffers */ do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { diff --git a/fs/mpage.c b/fs/mpage.c index 235e4d3873a..dbcc7af76a1 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -static struct bio *mpage_bio_submit(int rw, struct bio *bio) +struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } +EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs. */ -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data) +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; @@ -651,6 +646,7 @@ out: mpd->bio = bio; return ret; } +EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 068a0c9946a..5c42821da2d 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -11,11 +11,21 @@ */ #ifdef CONFIG_BLOCK +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + struct writeback_control; +struct bio *mpage_bio_submit(int rw, struct bio *bio); int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, -- cgit v1.2.3-70-g09d2