From a564da3964db3256069190c2ae95069143ac37fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Mar 2006 00:08:47 -0800 Subject: [PATCH] readahead: ->prev_page can overrun the ahead window If get_next_ra_size() does not grow fast enough, ->prev_page can overrun the ahead window. This means the caller will read the pages from ->ahead_start + ->ahead_size to ->prev_page synchronously. Signed-off-by: Oleg Nesterov Cc: Steven Pratt Cc: Ram Pai Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'mm/readahead.c') diff --git a/mm/readahead.c b/mm/readahead.c index 8d6eeaaa629..57557e29498 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; } +static inline void reset_ahead_window(struct file_ra_state *ra) +{ + /* + * ... but preserve ahead_start + ahead_size value, + * see 'recheck:' label in page_cache_readahead(). + * Note: We never use ->ahead_size as rvalue without + * checking ->ahead_start != 0 first. + */ + ra->ahead_size += ra->ahead_start; + ra->ahead_start = 0; +} + static inline void ra_off(struct file_ra_state *ra) { ra->start = 0; ra->flags = 0; ra->size = 0; - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); return; } @@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, * congestion. The ahead window will any way be closed * in case we failed due to excessive page cache hits. */ - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); } return ret; @@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * If we get here we are doing sequential IO and this was not the first * occurence (ie we have an existing window) */ - if (ra->ahead_start == 0) { /* no ahead window yet */ if (!make_ahead_window(mapping, filp, ra, 0)) - goto out; + goto recheck; } + /* * Already have an ahead window, check if we crossed into it. * If so, shift windows and issue a new ahead window. @@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, ra->start = ra->ahead_start; ra->size = ra->ahead_size; make_ahead_window(mapping, filp, ra, 0); +recheck: + /* prev_page shouldn't overrun the ahead window */ + ra->prev_page = min(ra->prev_page, + ra->ahead_start + ra->ahead_size - 1); } out: -- cgit v1.2.3-70-g09d2 From aed75ff3caafce404d9be7f0c088716375be5279 Mon Sep 17 00:00:00 2001 From: Steven Pratt Date: Wed, 22 Mar 2006 00:08:48 -0800 Subject: [PATCH] readahead: fix initial window size calculation The current current get_init_ra_size is not optimal across different IO sizes and max_readahead values. Here is a quick summary of sizes computed under current design and under the attached patch. All of these assume 1st IO at offset 0, or 1st detected sequential IO. 32k max, 4k request old new ----------------- 8k 8k 16k 16k 32k 32k 128k max, 4k request old new ----------------- 32k 16k 64k 32k 128k 64k 128k 128k 128k max, 32k request old new ----------------- 32k 64k <----- 64k 128k 128k 128k 512k max, 4k request old new ----------------- 4k 32k <---- 16k 64k 64k 128k 128k 256k 512k 512k Cc: Oleg Nesterov Cc: Steven Pratt Cc: Ram Pai Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/readahead.c') diff --git a/mm/readahead.c b/mm/readahead.c index 57557e29498..301b36c4a0c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -83,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); - if (newsize <= max / 64) - newsize = newsize * newsize; + if (newsize <= max / 32) + newsize = newsize * 4; else if (newsize <= max / 4) - newsize = max / 4; + newsize = newsize * 2; else newsize = max; return newsize; -- cgit v1.2.3-70-g09d2 From d8733c2956968a01394a4d2a9e97a8b431a78776 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Mar 2006 03:00:11 -0800 Subject: [PATCH] ext3_readdir: use generic readahead Linus points out that ext3_readdir's readahead only cuts in when ext3_readdir() is operating at the very start of the directory. So for large directories we end up performing no readahead at all and we suck. So take it all out and use the core VM's page_cache_readahead(). This means that ext3 directory reads will use all of readahead's dynamic sizing goop. Note that we're using the directory's filp->f_ra to hold the readahead state, but readahead is actually being performed against the underlying blockdev's address_space. Fortunately the readahead code is all set up to handle this. Tested with printk. It works. I was struggling to find a real workload which actually cared. (The patch also exports page_cache_readahead() to GPL modules) Cc: "Stephen C. Tweedie" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/dir.c | 52 +++++++++++++++++++++++-------------------------- fs/ext3/inode.c | 2 +- include/linux/ext3_fs.h | 9 ++++++--- mm/readahead.c | 1 + 4 files changed, 32 insertions(+), 32 deletions(-) (limited to 'mm/readahead.c') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 832867aef3d..773459164bb 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -95,11 +95,10 @@ static int ext3_readdir(struct file * filp, void * dirent, filldir_t filldir) { int error = 0; - unsigned long offset, blk; - int i, num, stored; - struct buffer_head * bh, * tmp, * bha[16]; - struct ext3_dir_entry_2 * de; - struct super_block * sb; + unsigned long offset; + int i, stored; + struct ext3_dir_entry_2 *de; + struct super_block *sb; int err; struct inode *inode = filp->f_dentry->d_inode; int ret = 0; @@ -124,12 +123,29 @@ static int ext3_readdir(struct file * filp, } #endif stored = 0; - bh = NULL; offset = filp->f_pos & (sb->s_blocksize - 1); while (!error && !stored && filp->f_pos < inode->i_size) { - blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb); - bh = ext3_bread(NULL, inode, blk, 0, &err); + unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); + struct buffer_head map_bh; + struct buffer_head *bh = NULL; + + map_bh.b_state = 0; + err = ext3_get_block_handle(NULL, inode, blk, &map_bh, 0, 0); + if (!err) { + page_cache_readahead(sb->s_bdev->bd_inode->i_mapping, + &filp->f_ra, + filp, + map_bh.b_blocknr >> + (PAGE_CACHE_SHIFT - inode->i_blkbits), + 1); + bh = ext3_bread(NULL, inode, blk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ if (!bh) { ext3_error (sb, "ext3_readdir", "directory #%lu contains a hole at offset %lu", @@ -138,26 +154,6 @@ static int ext3_readdir(struct file * filp, continue; } - /* - * Do the readahead - */ - if (!offset) { - for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0; - i > 0; i--) { - tmp = ext3_getblk (NULL, inode, ++blk, 0, &err); - if (tmp && !buffer_uptodate(tmp) && - !buffer_locked(tmp)) - bha[num++] = tmp; - else - brelse (tmp); - } - if (num) { - ll_rw_block (READA, num, bha); - for (i = 0; i < num; i++) - brelse (bha[i]); - } - } - revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 0384e539b88..d59d5a667b0 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -671,7 +671,7 @@ err_out: * The BKL may not be held on entry here. Be sure to take it early. */ -static int +int ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create, int extend_disksize) { diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index c0272d73ab2..e7239f2f97a 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -772,9 +772,12 @@ extern unsigned long ext3_count_free (struct buffer_head *, unsigned); /* inode.c */ -extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); -extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); -extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); +struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +int ext3_get_block_handle(handle_t *handle, struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create, + int extend_disksize); extern void ext3_read_inode (struct inode *); extern int ext3_write_inode (struct inode *, int); diff --git a/mm/readahead.c b/mm/readahead.c index 301b36c4a0c..0f142a40984 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -555,6 +555,7 @@ recheck: out: return ra->prev_page + 1; } +EXPORT_SYMBOL_GPL(page_cache_readahead); /* * handle_ra_miss() is called when it is known that a page which should have -- cgit v1.2.3-70-g09d2