From 9db5579be4bb5320c3248f6acf807aedf05ae143 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 8 Feb 2008 04:19:49 -0800 Subject: rewrite rd This is a rewrite of the ramdisk block device driver. The old one is really difficult because it effectively implements a block device which serves data out of its own buffer cache. It relies on the dirty bit being set, to pin its backing store in cache, however there are non trivial paths which can clear the dirty bit (eg. try_to_free_buffers()), which had recently lead to data corruption. And in general it is completely wrong for a block device driver to do this. The new one is more like a regular block device driver. It has no idea about vm/vfs stuff. It's backing store is similar to the buffer cache (a simple radix-tree of pages), but it doesn't know anything about page cache (the pages in the radix tree are not pagecache pages). There is one slight downside -- direct block device access and filesystem metadata access goes through an extra copy and gets stored in RAM twice. However, this downside is only slight, because the real buffercache of the device is now reclaimable (because we're not playing crazy games with it), so under memory intensive situations, footprint should effectively be the same -- maybe even a slight advantage to the new driver because it can also reclaim buffer heads. The fact that it now goes through all the regular vm/fs paths makes it much more useful for testing, too. text data bss dec hex filename 2837 849 384 4070 fe6 drivers/block/rd.o 3528 371 12 3911 f47 drivers/block/brd.o Text is larger, but data and bss are smaller, making total size smaller. A few other nice things about it: - Similar structure and layout to the new loop device handlinag. - Dynamic ramdisk creation. - Runtime flexible buffer head size (because it is no longer part of the ramdisk code). - Boot / load time flexible ramdisk size, which could easily be extended to a per-ramdisk runtime changeable size (eg. with an ioctl). - Can use highmem for the backing store. [akpm@linux-foundation.org: fix build] [byron.bbradley@gmail.com: make rd_size non-static] Signed-off-by: Nick Piggin Signed-off-by: Byron Bradley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 826baf4f04b..11b002e01d6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1436,6 +1436,7 @@ void invalidate_bh_lrus(void) { on_each_cpu(invalidate_bh_lru, NULL, 1, 1); } +EXPORT_SYMBOL_GPL(invalidate_bh_lrus); void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset) -- cgit v1.2.3-70-g09d2 From fc9b52cd8f5f459b88adcf67c47668425ae31a78 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 8 Feb 2008 04:19:52 -0800 Subject: fs: remove fastcall, it is always empty [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 17 ++++++++--------- fs/buffer.c | 6 +++--- fs/fcntl.c | 2 +- fs/file_table.c | 8 ++++---- fs/namei.c | 16 ++++++++-------- fs/open.c | 4 ++-- 6 files changed, 26 insertions(+), 27 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/aio.c b/fs/aio.c index 8a37dbbf343..8a48ab0c278 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -317,7 +317,7 @@ out: /* wait_on_sync_kiocb: * Waits on the given sync kiocb to complete. */ -ssize_t fastcall wait_on_sync_kiocb(struct kiocb *iocb) +ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { while (iocb->ki_users) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -336,7 +336,7 @@ ssize_t fastcall wait_on_sync_kiocb(struct kiocb *iocb) * go away, they will call put_ioctx and release any pinned memory * associated with the request (held via struct page * references). */ -void fastcall exit_aio(struct mm_struct *mm) +void exit_aio(struct mm_struct *mm) { struct kioctx *ctx = mm->ioctx_list; mm->ioctx_list = NULL; @@ -365,7 +365,7 @@ void fastcall exit_aio(struct mm_struct *mm) * Called when the last user of an aio context has gone away, * and the struct needs to be freed. */ -void fastcall __put_ioctx(struct kioctx *ctx) +void __put_ioctx(struct kioctx *ctx) { unsigned nr_events = ctx->max_reqs; @@ -397,8 +397,7 @@ void fastcall __put_ioctx(struct kioctx *ctx) * This prevents races between the aio code path referencing the * req (after submitting it) and aio_complete() freeing the req. */ -static struct kiocb *__aio_get_req(struct kioctx *ctx); -static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx) +static struct kiocb *__aio_get_req(struct kioctx *ctx) { struct kiocb *req = NULL; struct aio_ring *ring; @@ -533,7 +532,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) * Returns true if this put was the last user of the kiocb, * false if the request is still in use. */ -int fastcall aio_put_req(struct kiocb *req) +int aio_put_req(struct kiocb *req) { struct kioctx *ctx = req->ki_ctx; int ret; @@ -893,7 +892,7 @@ static void try_queue_kicked_iocb(struct kiocb *iocb) * The retry is usually executed by aio workqueue * threads (See aio_kick_handler). */ -void fastcall kick_iocb(struct kiocb *iocb) +void kick_iocb(struct kiocb *iocb) { /* sync iocbs are easy: they can only ever be executing from a * single context. */ @@ -912,7 +911,7 @@ EXPORT_SYMBOL(kick_iocb); * Returns true if this is the last user of the request. The * only other user of the request can be the cancellation code. */ -int fastcall aio_complete(struct kiocb *iocb, long res, long res2) +int aio_complete(struct kiocb *iocb, long res, long res2) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring_info *info; @@ -1523,7 +1522,7 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode, return 1; } -int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, +int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb) { struct kiocb *req; diff --git a/fs/buffer.c b/fs/buffer.c index 11b002e01d6..6f0bddddcf4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -67,14 +67,14 @@ static int sync_buffer(void *word) return 0; } -void fastcall __lock_buffer(struct buffer_head *bh) +void __lock_buffer(struct buffer_head *bh) { wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_buffer); -void fastcall unlock_buffer(struct buffer_head *bh) +void unlock_buffer(struct buffer_head *bh) { smp_mb__before_clear_bit(); clear_buffer_locked(bh); @@ -1164,7 +1164,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, * mapping->tree_lock and the global inode_lock. */ -void fastcall mark_buffer_dirty(struct buffer_head *bh) +void mark_buffer_dirty(struct buffer_head *bh) { WARN_ON_ONCE(!buffer_uptodate(bh)); if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) diff --git a/fs/fcntl.c b/fs/fcntl.c index 7efe59ed1ed..e632da761fc 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -24,7 +24,7 @@ #include #include -void fastcall set_close_on_exec(unsigned int fd, int flag) +void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; struct fdtable *fdt; diff --git a/fs/file_table.c b/fs/file_table.c index 664e3f2309b..6d27befe2d4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -197,7 +197,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, } EXPORT_SYMBOL(init_file); -void fastcall fput(struct file *file) +void fput(struct file *file) { if (atomic_dec_and_test(&file->f_count)) __fput(file); @@ -208,7 +208,7 @@ EXPORT_SYMBOL(fput); /* __fput is called from task context when aio completion releases the last * last use of a struct file *. Do not use otherwise. */ -void fastcall __fput(struct file *file) +void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; @@ -241,7 +241,7 @@ void fastcall __fput(struct file *file) mntput(mnt); } -struct file fastcall *fget(unsigned int fd) +struct file *fget(unsigned int fd) { struct file *file; struct files_struct *files = current->files; @@ -269,7 +269,7 @@ EXPORT_SYMBOL(fget); * and a flag is returned to be passed to the corresponding fput_light(). * There must not be a cloning between an fget_light/fput_light pair. */ -struct file fastcall *fget_light(unsigned int fd, int *fput_needed) +struct file *fget_light(unsigned int fd, int *fput_needed) { struct file *file; struct files_struct *files = current->files; diff --git a/fs/namei.c b/fs/namei.c index 241cff42365..52703986323 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -106,7 +106,7 @@ * any extra contention... */ -static int fastcall link_path_walk(const char *name, struct nameidata *nd); +static int link_path_walk(const char *name, struct nameidata *nd); /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the @@ -823,7 +823,7 @@ fail: * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ -static fastcall int __link_path_walk(const char * name, struct nameidata *nd) +static int __link_path_walk(const char *name, struct nameidata *nd) { struct path next; struct inode *inode; @@ -1015,7 +1015,7 @@ return_err: * Retry the whole path once, forcing real lookup requests * instead of relying on the dcache. */ -static int fastcall link_path_walk(const char *name, struct nameidata *nd) +static int link_path_walk(const char *name, struct nameidata *nd) { struct nameidata save = *nd; int result; @@ -1039,7 +1039,7 @@ static int fastcall link_path_walk(const char *name, struct nameidata *nd) return result; } -static int fastcall path_walk(const char * name, struct nameidata *nd) +static int path_walk(const char *name, struct nameidata *nd) { current->total_link_count = 0; return link_path_walk(name, nd); @@ -1116,7 +1116,7 @@ set_it: } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int fastcall do_path_lookup(int dfd, const char *name, +static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; @@ -1183,7 +1183,7 @@ fput_fail: goto out_fail; } -int fastcall path_lookup(const char *name, unsigned int flags, +int path_lookup(const char *name, unsigned int flags, struct nameidata *nd) { return do_path_lookup(AT_FDCWD, name, flags, nd); @@ -1409,7 +1409,7 @@ struct dentry *lookup_one_noperm(const char *name, struct dentry *base) return __lookup_hash(&this, base, NULL); } -int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags, +int __user_walk_fd(int dfd, const char __user *name, unsigned flags, struct nameidata *nd) { char *tmp = getname(name); @@ -1422,7 +1422,7 @@ int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags, return err; } -int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) +int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) { return __user_walk_fd(AT_FDCWD, name, flags, nd); } diff --git a/fs/open.c b/fs/open.c index 4932b4d1da0..4b389dfbd5c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -991,7 +991,7 @@ static void __put_unused_fd(struct files_struct *files, unsigned int fd) files->next_fd = fd; } -void fastcall put_unused_fd(unsigned int fd) +void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); @@ -1014,7 +1014,7 @@ EXPORT_SYMBOL(put_unused_fd); * will follow. */ -void fastcall fd_install(unsigned int fd, struct file * file) +void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; -- cgit v1.2.3-70-g09d2 From 535ee2fbf79ab52d26bce3d2e127c9007503581e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 8 Feb 2008 04:21:59 -0800 Subject: buffer_head: fix private_list handling There are two possible races in handling of private_list in buffer cache. 1) When fsync_buffers_list() processes a private_list, it clears b_assoc_mapping and moves buffer to its private list. Now drop_buffers() comes, sees a buffer is on list so it calls __remove_assoc_queue() which complains about b_assoc_mapping being cleared (as it cannot propagate possible IO error). This race has been actually observed in the wild. 2) When fsync_buffers_list() processes a private_list, mark_buffer_dirty_inode() can be called on bh which is already on the private list of fsync_buffers_list(). As buffer is on some list (note that the check is performed without private_lock), it is not readded to the mapping's private_list and after fsync_buffers_list() finishes, we have a dirty buffer which should be on private_list but it isn't. This race has not been reported, probably because most (but not all) callers of mark_buffer_dirty_inode() hold i_mutex and thus are serialized with fsync(). Fix these issues by not clearing b_assoc_map when fsync_buffers_list() moves buffer to a dedicated list and by reinserting buffer in private_list when it is found dirty after we have submitted buffer for IO. We also change the tests whether a buffer is on a private list from !list_empty(&bh->b_assoc_buffers) to bh->b_assoc_map so that they are single word reads and hence lockless checks are safe. Signed-off-by: Jan Kara Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 6f0bddddcf4..3ebccf4aa7e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -678,7 +678,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) } else { BUG_ON(mapping->assoc_mapping != buffer_mapping); } - if (list_empty(&bh->b_assoc_buffers)) { + if (!bh->b_assoc_map) { spin_lock(&buffer_mapping->private_lock); list_move_tail(&bh->b_assoc_buffers, &mapping->private_list); @@ -794,6 +794,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; struct list_head tmp; + struct address_space *mapping; int err = 0, err2; INIT_LIST_HEAD(&tmp); @@ -801,9 +802,14 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) spin_lock(lock); while (!list_empty(list)) { bh = BH_ENTRY(list->next); + mapping = bh->b_assoc_map; __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); if (buffer_dirty(bh) || buffer_locked(bh)) { list_add(&bh->b_assoc_buffers, &tmp); + bh->b_assoc_map = mapping; if (buffer_dirty(bh)) { get_bh(bh); spin_unlock(lock); @@ -822,8 +828,17 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) while (!list_empty(&tmp)) { bh = BH_ENTRY(tmp.prev); - list_del_init(&bh->b_assoc_buffers); get_bh(bh); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh)) { + list_add(&bh->b_assoc_buffers, + &bh->b_assoc_map->private_list); + bh->b_assoc_map = mapping; + } spin_unlock(lock); wait_on_buffer(bh); if (!buffer_uptodate(bh)) @@ -1195,7 +1210,7 @@ void __brelse(struct buffer_head * buf) void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); - if (!list_empty(&bh->b_assoc_buffers)) { + if (bh->b_assoc_map) { struct address_space *buffer_mapping = bh->b_page->mapping; spin_lock(&buffer_mapping->private_lock); @@ -3022,7 +3037,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) do { struct buffer_head *next = bh->b_this_page; - if (!list_empty(&bh->b_assoc_buffers)) + if (bh->b_assoc_map) __remove_assoc_queue(bh); bh = next; } while (bh != head); -- cgit v1.2.3-70-g09d2