diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 09:17:05 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 09:17:05 -0700 |
commit | a12e4d304ce701844c639541d90df86e165d03f9 (patch) | |
tree | 6ad7314b63a3303d9aa36f1c7eeb68abf64d3592 /mm | |
parent | 89af571ca633ada14d17746519a179553a732d31 (diff) | |
parent | 500b067c5e6ceea49cf280a02597b1169320e08c (diff) |
Merge branch 'writeback' of git://git.kernel.dk/linux-2.6-block
* 'writeback' of git://git.kernel.dk/linux-2.6-block:
writeback: check for registered bdi in flusher add and inode dirty
writeback: add name to backing_dev_info
writeback: add some debug inode list counters to bdi stats
writeback: get rid of pdflush completely
writeback: switch to per-bdi threads for flushing data
writeback: move dirty inodes from super_block to backing_dev_info
writeback: get rid of generic_sync_sb_inodes() export
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 381 | ||||
-rw-r--r-- | mm/page-writeback.c | 182 | ||||
-rw-r--r-- | mm/pdflush.c | 269 | ||||
-rw-r--r-- | mm/swap_state.c | 1 | ||||
-rw-r--r-- | mm/vmscan.c | 2 |
6 files changed, 405 insertions, 432 deletions
diff --git a/mm/Makefile b/mm/Makefile index 5e0bd642669..147a7a7873c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ vmalloc.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o pdflush.o \ + maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c86edd24429..d3ca0dac111 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,8 +1,11 @@ #include <linux/wait.h> #include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> @@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { + .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, @@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = { EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; +DEFINE_SPINLOCK(bdi_lock); +LIST_HEAD(bdi_list); +LIST_HEAD(bdi_pending_list); + +static struct task_struct *sync_supers_tsk; +static struct timer_list sync_supers_timer; + +static int bdi_sync_supers(void *); +static void sync_supers_timer_fn(unsigned long); +static void arm_supers_timer(void); + +static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> @@ -37,9 +53,29 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; + struct inode *inode; + + /* + * inode lock is enough here, the bdi->wb_list is protected by + * RCU on the reader side + */ + nr_wb = nr_dirty = nr_io = nr_more_io = 0; + spin_lock(&inode_lock); + list_for_each_entry(wb, &bdi->wb_list, list) { + nr_wb++; + list_for_each_entry(inode, &wb->b_dirty, i_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_list) + nr_more_io++; + } + spin_unlock(&inode_lock); get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiReclaimable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n", + "BackgroundThresh: %8lu kB\n" + "WriteBack threads:%8lu\n" + "b_dirty: %8lu\n" + "b_io: %8lu\n" + "b_more_io: %8lu\n" + "bdi_list: %8u\n" + "state: %8lx\n" + "wb_mask: %8lx\n" + "wb_list: %8u\n" + "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), - K(dirty_thresh), - K(background_thresh)); + K(bdi_thresh), K(dirty_thresh), + K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, + !list_empty(&bdi->wb_list), bdi->wb_cnt); #undef K return 0; @@ -185,6 +231,13 @@ static int __init default_bdi_init(void) { int err; + sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); + BUG_ON(IS_ERR(sync_supers_tsk)); + + init_timer(&sync_supers_timer); + setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); + arm_supers_timer(); + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -193,6 +246,248 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); +} + +static void bdi_task_init(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct task_struct *tsk = current; + + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&wb->list, &bdi->wb_list); + spin_unlock(&bdi->wb_lock); + + tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(tsk, 0); +} + +static int bdi_start_fn(void *ptr) +{ + struct bdi_writeback *wb = ptr; + struct backing_dev_info *bdi = wb->bdi; + int ret; + + /* + * Add us to the active bdi_list + */ + spin_lock(&bdi_lock); + list_add(&bdi->bdi_list, &bdi_list); + spin_unlock(&bdi_lock); + + bdi_task_init(bdi, wb); + + /* + * Clear pending bit and wakeup anybody waiting to tear us down + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + + ret = bdi_writeback_task(wb); + + /* + * Remove us from the list + */ + spin_lock(&bdi->wb_lock); + list_del_rcu(&wb->list); + spin_unlock(&bdi->wb_lock); + + /* + * Flush any work that raced with us exiting. No new work + * will be added, since this bdi isn't discoverable anymore. + */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); + + wb->task = NULL; + return ret; +} + +int bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + return wb_has_dirty_io(&bdi->wb); +} + +static void bdi_flush_io(struct backing_dev_info *bdi) +{ + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + .nr_to_write = 1024, + }; + + writeback_inodes_wbc(&wbc); +} + +/* + * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * or we risk deadlocking on ->s_umount. The longer term solution would be + * to implement sync_supers_bdi() or similar and simply do it from the + * bdi writeback tasks individually. + */ +static int bdi_sync_supers(void *unused) +{ + set_user_nice(current, 0); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* + * Do this periodically, like kupdated() did before. + */ + sync_supers(); + } + + return 0; +} + +static void arm_supers_timer(void) +{ + unsigned long next; + + next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; + mod_timer(&sync_supers_timer, round_jiffies_up(next)); +} + +static void sync_supers_timer_fn(unsigned long unused) +{ + wake_up_process(sync_supers_tsk); + arm_supers_timer(); +} + +static int bdi_forker_task(void *ptr) +{ + struct bdi_writeback *me = ptr; + + bdi_task_init(me->bdi, me); + + for (;;) { + struct backing_dev_info *bdi, *tmp; + struct bdi_writeback *wb; + + /* + * Temporary measure, we want to make sure we don't see + * dirty data on the default backing_dev_info + */ + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + wb_do_writeback(me, 0); + + spin_lock(&bdi_lock); + + /* + * Check if any existing bdi's have dirty data without + * a thread registered. If so, set that up. + */ + list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { + if (bdi->wb.task) + continue; + if (list_empty(&bdi->work_list) && + !bdi_has_dirty_io(bdi)) + continue; + + bdi_add_default_flusher_task(bdi); + } + + set_current_state(TASK_INTERRUPTIBLE); + + if (list_empty(&bdi_pending_list)) { + unsigned long wait; + + spin_unlock(&bdi_lock); + wait = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout(wait); + try_to_freeze(); + continue; + } + + __set_current_state(TASK_RUNNING); + + /* + * This is our real job - check for pending entries in + * bdi_pending_list, and create the tasks that got added + */ + bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, + bdi_list); + list_del_init(&bdi->bdi_list); + spin_unlock(&bdi_lock); + + wb = &bdi->wb; + wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", + dev_name(bdi->dev)); + /* + * If task creation fails, then readd the bdi to + * the pending list and force writeout of the bdi + * from this forker thread. That will free some memory + * and we can try again. + */ + if (IS_ERR(wb->task)) { + wb->task = NULL; + + /* + * Add this 'bdi' to the back, so we get + * a chance to flush other bdi's to free + * memory. + */ + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_pending_list); + spin_unlock(&bdi_lock); + + bdi_flush_io(bdi); + } + } + + return 0; +} + +/* + * Add the default flusher task that gets created for any bdi + * that has dirty data pending writeout + */ +void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) +{ + if (!bdi_cap_writeback_dirty(bdi)) + return; + + if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { + printk(KERN_ERR "bdi %p/%s is not registered!\n", + bdi, bdi->name); + return; + } + + /* + * Check with the helper whether to proceed adding a task. Will only + * abort if we two or more simultanous calls to + * bdi_add_default_flusher_task() occured, further additions will block + * waiting for previous additions to finish. + */ + if (!test_and_set_bit(BDI_pending, &bdi->state)) { + list_move_tail(&bdi->bdi_list, &bdi_pending_list); + + /* + * We are now on the pending list, wake up bdi_forker_task() + * to finish the job and add us back to the active bdi_list + */ + wake_up_process(default_backing_dev_info.wb.task); + } +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { @@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, goto exit; } + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_list); + spin_unlock(&bdi_lock); + bdi->dev = dev; - bdi_debug_register(bdi, dev_name(dev)); + /* + * Just start the forker thread for our default backing_dev_info, + * and add other bdi's to the list. They will get a thread created + * on-demand when they need it. + */ + if (bdi_cap_flush_forker(bdi)) { + struct bdi_writeback *wb = &bdi->wb; + + wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + dev_name(dev)); + if (IS_ERR(wb->task)) { + wb->task = NULL; + ret = -ENOMEM; + + spin_lock(&bdi_lock); + list_del(&bdi->bdi_list); + spin_unlock(&bdi_lock); + goto exit; + } + } + + bdi_debug_register(bdi, dev_name(dev)); + set_bit(BDI_registered, &bdi->state); exit: return ret; } @@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void bdi_wb_shutdown(struct backing_dev_info *bdi) +{ + struct bdi_writeback *wb; + + if (!bdi_cap_writeback_dirty(bdi)) + return; + + /* + * If setup is pending, wait for that to complete first + */ + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); + + /* + * Make sure nobody finds us on the bdi_list anymore + */ + spin_lock(&bdi_lock); + list_del(&bdi->bdi_list); + spin_unlock(&bdi_lock); + + /* + * Finally, kill the kernel threads. We don't need to be RCU + * safe anymore, since the bdi is gone from visibility. + */ + list_for_each_entry(wb, &bdi->wb_list, list) + kthread_stop(wb->task); +} + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + if (!bdi_cap_flush_forker(bdi)) + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; @@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister); int bdi_init(struct backing_dev_info *bdi) { - int i; - int err; + int i, err; bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; + spin_lock_init(&bdi->wb_lock); + INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->wb_list); + INIT_LIST_HEAD(&bdi->work_list); + + bdi_wb_init(&bdi->wb, bdi); + + /* + * Just one thread support for now, hard code mask and count + */ + bdi->wb_mask = 1; + bdi->wb_cnt = 1; for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); @@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; + WARN_ON(bdi_has_dirty_io(bdi)); + bdi_unregister(bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 81627ebcd31..25e7770309b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,15 +36,6 @@ #include <linux/pagevec.h> /* - * The maximum number of pages to writeout in a single bdflush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 - -/* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ @@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -static void background_writeout(unsigned long _min_pages); - /* * Scale the writeback cache size proportional to the relative writeout speeds. * @@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) /* * */ -static DEFINE_SPINLOCK(bdi_lock); static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; - unsigned long flags; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { @@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) ret = -EINVAL; } } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { - unsigned long flags; int ret = 0; if (max_ratio > 100) return -EINVAL; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock(&bdi_lock); return ret; } @@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping) * up. */ if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes(&wbc); + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - congestion_wait(BLK_RW_ASYNC, HZ/10); + schedule_timeout(1); } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && (global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS) - > background_thresh))) - pdflush_operation(background_writeout, 0); + (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) + > background_thresh))) { + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .nr_to_write = nr_writeback, + }; + + + bdi_start_writeback(&wbc); + } } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -681,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask) } } -/* - * writeback at least _min_pages, and keep writing until the amount of dirty - * memory is less than the background threshold, or until we're all clean. - */ -static void background_writeout(unsigned long _min_pages) -{ - long min_pages = _min_pages; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 0, - .nonblocking = 1, - .range_cyclic = 1, - }; - - for ( ; ; ) { - unsigned long background_thresh; - unsigned long dirty_thresh; - - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) < background_thresh - && min_pages <= 0) - break; - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - wbc.pages_skipped = 0; - writeback_inodes(&wbc); - min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(BLK_RW_ASYNC, HZ/10); - else - break; - } - } -} - -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. Returns 0 if a pdflush thread was dispatched. Returns - * -1 if all pdflush threads were busy. - */ -int wakeup_pdflush(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - return pdflush_operation(background_writeout, nr_pages); -} - -static void wb_timer_fn(unsigned long unused); static void laptop_timer_fn(unsigned long unused); -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* - * Periodic writeback of "old" data. - * - * Define "old": the first time one of an inode's pages is dirtied, we mark the - * dirtying-time in the inode's address_space. So this periodic writeback code - * just walks the superblock inode list, writing back any inodes which are - * older than a specific point in time. - * - * Try to run once per dirty_writeback_interval. But if a writeback event - * takes longer than a dirty_writeback_interval interval, then leave a - * one-second gap. - * - * older_than_this takes precedence over nr_to_write. So we'll only write back - * all dirty pages if they are all attached to "old" mappings. - */ -static void wb_kupdate(unsigned long arg) -{ - unsigned long oldest_jif; - unsigned long start_jif; - unsigned long next_jif; - long nr_to_write; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = &oldest_jif, - .nr_to_write = 0, - .nonblocking = 1, - .for_kupdate = 1, - .range_cyclic = 1, - }; - - sync_supers(); - - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - start_jif = jiffies; - next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); - nr_to_write = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - while (nr_to_write > 0) { - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - writeback_inodes(&wbc); - if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(BLK_RW_ASYNC, HZ/10); - else - break; /* All the old data is written */ - } - nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - } - if (time_before(next_jif, jiffies + HZ)) - next_jif = jiffies + HZ; - if (dirty_writeback_interval) - mod_timer(&wb_timer, next_jif); -} - -/* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + - msecs_to_jiffies(dirty_writeback_interval * 10)); - else - del_timer(&wb_timer); return 0; } -static void wb_timer_fn(unsigned long unused) -{ - if (pdflush_operation(wb_kupdate, 0) < 0) - mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ -} - -static void laptop_flush(unsigned long unused) +static void do_laptop_sync(struct work_struct *work) { - sys_sync(); + wakeup_flusher_threads(0); + kfree(work); } static void laptop_timer_fn(unsigned long unused) { - pdflush_operation(laptop_flush, 0); + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_laptop_sync); + schedule_work(work); + } } /* @@ -910,8 +786,6 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, - jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); diff --git a/mm/pdflush.c b/mm/pdflush.c deleted file mode 100644 index 235ac440c44..00000000000 --- a/mm/pdflush.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * mm/pdflush.c - worker threads for writing back filesystem data - * - * Copyright (C) 2002, Linus Torvalds. - * - * 09Apr2002 Andrew Morton - * Initial version - * 29Feb2004 kaos@sgi.com - * Move worker thread creation to kthread to avoid chewing - * up stack space with nested calls to kernel_thread. - */ - -#include <linux/sched.h> -#include <linux/list.h> -#include <linux/signal.h> -#include <linux/spinlock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/fs.h> /* Needed by writeback.h */ -#include <linux/writeback.h> /* Prototypes pdflush_operation() */ -#include <linux/kthread.h> -#include <linux/cpuset.h> -#include <linux/freezer.h> - - -/* - * Minimum and maximum number of pdflush instances - */ -#define MIN_PDFLUSH_THREADS 2 -#define MAX_PDFLUSH_THREADS 8 - -static void start_one_pdflush_thread(void); - - -/* - * The pdflush threads are worker threads for writing back dirty data. - * Ideally, we'd like one thread per active disk spindle. But the disk - * topology is very hard to divine at this level. Instead, we take - * care in various places to prevent more than one pdflush thread from - * performing writeback against a single filesystem. pdflush threads - * have the PF_FLUSHER flag set in current->flags to aid in this. - */ - -/* - * All the pdflush threads. Protected by pdflush_lock - */ -static LIST_HEAD(pdflush_list); -static DEFINE_SPINLOCK(pdflush_lock); - -/* - * The count of currently-running pdflush threads. Protected - * by pdflush_lock. - * - * Readable by sysctl, but not writable. Published to userspace at - * /proc/sys/vm/nr_pdflush_threads. - */ -int nr_pdflush_threads = 0; - -/* - * The time at which the pdflush thread pool last went empty - */ -static unsigned long last_empty_jifs; - -/* - * The pdflush thread. - * - * Thread pool management algorithm: - * - * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. - * - * - If there have been no idle pdflush instances for 1 second, create - * a new one. - * - * - If the least-recently-went-to-sleep pdflush thread has been asleep - * for more than one second, terminate a thread. - */ - -/* - * A structure for passing work to a pdflush thread. Also for passing - * state information between pdflush threads. Protected by pdflush_lock. - */ -struct pdflush_work { - struct task_struct *who; /* The thread */ - void (*fn)(unsigned long); /* A callback function */ - unsigned long arg0; /* An argument to the callback */ - struct list_head list; /* On pdflush_list, when idle */ - unsigned long when_i_went_to_sleep; -}; - -static int __pdflush(struct pdflush_work *my_work) -{ - current->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - my_work->fn = NULL; - my_work->who = current; - INIT_LIST_HEAD(&my_work->list); - - spin_lock_irq(&pdflush_lock); - for ( ; ; ) { - struct pdflush_work *pdf; - - set_current_state(TASK_INTERRUPTIBLE); - list_move(&my_work->list, &pdflush_list); - my_work->when_i_went_to_sleep = jiffies; - spin_unlock_irq(&pdflush_lock); - schedule(); - try_to_freeze(); - spin_lock_irq(&pdflush_lock); - if (!list_empty(&my_work->list)) { - /* - * Someone woke us up, but without removing our control - * structure from the global list. swsusp will do this - * in try_to_freeze()->refrigerator(). Handle it. - */ - my_work->fn = NULL; - continue; - } - if (my_work->fn == NULL) { - printk("pdflush: bogus wakeup\n"); - continue; - } - spin_unlock_irq(&pdflush_lock); - - (*my_work->fn)(my_work->arg0); - - spin_lock_irq(&pdflush_lock); - - /* - * Thread creation: For how long have there been zero - * available threads? - * - * To throttle creation, we reset last_empty_jifs. - */ - if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list)) { - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); - } - } - } - - my_work->fn = NULL; - - /* - * Thread destruction: For how long has the sleepiest - * thread slept? - */ - if (list_empty(&pdflush_list)) - continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) - continue; - pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { - /* Limit exit rate */ - pdf->when_i_went_to_sleep = jiffies; - break; /* exeunt */ - } - } - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - return 0; -} - -/* - * Of course, my_work wants to be just a local in __pdflush(). It is - * separated out in this manner to hopefully prevent the compiler from - * performing unfortunate optimisations against the auto variables. Because - * these are visible to other tasks and CPUs. (No problem has actually - * been observed. This is just paranoia). - */ -static int pdflush(void *dummy) -{ - struct pdflush_work my_work; - cpumask_var_t cpus_allowed; - - /* - * Since the caller doesn't even check kthread_run() worked, let's not - * freak out too much if this fails. - */ - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); - return 0; - } - - /* - * pdflush can spend a lot of time doing encryption via dm-crypt. We - * don't want to do that at keventd's priority. - */ - set_user_nice(current, 0); - - /* - * Some configs put our parent kthread in a limited cpuset, - * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. - * Our needs are more modest - cut back to our cpusets cpus_allowed. - * This is needed as pdflush's are dynamically created and destroyed. - * The boottime pdflush's are easily placed w/o these 2 lines. - */ - cpuset_cpus_allowed(current, cpus_allowed); - set_cpus_allowed_ptr(current, cpus_allowed); - free_cpumask_var(cpus_allowed); - - return __pdflush(&my_work); -} - -/* - * Attempt to wake up a pdflush thread, and get it to do some work for you. - * Returns zero if it indeed managed to find a worker thread, and passed your - * payload to it. - */ -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) -{ - unsigned long flags; - int ret = 0; - - BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ - - spin_lock_irqsave(&pdflush_lock, flags); - if (list_empty(&pdflush_list)) { - ret = -1; - } else { - struct pdflush_work *pdf; - - pdf = list_entry(pdflush_list.next, struct pdflush_work, list); - list_del_init(&pdf->list); - if (list_empty(&pdflush_list)) - last_empty_jifs = jiffies; - pdf->fn = fn; - pdf->arg0 = arg0; - wake_up_process(pdf->who); - } - spin_unlock_irqrestore(&pdflush_lock, flags); - - return ret; -} - -static void start_one_pdflush_thread(void) -{ - struct task_struct *k; - - k = kthread_run(pdflush, NULL, "pdflush"); - if (unlikely(IS_ERR(k))) { - spin_lock_irq(&pdflush_lock); - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - } -} - -static int __init pdflush_init(void) -{ - int i; - - /* - * Pre-set nr_pdflush_threads... If we fail to create, - * the count will be decremented. - */ - nr_pdflush_threads = MIN_PDFLUSH_THREADS; - - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) - start_one_pdflush_thread(); - return 0; -} - -module_init(pdflush_init); diff --git a/mm/swap_state.c b/mm/swap_state.c index 42cd38eba79..5ae6b8b78c8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { + .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; diff --git a/mm/vmscan.c b/mm/vmscan.c index 94e86dd6954..ba8228e0a80 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (total_scanned > sc->swap_cluster_max + sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } |