From 31a12666d8f0c22235297e1c1575f82061480029 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:39:04 -0800 Subject: mm: write_cache_pages cyclic fix In write_cache_pages, scanned == 1 is supposed to mean that cyclic writeback has circled through zero, and thus we should not circle again. However it gets set to 1 after the first successful pagevec lookup. This leads to cases where not enough data gets written. Counterexample: a file with its first 10 pages dirty, writeback_index == 5, nr_to_write == 10. Then the last 5 pages will be found and scanned will be set to 1; after writing those out, we will not cycle back to get the first 5. Rework this logic: now we'll always cycle unless we started off from index 0. When cycling, only write out as far as 1 page before the start page from the first cycle (so we don't write parts of the file twice). Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2970e35fd03..eb277bdd4c5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -868,9 +868,10 @@ int write_cache_pages(struct address_space *mapping, int done = 0; struct pagevec pvec; int nr_pages; + pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ - int scanned = 0; + int cycled; int range_whole = 0; long nr_to_write = wbc->nr_to_write; @@ -881,14 +882,19 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - scanned = 1; + cycled = 1; /* ignore range_cyclic tests */ } retry: while (!done && (index <= end) && @@ -897,7 +903,6 @@ retry: min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -915,7 +920,11 @@ retry: continue; } - if (!wbc->range_cyclic && page->index > end) { + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ done = 1; unlock_page(page); continue; @@ -946,13 +955,15 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!cycled) { /* + * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ - scanned = 1; + cycled = 1; index = 0; + end = writeback_index - 1; goto retry; } if (!wbc->no_nrwrite_index_update) { -- cgit v1.2.3-70-g09d2 From bd19e012f6fd3b7309689165ea865cbb7bb88c1e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:39:06 -0800 Subject: mm: write_cache_pages early loop termination We'd like to break out of the loop early in many situations; however, the existing code has been setting mapping->writeback_index past the final page in the pagevec lookup for cyclic writeback. This is a problem if we don't process all pages up to the final page.
Currently the code mostly keeps writeback_index reasonable and hacks around this by not breaking out of the loop or writing pages outside the range in these cases. Keep track of a real "done index" that enables us to terminate the loop in a much more flexible manner. This is needed by the subsequent patch to preserve writepage errors, and then by further patches to break out of the loop early for other reasons. However, there are no functional changes with this patch alone. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index eb277bdd4c5..01b9cb8ccf6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -871,6 +871,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ + pgoff_t done_index; int cycled; int range_whole = 0; long nr_to_write = wbc->nr_to_write; @@ -897,6 +898,7 @@ int write_cache_pages(struct address_space *mapping, cycled = 1; /* ignore range_cyclic tests */ } retry: + done_index = index; while (!done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, @@ -906,6 +908,8 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + done_index = page->index + 1; + /* * At this point we hold neither mapping->tree_lock nor * lock on the page itself: the page may be truncated or @@ -968,7 +972,7 @@ retry: } if (!wbc->no_nrwrite_index_update) { if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) - mapping->writeback_index = index; + mapping->writeback_index = done_index; wbc->nr_to_write = nr_to_write; } -- cgit v1.2.3-70-g09d2 From 00266770b8b3a6a77f896ca501a0613739086832 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:39:06 -0800 Subject: mm: write_cache_pages writepage error fix In write_cache_pages, if ret signals a real error, but we still have some pages left in the pagevec, done would be set to 1, but the remaining pages would continue to be processed and ret would be overwritten in the process. It could easily be overwritten with success, and thus success would be returned even if there was an error. Thus the caller is told all writes succeeded, whereas in reality some did not. Fix this by bailing immediately if there is an error, and retaining the first error code. This is a data integrity bug. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 01b9cb8ccf6..2e847cdcad0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -944,12 +944,26 @@ retry: } ret = (*writepage)(page, wbc, data); - - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; - } - if (ret || (--nr_to_write <= 0)) + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout).
+ */ + done = 1; + break; + } + } + + if (--nr_to_write <= 0) done = 1; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; -- cgit v1.2.3-70-g09d2 From 05fe478dd04e02fa230c305ab9b5616669821dd3 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:39:08 -0800 Subject: mm: write_cache_pages integrity fix In write_cache_pages, nr_to_write is heeded even for data-integrity syncs, so the function will return success after writing out nr_to_write pages, even if that was not sufficient to guarantee data integrity. The callers tend to set it to values that could break data integrity semantics easily in practice. For example, nr_to_write can be set to mapping->nrpages * 2. However, if a file has a single dirty page and fsync is called, subsequent pages might be concurrently added and dirtied; write_cache_pages might then write out two of these newly dirty pages, while not writing out the old page that should have been written out. Fix this by ignoring nr_to_write if it is a data integrity sync. This is a data integrity bug. The reason this has been done in the past is to avoid stalling sync operations behind page dirtiers. "If a file has one dirty page at offset 1000000000000000 then someone does an fsync() and someone else gets in first and starts madly writing pages at offset 0, we want to write that page at 1000000000000000. Somehow." What we do today is return success after an arbitrary number of pages are written, whether or not we have provided the data-integrity semantics that the caller has asked for. Even this doesn't actually fix all stall cases completely: in the above situation, if the file has a huge number of pages in pagecache (but not dirty), then mapping->nrpages is going to be huge, even if pages are being dirtied. This change does indeed make the possibility of long stalls larger, and that's not a good thing, but lying about data integrity is even worse. We have to either perform the sync, or return -ELINUXISLAME so at least the caller knows what has happened. There are subsequent competing approaches in the works to solve the stall problems properly, without compromising data integrity. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/page-writeback.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/filemap.c b/mm/filemap.c index f9d88183f69..9c5e6235cc7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, int ret; struct writeback_control wbc = { .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, + .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2e847cdcad0..5edca676e2c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -963,8 +963,10 @@ retry: } } - if (--nr_to_write <= 0) - done = 1; + if (wbc->sync_mode == WB_SYNC_NONE) { + if (--wbc->nr_to_write <= 0) + done = 1; + } if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; -- cgit v1.2.3-70-g09d2 From 5a3d5c9813db56a75934eb1015367fda23a8b0b4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:39:09 -0800 Subject: mm: write_cache_pages cleanups Get rid of some complex expressions from flow control statements, add a comment, remove some duplicate code.
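For reference, the loop ends up with roughly the following shape after these cleanups (a condensed sketch of the result, with most per-page checks elided; the full diff is below):

	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			      PAGECACHE_TAG_DIRTY,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);

			/* truncated/invalidated pages are skipped here */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			/* ... the remaining checks jump to continue_unlock ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}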
Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5edca676e2c..c3fb38b1ea4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -899,11 +899,14 @@ int write_cache_pages(struct address_space *mapping, } retry: done_index = index; - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -919,7 +922,16 @@ retry: */ lock_page(page); + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data integrity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ if (unlikely(page->mapping != mapping)) { +continue_unlock: unlock_page(page); continue; } @@ -930,18 +942,15 @@ * can't be range_cyclic (1st pass) because * end == -1 in that case. */ done = 1; - unlock_page(page); - continue; + goto continue_unlock; } if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; - } + !clear_page_dirty_for_io(page)) + goto continue_unlock; ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { @@ -964,7 +973,8 @@ } if (wbc->sync_mode == WB_SYNC_NONE) { - if (--wbc->nr_to_write <= 0) + wbc->nr_to_write--; + if (wbc->nr_to_write <= 0) done = 1; } if (wbc->nonblocking && bdi_write_congested(bdi)) { -- cgit v1.2.3-70-g09d2 From 515f4a037fb9ab736f8bad733fcd2ffd350cf265 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:39:10 -0800 Subject: mm: write_cache_pages optimise page cleaning In write_cache_pages, if we get stuck behind another process that is cleaning pages, we will be forced to wait for them to finish, then perform our own writeout (if it was redirtied during the long wait), then wait for that. If a page under writeout is still clean, we can skip waiting for it (if we're part of a data integrity sync, we'll be waiting for all writeout pages afterwards, so we'll still be waiting for the other guy's write that's cleaned the page).
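Schematically, the per-page checks become the following (a condensed sketch; locking and the surrounding loop are as in the previous patches):

	if (!PageDirty(page)) {
		/* someone wrote it for us */
		goto continue_unlock;
	}

	if (PageWriteback(page)) {
		if (wbc->sync_mode != WB_SYNC_NONE)
			wait_on_page_writeback(page);	/* integrity sync: wait for writeback to finish */
		else
			goto continue_unlock;		/* non-integrity writeback: don't wait, skip it */
	}

	BUG_ON(PageWriteback(page));
	if (!clear_page_dirty_for_io(page))
		goto continue_unlock;

	ret = (*writepage)(page, wbc, data);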
Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c3fb38b1ea4..2e8c2b01d5d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -945,11 +945,20 @@ continue_unlock: goto continue_unlock; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; + } - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) goto continue_unlock; ret = (*writepage)(page, wbc, data); -- cgit v1.2.3-70-g09d2 From d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:39:11 -0800 Subject: mm: write_cache_pages terminate quickly Terminate the write_cache_pages loop upon encountering the first page past end, without locking the page. Pages cannot have their index change when we have a reference on them (truncate, eg truncate_inode_pages_range performs the same check without the page lock). Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2e8c2b01d5d..0d986c13d47 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -911,15 +911,24 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - done_index = page->index + 1; - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + done = 1; + break; + } + + done_index = page->index + 1; + lock_page(page); /* @@ -936,15 +945,6 @@ continue_unlock: continue; } - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - goto continue_unlock; - } - if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; -- cgit v1.2.3-70-g09d2 From 82fd1a9a8ced9607312b54859572bcc6211e8919 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 6 Jan 2009 14:39:11 -0800 Subject: mm: write_cache_pages more terminate quickly Now that we have the early-termination logic in place, it makes sense to bail out early in all other cases where done is set to 1. 
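Schematically, the tail of the per-page loop then reads (condensed sketch):

	if (wbc->sync_mode == WB_SYNC_NONE) {
		wbc->nr_to_write--;
		if (wbc->nr_to_write <= 0) {
			done = 1;
			break;	/* write quota exhausted: stop scanning the pagevec */
		}
	}
	if (wbc->nonblocking && bdi_write_congested(bdi)) {
		wbc->encountered_congestion = 1;
		done = 1;
		break;	/* back off instead of walking the remaining pages */
	}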
Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0d986c13d47..08d2b960b29 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -983,12 +983,15 @@ continue_unlock: if (wbc->sync_mode == WB_SYNC_NONE) { wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) + if (wbc->nr_to_write <= 0) { done = 1; + break; + } } if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; + break; } } pagevec_release(&pvec); -- cgit v1.2.3-70-g09d2 From 364aeb2849789b51bf4b9af2ddd02fee7285c54e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jan 2009 14:39:29 -0800 Subject: mm: change dirty limit type specifiers to unsigned long The background dirty and dirty limits are better defined with type specifiers of unsigned long since negative writeback thresholds are not possible. These values, as returned by get_dirty_limits(), are normally compared with ZVC values to determine whether writeback shall commence or be throttled. Such page counts cannot be negative, so declaring the page limits as signed is unnecessary. Acked-by: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Signed-off-by: David Rientjes Cc: Andrea Righi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 4 ++-- mm/backing-dev.c | 6 +++--- mm/page-writeback.c | 22 +++++++++++----------- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e585657e983..259e9ea58ca 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -125,8 +125,8 @@ struct file; int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, - struct backing_dev_info *bdi); +void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, + unsigned long *pbdi_dirty, struct backing_dev_info *bdi); void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 801c08b046e..6f80beddd8a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -24,9 +24,9 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; - long background_thresh; - long dirty_thresh; - long bdi_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 08d2b960b29..4d4074cff30 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -362,13 +362,13 @@ unsigned long determine_dirtyable_memory(void) } void -get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, - struct backing_dev_info *bdi) +get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, + unsigned long *pbdi_dirty, struct backing_dev_info *bdi) { int background_ratio; /* Percentages */ int dirty_ratio; - long background; - long dirty; + unsigned long background; + unsigned long dirty; unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; @@ -423,9 +423,9 @@ 
static void balance_dirty_pages(struct address_space *mapping) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long background_thresh; - long dirty_thresh; - long bdi_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; unsigned long pages_written = 0; unsigned long write_chunk = sync_writeback_pages(); @@ -580,8 +580,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(gfp_t gfp_mask) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; for ( ; ; ) { get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); @@ -624,8 +624,8 @@ static void background_writeout(unsigned long _min_pages) }; for ( ; ; ) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); if (global_page_state(NR_FILE_DIRTY) + -- cgit v1.2.3-70-g09d2 From 2da02997e08d3efe8174c7a47696e6f7cbe69ba9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jan 2009 14:39:31 -0800 Subject: mm: add dirty_background_bytes and dirty_bytes sysctls This change introduces two new sysctls to /proc/sys/vm: dirty_background_bytes and dirty_bytes. dirty_background_bytes is the counterpart to dirty_background_ratio and dirty_bytes is the counterpart to dirty_ratio. With growing memory capacities of individual machines, it's no longer sufficient to specify dirty thresholds as a percentage of the amount of dirtyable memory over the entire system. dirty_background_bytes and dirty_bytes specify quantities of memory, in bytes, that represent the dirty limits for the entire system. If either of these values is set, its value represents the amount of dirty memory that is needed to commence either background or direct writeback. When a `bytes' or `ratio' file is written, its counterpart becomes a function of the written value. For example, if dirty_bytes is written to be 8192, 8K of memory is required to commence direct writeback. dirty_ratio is then functionally equivalent to 8K / the amount of dirtyable memory:

dirtyable_memory = free pages + mapped pages + file cache

dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory

AND

dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory

Only one of dirty_background_bytes and dirty_background_ratio may be specified at a time, and only one of dirty_bytes and dirty_ratio may be specified. When one sysctl is written, the other appears as 0 when read. The `bytes' files operate on a page size granularity since dirty limits are compared with ZVC values, which are in page units. Prior to this change, the minimum dirty_ratio was 5 as implemented by get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user written value between 0 and 100. This restriction is maintained, but dirty_bytes has a lower limit of only one page. Also prior to this change, the dirty_background_ratio could not equal or exceed dirty_ratio. This restriction is maintained in addition to restricting dirty_background_bytes. If either background threshold equals or exceeds its corresponding dirty threshold, it is implicitly set to half the dirty threshold.
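Expressed as code, the threshold calculation in get_dirty_limits() ends up looking roughly like this (a sketch only; the task- and bdi-specific adjustments that follow in the real function are omitted):

	unsigned long available_memory = determine_dirtyable_memory();
	unsigned long background, dirty;

	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
	else {
		int dirty_ratio = vm_dirty_ratio;

		if (dirty_ratio < 5)
			dirty_ratio = 5;	/* the historical 5% floor is kept */
		dirty = (dirty_ratio * available_memory) / 100;
	}

	if (dirty_background_bytes)
		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
	else
		background = (dirty_background_ratio * available_memory) / 100;

	if (background >= dirty)
		background = dirty / 2;	/* background limit must stay below the dirty limit */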
Acked-by: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Signed-off-by: David Rientjes Cc: Andrea Righi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 26 +++++++++- Documentation/sysctl/vm.txt | 3 +- include/linux/writeback.h | 11 ++++ kernel/sysctl.c | 27 ++++++++-- mm/page-writeback.c | 102 +++++++++++++++++++++++++++++++------ 5 files changed, 146 insertions(+), 23 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 71df353e367..32e94635484 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1385,6 +1385,15 @@ swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 causes the kernel to prefer to reclaim dentries and inodes. +dirty_background_bytes +---------------------- + +Contains the amount of dirty memory at which the pdflush background writeback +daemon will start writeback. + +If dirty_background_bytes is written, dirty_background_ratio becomes a function +of its value (dirty_background_bytes / the amount of dirtyable system memory). + dirty_background_ratio ---------------------- @@ -1393,14 +1402,29 @@ pages + file cache, not including locked pages and HugePages), the number of pages at which the pdflush background writeback daemon will start writing out dirty data. +If dirty_background_ratio is written, dirty_background_bytes becomes a function +of its value (dirty_background_ratio * the amount of dirtyable system memory). + +dirty_bytes +----------- + +Contains the amount of dirty memory at which a process generating disk writes +will itself start writeback. + +If dirty_bytes is written, dirty_ratio becomes a function of its value +(dirty_bytes / the amount of dirtyable system memory). + dirty_ratio ------------------ +----------- Contains, as a percentage of the dirtyable system memory (free pages + mapped pages + file cache, not including locked pages and HugePages), the number of pages at which a process which is generating disk writes will itself start writing out dirty data. +If dirty_ratio is written, dirty_bytes becomes a function of its value +(dirty_ratio * the amount of dirtyable system memory). + dirty_writeback_centisecs ------------------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index d79eeda7a69..cd05994a49e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -41,7 +41,8 @@ Currently, these files are in /proc/sys/vm: ============================================================== -dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, +dirty_bytes, dirty_ratio, dirty_background_bytes, +dirty_background_ratio, dirty_expire_centisecs, dirty_writeback_centisecs, highmem_is_dirtyable, vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout, drop-caches, hugepages_treat_as_movable: diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 259e9ea58ca..bb28c975c1d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -107,7 +107,9 @@ void throttle_vm_writeout(gfp_t gfp_mask); /* These are exported to sysctl. 
*/ extern int dirty_background_ratio; +extern unsigned long dirty_background_bytes; extern int vm_dirty_ratio; +extern unsigned long vm_dirty_bytes; extern int dirty_writeback_interval; extern int dirty_expire_interval; extern int vm_highmem_is_dirtyable; @@ -116,9 +118,18 @@ extern int laptop_mode; extern unsigned long determine_dirtyable_memory(void); +extern int dirty_background_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); +extern int dirty_background_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); extern int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int dirty_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); struct ctl_table; struct file; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff6d45c7626..92f6e5bc3c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -87,10 +87,6 @@ extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) -static int one = 1; -#endif - #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -101,6 +97,7 @@ static int two = 2; #endif static int zero; +static int one = 1; static int one_hundred = 100; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -952,11 +949,21 @@ static struct ctl_table vm_table[] = { .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &dirty_background_ratio_handler, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = &dirty_background_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, { .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", @@ -968,6 +975,16 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = &dirty_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4d4074cff30..b493db7841d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -68,6 +68,12 @@ static inline long sync_writeback_pages(void) */ int dirty_background_ratio = 5; +/* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of + * dirty_background_ratio * the amount of dirtyable memory + */ +unsigned long dirty_background_bytes; + /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true @@ -79,6 +85,12 @@ int vm_highmem_is_dirtyable; */ int vm_dirty_ratio = 10; +/* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of + * vm_dirty_ratio * the amount of dirtyable memory + */ +unsigned long vm_dirty_bytes; + /* * The interval between `kupdate'-style writebacks, in jiffies */ @@ -135,23 
+147,75 @@ static int calc_period_shift(void) { unsigned long dirty_total; - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; + if (vm_dirty_bytes) + dirty_total = vm_dirty_bytes / PAGE_SIZE; + else + dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + 100; return 2 + ilog2(dirty_total - 1); } /* - * update the period when the dirty ratio changes. + * update the period when the dirty threshold changes. */ +static void update_completion_period(void) +{ + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); +} + +int dirty_background_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_bytes = 0; + return ret; +} + +int dirty_background_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_ratio = 0; + return ret; +} + int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); + update_completion_period(); + vm_dirty_bytes = 0; + } + return ret; +} + + +int dirty_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_bytes = vm_dirty_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + update_completion_period(); + vm_dirty_ratio = 0; } return ret; } @@ -365,23 +429,29 @@ void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, unsigned long *pbdi_dirty, struct backing_dev_info *bdi) { - int background_ratio; /* Percentages */ - int dirty_ratio; unsigned long background; unsigned long dirty; unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else { + int dirty_ratio; - background_ratio = dirty_background_ratio; - if (background_ratio >= dirty_ratio) - background_ratio = dirty_ratio / 2; + dirty_ratio = vm_dirty_ratio; + if (dirty_ratio < 5) + dirty_ratio = 5; + dirty = (dirty_ratio * available_memory) / 100; + } + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; - background = (background_ratio * available_memory) / 100; - dirty = (dirty_ratio * available_memory) / 100; + if (background >= dirty) + background = dirty / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { background += background / 4; -- cgit v1.2.3-70-g09d2