From 1b627d5771312c92404b66f0a0b16f66036dd2e1 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:21:18 -0700 Subject: hostfs: fix UML crash: remove f_spare from hostfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 365b1818 ("add f_flags to struct statfs(64)") resized f_spare within struct statfs, which caused a UML crash. There is no need to copy f_spare. Signed-off-by: Richard Weinberger Reported-by: Toralf Förster Tested-by: Toralf Förster Cc: Christoph Hellwig Cc: Al Viro Cc: Jeff Dike Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs.h | 3 +-- fs/hostfs/hostfs_kern.c | 2 +- fs/hostfs/hostfs_user.c | 9 ++------- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 7c232c1487e..bf15a43016b 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out); + void *fsid_out, int fsid_size, long *namelen_out); #endif diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f7dc9b5f9ef..cd7c93917cc 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) err = do_statfs(dentry->d_sb->s_fs_info, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), - &sf->f_namelen, sf->f_spare); + &sf->f_namelen); if (err) return err; sf->f_blocks = f_blocks; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 6777aa06ce2..8d02683585e 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -364,8 +364,7 @@ int rename_file(char *from, char *to) int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out) + void *fsid_out, int fsid_size, long *namelen_out) { struct statfs64 buf; int err; @@ -384,10 +383,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out, sizeof(buf.f_fsid) > fsid_size ? fsid_size : sizeof(buf.f_fsid)); *namelen_out = buf.f_namelen; - spare_out[0] = buf.f_spare[0]; - spare_out[1] = buf.f_spare[1]; - spare_out[2] = buf.f_spare[2]; - spare_out[3] = buf.f_spare[3]; - spare_out[4] = buf.f_spare[4]; + return 0; } -- cgit v1.2.3-70-g09d2 From a4f7326da2bfe788b85509ec0c261e64596cdde4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 26 Oct 2010 14:21:21 -0700 Subject: vmcore: it is not experimental any more We have been using vmcore in our production kernels for a long time and it is pretty stable now, so I don't think we need to mark it as experimental any more.
Signed-off-by: WANG Cong Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 50f8f0600f0..6a0068841d9 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -33,8 +33,8 @@ config PROC_KCORE depends on PROC_FS && MMU config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && CRASH_DUMP + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP default y help Exports the dump image of crashed kernel in ELF format. -- cgit v1.2.3-70-g09d2 From 3d5992d2ac7dc09aed8ab537cba074589f0f0a52 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 26 Oct 2010 14:21:23 -0700 Subject: oom: add per-mm oom disable count It's pointless to kill a task if another thread sharing its mm cannot be killed to allow future memory freeing. A subsequent patch will prevent kills in such cases, but first it's necessary to have a way to flag a task that shares memory with an OOM_DISABLE task that doesn't incur an additional tasklist scan, which would make select_bad_process() an O(n^2) function. This patch adds an atomic counter to struct mm_struct that follows how many threads attached to it have an oom_score_adj of OOM_SCORE_ADJ_MIN. They cannot be killed by the kernel, so their memory cannot be freed in oom conditions. This only requires task_lock() on the task that we're operating on, it does not require mm->mmap_sem since task_lock() pins the mm and the operation is atomic. [rientjes@google.com: changelog and sys_unshare() code] [rientjes@google.com: protect oom_disable_count with task_lock in fork] [rientjes@google.com: use old_mm for oom_disable_count in exec] Signed-off-by: Ying Han Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 5 +++++ fs/proc/base.c | 30 ++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ kernel/exit.c | 3 +++ kernel/fork.c | 15 ++++++++++++++- 5 files changed, 54 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6d2b6f93685..3aa75b8888a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -759,6 +760,10 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&old_mm->oom_disable_count); + atomic_inc(&tsk->mm->oom_disable_count); + } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5d5f51f3f..6e50c8e6551 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1047,6 +1047,21 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EACCES; } + task_lock(task); + if (!task->mm) { + task_unlock(task); + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EINVAL; + } + + if (oom_adjust != task->signal->oom_adj) { + if (oom_adjust == OOM_DISABLE) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_adj == OOM_DISABLE) + atomic_dec(&task->mm->oom_disable_count); + } + /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. 
@@ -1065,6 +1080,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + task_unlock(task); unlock_task_sighand(task, &flags); put_task_struct(task); @@ -1133,6 +1149,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, return -EACCES; } + task_lock(task); + if (!task->mm) { + task_unlock(task); + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EINVAL; + } + if (oom_score_adj != task->signal->oom_score_adj) { + if (oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&task->mm->oom_disable_count); + } task->signal->oom_score_adj = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is @@ -1143,6 +1172,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; + task_unlock(task); unlock_task_sighand(task, &flags); put_task_struct(task); return count; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cb57d657ce4..bb7288a782f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -310,6 +310,8 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif + /* How many tasks sharing this mm are OOM_DISABLE */ + atomic_t oom_disable_count; }; /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ diff --git a/kernel/exit.c b/kernel/exit.c index e2bdf37f9fd..894179a32ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -687,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408..e87aaaaf513 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -741,6 +743,8 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1299,8 +1303,13 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1693,6 +1702,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) active_mm = current->active_mm; current->mm = new_mm; current->active_mm = new_mm; + if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&mm->oom_disable_count); + atomic_inc(&new_mm->oom_disable_count); + } activate_mm(active_mm, new_mm); new_mm = mm; } -- cgit 
v1.2.3-70-g09d2 From 723548bff1dde9ab6bdb23f4bb92277c4da49473 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:25 -0700 Subject: oom: rewrite error handling for oom_adj and oom_score_adj tunables It's better to use proper error handling in oom_adjust_write() and oom_score_adj_write() instead of duplicating the locking order on various exit paths. Suggested-by: Andrew Morton Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 83 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6e50c8e6551..34d11ac31f2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1023,36 +1023,39 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } err = strict_strtol(strstrip(buffer), 0, &oom_adjust); if (err) - return -EINVAL; + goto out; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) - return -EINVAL; + oom_adjust != OOM_DISABLE) { + err = -EINVAL; + goto out; + } task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; + if (!task) { + err = -ESRCH; + goto out; + } if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_struct; } if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + err = -EACCES; + goto err_sighand; } task_lock(task); if (!task->mm) { - task_unlock(task); - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EINVAL; + err = -EINVAL; + goto err_task_lock; } if (oom_adjust != task->signal->oom_adj) { @@ -1080,11 +1083,14 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; +err_task_lock: task_unlock(task); +err_sighand: unlock_task_sighand(task, &flags); +err_task_struct: put_task_struct(task); - - return count; +out: + return err < 0 ? 
err : count; } static const struct file_operations proc_oom_adjust_operations = { @@ -1125,36 +1131,39 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); if (err) - return -EINVAL; + goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || - oom_score_adj > OOM_SCORE_ADJ_MAX) - return -EINVAL; + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; + if (!task) { + err = -ESRCH; + goto out; + } if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_struct; } if (oom_score_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + err = -EACCES; + goto err_sighand; } task_lock(task); if (!task->mm) { - task_unlock(task); - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EINVAL; + err = -EINVAL; + goto err_task_lock; } if (oom_score_adj != task->signal->oom_score_adj) { if (oom_score_adj == OOM_SCORE_ADJ_MIN) @@ -1172,10 +1181,14 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; +err_task_lock: task_unlock(task); +err_sighand: unlock_task_sighand(task, &flags); +err_task_struct: put_task_struct(task); - return count; +out: + return err < 0 ? err : count; } static const struct file_operations proc_oom_score_adj_operations = { -- cgit v1.2.3-70-g09d2 From d19d5476f4b9f91d2de92b91588bb118beba6c0d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 26 Oct 2010 14:21:26 -0700 Subject: oom: fix locking for oom_adj and oom_score_adj The locking order in oom_adjust_write() and oom_score_adj_write() for task->alloc_lock and task->sighand->siglock is reversed, and lockdep notices that irqs could encounter an ABBA scenario. This fixes the locking order so that we always take task_lock(task) prior to lock_task_sighand(task). 
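The resulting shape of both writers, sketched below with the error paths simplified (labels follow the actual patch; this is an illustrative fragment, not the full function):

	task_lock(task);			/* A: also pins task->mm */
	if (!task->mm) {
		err = -EINVAL;
		goto err_task_lock;
	}
	if (!lock_task_sighand(task, &flags)) {	/* B: now only ever taken after A */
		err = -ESRCH;
		goto err_task_lock;
	}
	/* ... adjust oom_adj / oom_score_adj ... */
err_sighand:
	unlock_task_sighand(task, &flags);	/* release B first ... */
err_task_lock:
	task_unlock(task);			/* ... then A, so A -> B is the only order */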
Signed-off-by: David Rientjes Reported-by: Andrew Morton Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 34d11ac31f2..53dc8ad40ae 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1042,9 +1042,16 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, err = -ESRCH; goto out; } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + if (!lock_task_sighand(task, &flags)) { err = -ESRCH; - goto err_task_struct; + goto err_task_lock; } if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { @@ -1052,12 +1059,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto err_sighand; } - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - if (oom_adjust != task->signal->oom_adj) { if (oom_adjust == OOM_DISABLE) atomic_inc(&task->mm->oom_disable_count); @@ -1083,11 +1084,10 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; -err_task_lock: - task_unlock(task); err_sighand: unlock_task_sighand(task, &flags); -err_task_struct: +err_task_lock: + task_unlock(task); put_task_struct(task); out: return err < 0 ? err : count; @@ -1150,21 +1150,24 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, err = -ESRCH; goto out; } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + if (!lock_task_sighand(task, &flags)) { err = -ESRCH; - goto err_task_struct; + goto err_task_lock; } + if (oom_score_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; } - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } if (oom_score_adj != task->signal->oom_score_adj) { if (oom_score_adj == OOM_SCORE_ADJ_MIN) atomic_inc(&task->mm->oom_disable_count); @@ -1181,11 +1184,10 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, else task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / OOM_SCORE_ADJ_MAX; -err_task_lock: - task_unlock(task); err_sighand: unlock_task_sighand(task, &flags); -err_task_struct: +err_task_lock: + task_unlock(task); put_task_struct(task); out: return err < 0 ? err : count; -- cgit v1.2.3-70-g09d2 From 1b430beee5e388605dfb092b214ef0320f752cf6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 26 Oct 2010 14:21:26 -0700 Subject: writeback: remove nonblocking/encountered_congestion references This removes more dead code that was somehow missed by commit 0d99519efef (writeback: remove unused nonblocking and congestion checks). There is no behavior change except for the removal of two entries from one of the ext4 tracing interfaces. The nonblocking checks in ->writepages are no longer used because the flusher now prefers to block on get_request_wait() rather than skip inodes on IO congestion. The latter will lead to more seeky IO. The nonblocking checks in ->writepage are no longer used because they are redundant with the WB_SYNC_NONE check.
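For example, the buffer-head write paths below reduce to the sync_mode test alone, which already separates integrity writeback from opportunistic writeback; roughly (a sketch of the pattern this series converges on):

	if (wbc->sync_mode != WB_SYNC_NONE) {
		lock_buffer(bh);	/* integrity writeback: may block */
	} else if (!trylock_buffer(bh)) {
		/* opportunistic writeback: don't wait, try again later */
		redirty_page_for_writepage(wbc, page);
		continue;
	}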
We no longer set ->nonblocking in VM page-out and page migration, because a) it is effectively redundant with WB_SYNC_NONE in the current code, and b) its old semantic of "don't get stuck on request queues" was a misbehavior: it would skip some dirty inodes on congestion and page out others, which is unfair in terms of LRU age. Inspired by Christoph Hellwig. Thanks! Signed-off-by: Wu Fengguang Cc: Theodore Ts'o Cc: David Howells Cc: Sage Weil Cc: Steve French Cc: Chris Mason Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/write.c | 19 +------------------ fs/buffer.c | 2 +- fs/ceph/addr.c | 9 --------- fs/cifs/file.c | 10 ---------- fs/gfs2/meta_io.c | 2 +- fs/nfs/write.c | 4 +--- fs/reiserfs/inode.c | 2 +- fs/xfs/linux-2.6/xfs_aops.c | 3 +-- include/trace/events/ext4.h | 8 +++++--- include/trace/events/writeback.h | 2 -- mm/migrate.c | 1 - mm/vmscan.c | 1 - 12 files changed, 11 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 722743b152d..15690bb1d3b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -438,7 +438,6 @@ no_more: */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; struct afs_writeback *wb; int ret; @@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) } wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) - wbc->encountered_congestion = 1; _leave(" = 0"); return 0; @@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, pgoff_t index, pgoff_t end, pgoff_t *_next) { - struct backing_dev_info *bdi = mapping->backing_dev_info; struct afs_writeback *wb; struct page *page; int ret, n; @@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping, wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - break; - } - cond_resched(); } while (index < end && wbc->nr_to_write > 0); @@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; pgoff_t start, end, next; int ret; _enter(""); - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - _leave(" = 0 [congest]"); - return 0; - } - if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; ret = afs_writepages_region(mapping, wbc, start, end, &next); - if (start > 0 && wbc->nr_to_write > 0 && ret == 0 && - !(wbc->nonblocking && wbc->encountered_congestion)) + if (start > 0 && wbc->nr_to_write > 0 && ret == 0) ret = afs_writepages_region(mapping, wbc, 0, start, &next); mapping->writeback_index = next; diff --git a/fs/buffer.c b/fs/buffer.c index 7f0b9b083f7..ec21a92e08b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1706,7 +1706,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * and kswapd activity, but those code paths have their own * higher-level throttling.
*/ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 51bcc5ce323..e9c874abc9e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc; pgoff_t index, start, end; @@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping, pagevec_init(&pvec, 0); - /* ?? */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - dout(" writepages congested\n"); - wbc->encountered_congestion = 1; - goto out_final; - } - /* where to start/end? */ if (wbc->range_cyclic) { start = mapping->writeback_index; /* Start from prev offset */ @@ -885,7 +877,6 @@ out: rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); -out_final: return rc; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c81e7b14d5..45af003865d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1303,7 +1303,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned int bytes_to_write; unsigned int bytes_written; struct cifs_sb_info *cifs_sb; @@ -1326,15 +1325,6 @@ static int cifs_writepages(struct address_space *mapping, int scanned = 0; int xid, long_op; - /* - * BB: Is this meaningful for a non-block-device file system? - * If it is, we should test it again after we do I/O - */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - cifs_sb = CIFS_SB(mapping->host->i_sb); /* diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f3b071f921a..939739c7b3f 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb * activity, but those code paths have their own higher-level * throttling. 
*/ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 605e292501f..4c14c17a527 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -290,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page->index); - ret = nfs_page_async_flush(pgio, page, - wbc->sync_mode == WB_SYNC_NONE || - wbc->nonblocking != 0); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index caa758377d6..c1f93896cb5 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2438,7 +2438,7 @@ static int reiserfs_write_full_page(struct page *page, /* from this point on, we know the buffer is mapped to a * real block and not a direct item */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else { if (!trylock_buffer(bh)) { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index b552f816de1..c9af48fffcd 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1139,8 +1139,7 @@ xfs_vm_writepage( type = IO_DELAY; flags = BMAPI_ALLOCATE; - if (wbc->sync_mode == WB_SYNC_NONE && - wbc->nonblocking) + if (wbc->sync_mode == WB_SYNC_NONE) flags |= BMAPI_TRYLOCK; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 01e9e0076a9..6bcb00645de 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -242,18 +242,20 @@ TRACE_EVENT(ext4_da_writepages, __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; - __entry->nonblocking = wbc->nonblocking; __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld " + "range_start %llu range_end %llu " + "for_kupdate %d for_reclaim %d " + "range_cyclic %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, - __entry->range_end, __entry->nonblocking, + __entry->range_end, __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic, (unsigned long) __entry->writeback_index) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index f345f66ae9d..0bb01ab2e98 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -96,8 +96,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) - __field(int, nonblocking) - __field(int, encountered_congestion) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) diff --git a/mm/migrate.c b/mm/migrate.c index f8c9bccf252..d917ac3207f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -497,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page) .nr_to_write = 1, 
.range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1 }; int rc; diff --git a/mm/vmscan.c b/mm/vmscan.c index b94c9464f26..6cbc1aac23a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -376,7 +376,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; -- cgit v1.2.3-70-g09d2 From f629d1c9bd0dbc44a6c4f9a4a67d1646c42bfc6f Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Tue, 26 Oct 2010 14:21:33 -0700 Subject: mm: add account_page_writeback() To help developers and applications gain visibility into writeback behaviour, this patch adds two counters to /proc/vmstat. # grep nr_dirtied /proc/vmstat nr_dirtied 3747 # grep nr_written /proc/vmstat nr_written 3618 These entries allow user apps to understand writeback behaviour over time and learn how it is impacting their performance. Currently there is no way to inspect dirty and writeback speed over time; nr_dirty and nr_writeback only expose the instantaneous state. These entries are necessary to give visibility into writeback behaviour. We have /proc/diskstats which lets us understand the IO in the block layer. We have blktrace for a more in-depth understanding. We have e2fsprogs and debugfs to give insight into the file systems' behaviour, but we don't offer our users the ability to understand what writeback is doing. There is no way to know how active it is over the whole system, whether it is falling behind, or to quantify its efforts. With these values exported, users can easily see how much data applications are sending through writeback and also at what rates writeback is processing this data. Comparing the rates of change between the two allows developers to see when writeback is not able to keep up with incoming traffic and the rate of dirty memory being sent to the IO back end. This allows folks to understand their IO workloads and track kernel issues. Non-kernel engineers at Google often use these counters to solve puzzling performance problems. Patch #4 adds a per-node vmstat file with nr_dirtied and nr_written. Patch #5 adds writeback thresholds to /proc/vmstat. Currently these values are in debugfs. But they should be promoted to /proc since they are useful for developers who are writing databases and file servers and are not debugging the kernel. The output is as below: # grep threshold /proc/vmstat nr_pages_dirty_threshold 409111 nr_pages_dirty_background_threshold 818223 This patch: This allows code outside of the mm core to safely manipulate page writeback state and not worry about the other accounting. Not using these routines means that some code will lose track of the accounting and we get bugs. Modify nilfs2 to use the interface.
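The intended calling pattern, as the nilfs2 conversion below illustrates (sketch):

	if (!TestSetPageWriteback(page))
		account_page_writeback(page);	/* updates NR_WRITEBACK for the caller */

Keeping the counter update inside the helper means code outside the mm core cannot set writeback state and silently skip the accounting.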
Signed-off-by: Michael Rubin Reviewed-by: KOSAKI Motohiro Reviewed-by: Wu Fengguang Cc: KONISHI Ryusuke Cc: Jiro SEKIBA Cc: Dave Chinner Cc: Jens Axboe Cc: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 2 +- include/linux/mm.h | 1 + mm/page-writeback.c | 13 ++++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index d926af62617..687d090cea3 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1609,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) kunmap_atomic(kaddr, KM_USER0); if (!TestSetPageWriteback(clone_page)) - inc_zone_page_state(clone_page, NR_WRITEBACK); + account_page_writeback(clone_page); unlock_page(clone_page); return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index a4c66846fb8..c36297faf7c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -868,6 +868,7 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); +void account_page_writeback(struct page *page); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e3bccac1f02..94159819a65 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1128,6 +1128,17 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); +/* + * Helper function for set_page_writeback family. + * NOTE: Unlike account_page_dirtied this does not rely on being atomic + * wrt interrupts. + */ +void account_page_writeback(struct page *page) +{ + inc_zone_page_state(page, NR_WRITEBACK); +} +EXPORT_SYMBOL(account_page_writeback); + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -1366,7 +1377,7 @@ int test_set_page_writeback(struct page *page) ret = TestSetPageWriteback(page); } if (!ret) - inc_zone_page_state(page, NR_WRITEBACK); + account_page_writeback(page); return ret; } -- cgit v1.2.3-70-g09d2 From 4cbec4c8b9fda9ec784086fe7f74cd32a8adda95 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 26 Oct 2010 14:21:45 -0700 Subject: writeback: remove the internal 5% low bound on dirty_ratio The dirty_ratio was silently limited in global_dirty_limits() to >= 5%. This is not the behavior a user expects. And it's inconsistent with calc_period_shift(), which uses the plain vm_dirty_ratio value. Let's remove the internal bound. At the same time, fix balance_dirty_pages() to work with the dirty_thresh=0 case. This allows applications to proceed when dirty+writeback pages are all cleaned. And ">" fits with the name "exceeded" better than ">=" does.
Neil thinks it is an aesthetic improvement as well as a functional one :) Signed-off-by: Wu Fengguang Cc: Jan Kara Proposed-by: Con Kolivas Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Neil Brown Reviewed-by: KOSAKI Motohiro Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- mm/page-writeback.c | 16 +++++----------- 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ab38fef1c9a..97d2951bd4d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -582,7 +582,7 @@ static inline bool over_bground_thresh(void) global_dirty_limits(&background_thresh, &dirty_thresh); return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) >= background_thresh); + global_page_state(NR_UNSTABLE_NFS) > background_thresh); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4dd91f7fd39..b840afa8976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else { - int dirty_ratio; - - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; - dirty = (dirty_ratio * available_memory) / 100; - } + else + dirty = (vm_dirty_ratio * available_memory) / 100; if (dirty_background_bytes) background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); @@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping, * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ - if (nr_reclaimable + nr_writeback < + if (nr_reclaimable + nr_writeback <= (background_thresh + dirty_thresh) / 2) break; @@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping, * the last resort safeguard. */ dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) - || (nr_reclaimable + nr_writeback >= dirty_thresh); + (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) + || (nr_reclaimable + nr_writeback > dirty_thresh); if (!dirty_exceeded) break; -- cgit v1.2.3-70-g09d2 From 74ce002d9aee23031b4967e1dd1c1966ddc60749 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:05 -0700 Subject: fs/fs-writeback.c: restore lost comment I had to go back to a 2.6.20 tree to work out why we're adding a number-of-inodes into a number-of-pages count. Restore the lost comment. Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 97d2951bd4d..b5aae4bd0ac 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -721,6 +721,10 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; wb->last_old_flush = jiffies; + /* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); -- cgit v1.2.3-70-g09d2 From c5c6dd4e2dce3cb4d705d97183bc42b4e33e2606 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 26 Oct 2010 14:22:20 -0700 Subject: hostfs: code cleanups Some code cleanups for hostfs. 
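For instance, open_dir() loses a redundant NULL check (a sketch of the result; the full change is in the diff below):

	dir = opendir(path);
	*err_out = errno;
	return dir;	/* NULL already signals failure, *err_out carries errno */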
Signed-off-by: Richard Weinberger Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs_user.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 8d02683585e..d51a98384bc 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out) dir = opendir(path); *err_out = errno; - if (dir == NULL) - return NULL; + return dir; } @@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) if (attrs->ia_valid & HOSTFS_ATTR_MODE) { if (fd >= 0) { if (fchmod(fd, attrs->ia_mode) != 0) - return (-errno); + return -errno; } else if (chmod(file, attrs->ia_mode) != 0) { return -errno; } -- cgit v1.2.3-70-g09d2 From 3ecb01df3261d3b1f02ccfcf8384e2a255d2a1d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Oct 2010 14:22:27 -0700 Subject: use clear_page()/copy_page() in favor of memset()/memcpy() on whole pages After all that's what they are intended for. Signed-off-by: Jan Beulich Cc: Miklos Szeredi Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 2 +- kernel/kexec.c | 2 +- kernel/power/snapshot.c | 14 +++++++------- kernel/power/swap.c | 6 +++--- mm/memory.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cde755cca56..69037118b2c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -811,7 +811,7 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, if (page && zeroing && count < PAGE_SIZE) { void *mapaddr = kmap_atomic(page, KM_USER1); - memset(mapaddr, 0, PAGE_SIZE); + clear_page(mapaddr); kunmap_atomic(mapaddr, KM_USER1); } while (count) { diff --git a/kernel/kexec.c b/kernel/kexec.c index c0613f7d673..b55045bc756 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, ptr = kmap(page); /* Start with a clear page */ - memset(ptr, 0, PAGE_SIZE); + clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); if (mchunk > mbytes) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 9e3581f4619..0dac75ea445 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) */ safe_copy_page(buffer, s_page); dst = kmap_atomic(d_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); } else { safe_copy_page(page_address(d_page), s_page); @@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); } else if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); + clear_page(buffer); pack_pfns(buffer, &orig_bm); } else { struct page *page; @@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle) void *kaddr; kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); + copy_page(buffer, kaddr); kunmap_atomic(kaddr, KM_USER0); handle->buffer = buffer; } else { @@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void) void *dst; dst = kmap_atomic(last_highmem_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); last_highmem_page = NULL; } @@ -2270,9 +2270,9 @@ 
swap_two_pages_data(struct page *p1, struct page *p2, void *buf) kaddr1 = kmap_atomic(p1, KM_USER0); kaddr2 = kmap_atomic(p2, KM_USER1); - memcpy(buf, kaddr1, PAGE_SIZE); - memcpy(kaddr1, kaddr2, PAGE_SIZE); - memcpy(kaddr2, buf, PAGE_SIZE); + copy_page(buf, kaddr1); + copy_page(kaddr1, kaddr2); + copy_page(kaddr2, buf); kunmap_atomic(kaddr2, KM_USER1); kunmap_atomic(kaddr1, KM_USER0); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 916eaa79039..a0e4a86ccf9 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -251,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (bio_chain) { src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (src) { - memcpy(src, buf, PAGE_SIZE); + copy_page(src, buf); } else { WARN_ON_ONCE(1); bio_chain = NULL; /* Go synchronous */ @@ -325,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = write_page(handle->cur, handle->cur_swap, NULL); if (error) goto out; - memset(handle->cur, 0, PAGE_SIZE); + clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } @@ -910,7 +910,7 @@ int swsusp_check(void) hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, PAGE_SIZE); + clear_page(swsusp_header); error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) diff --git a/mm/memory.c b/mm/memory.c index 861f7982dd5..02e48aa0ed1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2080,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); + clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); } else -- cgit v1.2.3-70-g09d2 From b6777c40c79168d938c30b5b7471fbd64bca109c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 26 Oct 2010 14:22:27 -0700 Subject: fuse: use clear_highpage() and KM_USER0 instead of KM_USER1 Commit 7909b1c640 ("fuse: don't use atomic kmap") removed KM_USER0 usage from fuse/dev.c. Switch KM_USER1 uses to KM_USER0 for clarity. Also replace open coded clear_highpage(). 
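The open-coded clear goes from a manual atomic kmap to the highmem helper; roughly:

	/* before: map, clear, unmap by hand */
	void *mapaddr = kmap_atomic(page, KM_USER1);
	clear_page(mapaddr);
	kunmap_atomic(mapaddr, KM_USER1);

	/* after: clear_highpage() does the kmap/clear/kunmap itself */
	clear_highpage(page);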
Signed-off-by: Miklos Szeredi Cc: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 69037118b2c..b98664275f0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, int err; struct page *page = *pagep; - if (page && zeroing && count < PAGE_SIZE) { - void *mapaddr = kmap_atomic(page, KM_USER1); - clear_page(mapaddr); - kunmap_atomic(mapaddr, KM_USER1); - } + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + while (count) { if (cs->write && cs->pipebufs && page) { return fuse_ref_page(cs, page, offset, count); @@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, } } if (page) { - void *mapaddr = kmap_atomic(page, KM_USER1); + void *mapaddr = kmap_atomic(page, KM_USER0); void *buf = mapaddr + offset; offset += fuse_copy_do(cs, &buf, &count); - kunmap_atomic(mapaddr, KM_USER1); + kunmap_atomic(mapaddr, KM_USER0); } else offset += fuse_copy_do(cs, NULL, &count); } -- cgit v1.2.3-70-g09d2 From cd1c584f380183aca268d454c14b22d8454eac4f Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 26 Oct 2010 14:22:28 -0700 Subject: fs/direct-io.c: fix truncation error in dio_complete() return Fix up truncation (ssize_t->int). This only matters with >2G reads/writes, which the kernel doesn't permit. Signed-off-by: Edward Shishkin Reviewed-by: Christoph Hellwig Acked-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 48d74c7391d..85882f6ba5f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) { ssize_t transferred = 0; -- cgit v1.2.3-70-g09d2 From 1df79da85657aecde2ecff052ff0cf9910311078 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:31 -0700 Subject: fs/buffer.c: remove duplicated assignment to b_private bh->b_private is initialized within init_buffer(), thus this assignment is redundant. Signed-off-by: Namhyung Kim Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index ec21a92e08b..8d595ab2aed 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -905,7 +905,6 @@ try_again: bh->b_state = 0; atomic_set(&bh->b_count, 0); - bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ -- cgit v1.2.3-70-g09d2 From 4199ca77cc44c418972e8f30a36be7d26d21a0d2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 26 Oct 2010 14:22:32 -0700 Subject: fs: move exportfs since it is not a networking filesystem Move the EXPORTFS kconfig symbol out of the NETWORK_FILESYSTEMS block since it provides a library function that can be (and is) used by other (non-network) filesystems. 
This also eliminates a kconfig dependency warning: warning: (XFS_FS && BLOCK || NFSD && NETWORK_FILESYSTEMS && INET && FILE_LOCKING && BKL) selects EXPORTFS which has unmet direct dependencies (NETWORK_FILESYSTEMS) Signed-off-by: Randy Dunlap Cc: Dave Chinner Cc: Al Viro Cc: Alex Elder Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 65781de44fc..b5e582bd769 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +config EXPORTFS + tristate + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y @@ -222,9 +225,6 @@ config LOCKD_V4 depends on FILE_LOCKING default y -config EXPORTFS - tristate - config NFS_ACL_SUPPORT tristate select FS_POSIX_ACL -- cgit v1.2.3-70-g09d2 From 99dc829256bb8cfcb1f58b7f118893fdbf608e60 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 26 Oct 2010 14:22:33 -0700 Subject: procfs: fix numbering in /proc/locks The lock number in /proc/locks (first field) is implemented by a counter (private field of struct seq_file) which is incremented at each call of locks_show() and reset to 1 in locks_start() whatever the offset is. It should be reset according to the actual position in the list. Because of this, the numbering erratically restarts at 1 several times when reading a long /proc/locks file. Moreover, locks_show() can be called twice to print a single line thus skipping a number. The counter should be incremented in locks_next(). And last, pos is a loff_t, which can be bigger than a pointer, so we don't use the pointer as an integer anymore, and allocate a loff_t instead. Signed-off-by: Jerome Marchand Cc: Pavel Emelyanov Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 8b2b6ad56a0..4de3a266681 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2109,7 +2109,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock); #include static void lock_get_status(struct seq_file *f, struct file_lock *fl, - int id, char *pfx) + loff_t id, char *pfx) { struct inode *inode = NULL; unsigned int fl_pid; @@ -2122,7 +2122,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (fl->fl_file != NULL) inode = fl->fl_file->f_path.dentry->d_inode; - seq_printf(f, "%d:%s ", id, pfx); + seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { seq_printf(f, "%6s %s ", (fl->fl_flags & FL_ACCESS) ? 
"ACCESS" : "POSIX ", @@ -2185,24 +2185,27 @@ static int locks_show(struct seq_file *f, void *v) fl = list_entry(v, struct file_lock, fl_link); - lock_get_status(f, fl, (long)f->private, ""); + lock_get_status(f, fl, *((loff_t *)f->private), ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) - lock_get_status(f, bfl, (long)f->private, " ->"); + lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); - f->private++; return 0; } static void *locks_start(struct seq_file *f, loff_t *pos) { + loff_t *p = f->private; + lock_flocks(); - f->private = (void *)1; + *p = (*pos + 1); return seq_list_start(&file_lock_list, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) { + loff_t *p = f->private; + ++*p; return seq_list_next(v, &file_lock_list, pos); } @@ -2220,14 +2223,14 @@ static const struct seq_operations locks_seq_operations = { static int locks_open(struct inode *inode, struct file *filp) { - return seq_open(filp, &locks_seq_operations); + return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); } static const struct file_operations proc_locks_operations = { .open = locks_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; static int __init proc_locks_init(void) -- cgit v1.2.3-70-g09d2 From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. 
Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 17 +++++++---------- include/linux/fs.h | 8 ++++---- kernel/sysctl.c | 6 +++--- net/unix/af_unix.c | 14 +++++++------- 4 files changed, 21 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index a04bdd81c11..c3dee381f1b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -60,7 +60,7 @@ static inline void file_free(struct file *f) /* * Return the total number of open files in the system */ -static int get_nr_files(void) +static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } @@ -68,7 +68,7 @@ static int get_nr_files(void) /* * Return the maximum number of open files in the system */ -int get_max_files(void) +unsigned long get_max_files(void) { return files_stat.max_files; } @@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(ctl_table *table, int write, @@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *get_empty_filp(void) { const struct cred *cred = current_cred(); - static int old_max; + static long old_max; struct file * f; /* @@ -140,8 +140,7 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { - printk(KERN_INFO "VFS: file-max limit %d reached\n", - get_max_files()); + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } goto fail; @@ -487,7 +486,7 @@ retry: void __init files_init(unsigned long mempages) { - int n; + unsigned long n; filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); @@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages) */ n = (mempages * (PAGE_SIZE / 1024)) / 10; - files_stat.max_files = n; - if (files_stat.max_files < NR_FILE) - files_stat.max_files = NR_FILE; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); files_defer_init(); lg_lock_init(files_lglock); percpu_counter_init(&nr_files, 0); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f34ff6e555..b2cdb6bc828 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,9 +34,9 @@ /* And dynamically-tunable limits and defaults: */ struct files_stat_struct { - int nr_files; /* read only */ - int nr_free_files; /* read only */ - int max_files; /* tunable */ + unsigned long nr_files; /* read only */ + unsigned long nr_free_files; /* read only */ + unsigned long max_files; /* tunable */ }; struct inodes_stat_t { @@ -400,7 +400,7 @@ extern void __init inode_init_early(void); extern void __init files_init(unsigned long); extern struct files_stat_struct files_stat; -extern int get_max_files(void); +extern unsigned long get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; diff --git a/kernel/sysctl.c b/kernel/sysctl.c 
index 3a45c224770..694b140852c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a666..3c95304a081 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); -- cgit v1.2.3-70-g09d2 From 766f9164193f6dda1497bbf3861060198421fb92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:22:45 -0700 Subject: kernel: remove PF_FLUSHER PF_FLUSHER is only ever set, not tested, remove it. 
Signed-off-by: Peter Zijlstra Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- include/linux/sched.h | 1 - mm/backing-dev.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b5aae4bd0ac..9e46aec10d1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -794,7 +794,7 @@ int bdi_writeback_thread(void *data) struct backing_dev_info *bdi = wb->bdi; long pages_written; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); wb->last_active = jiffies; diff --git a/include/linux/sched.h b/include/linux/sched.h index 56154bbb8da..393ce94e54b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1706,7 +1706,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ -#define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ #define PF_FREEZING 0x00004000 /* freeze in progress. do not account to load */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5ad3c106606..f2eb27884ff 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); /* -- cgit v1.2.3-70-g09d2