From 9d823e8f6b1b7b39f952d7d1795f29162143a433 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 11 Jun 2011 18:10:12 -0600 Subject: writeback: per task dirty rate limit Add two fields to task_struct. 1) account dirtied pages in the individual tasks, for accuracy 2) per-task balance_dirty_pages() call intervals, for flexibility The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will scale near-sqrt to the safety gap between dirty pages and threshold. The main problem of per-task nr_dirtied is, if 1k+ tasks start dirtying pages at exactly the same time, each task will be assigned a large initial nr_dirtied_pause, so that the dirty threshold will be exceeded long before each task reached its nr_dirtied_pause and hence call balance_dirty_pages(). The solution is to watch for the number of pages dirtied on each CPU in between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages (3% dirty threshold), force call balance_dirty_pages() for a chance to set bdi->dirty_exceeded. In normal situations, this safeguarding condition is not expected to trigger at all. On the sqrt in dirty_poll_interval(): It will serve as an initial guess when dirty pages are still in the freerun area. When dirty pages are floating inside the dirty control scope [freerun, limit], a followup patch will use some refined dirty poll interval to get the desired pause time. thresh-dirty (MB) sqrt 1 16 2 22 4 32 8 45 16 64 32 90 64 128 128 181 256 256 512 362 1024 512 The above table means, given 1MB (or 1GB) gap and the dd tasks polling balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't be exceeded as long as there are less than 16 (or 512) concurrent dd's. So sqrt naturally leads to less overheads and more safe concurrent tasks for large memory servers, which have large (thresh-freerun) gaps. peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case CC: Peter Zijlstra Reviewed-by: Andrea Righi Signed-off-by: Wu Fengguang --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 8e6b6f4fb27..cc0815df99f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1302,6 +1302,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->pdeath_signal = 0; p->exit_state = 0; + p->nr_dirtied = 0; + p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. -- cgit v1.2.3-70-g09d2 From c9f01245b6a7d77d17deaa71af10f6aca14fa24e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 31 Oct 2011 17:07:15 -0700 Subject: oom: remove oom_disable_count This removes mm->oom_disable_count entirely since it's unnecessary and currently buggy. The counter was intended to be per-process but it's currently decremented in the exit path for each thread that exits, causing it to underflow. The count was originally intended to prevent oom killing threads that share memory with threads that cannot be killed since it doesn't lead to future memory freeing. The counter could be fixed to represent all threads sharing the same mm, but it's better to remove the count since: - it is possible that the OOM_DISABLE thread sharing memory with the victim is waiting on that thread to exit and will actually cause future memory freeing, and - there is no guarantee that a thread is disabled from oom killing just because another thread sharing its mm is oom disabled. Signed-off-by: David Rientjes Reported-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Cc: Ying Han Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ---- fs/proc/base.c | 13 ------------- include/linux/mm_types.h | 3 --- kernel/exit.c | 2 -- kernel/fork.c | 10 +--------- mm/oom_kill.c | 23 +++++------------------ 6 files changed, 6 insertions(+), 49 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/exec.c b/fs/exec.c index 25dcbe5fc35..36254645b7c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); - if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&old_mm->oom_disable_count); - atomic_inc(&tsk->mm->oom_disable_count); - } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 5eb02069e1b..8f0087e20e1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1107,13 +1107,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_adjust != task->signal->oom_adj) { - if (oom_adjust == OOM_DISABLE) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_adj == OOM_DISABLE) - atomic_dec(&task->mm->oom_disable_count); - } - /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. @@ -1215,12 +1208,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_score_adj != task->signal->oom_score_adj) { - if (oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&task->mm->oom_disable_count); - } task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c93d00a6e95..6456624aa96 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -336,9 +336,6 @@ struct mm_struct { unsigned int token_priority; unsigned int last_interval; - /* How many tasks sharing this mm are OOM_DISABLE */ - atomic_t oom_disable_count; - unsigned long flags; /* Must use atomic bitops to access the bits */ struct core_state *core_state; /* coredumping support */ diff --git a/kernel/exit.c b/kernel/exit.c index 2913b3509d4..d0b7d988f87 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -681,8 +681,6 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 8e6b6f4fb27..70d76191afb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -501,7 +501,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); - atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -816,8 +815,6 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -1391,13 +1388,8 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) { - task_lock(p); - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&p->mm->oom_disable_count); - task_unlock(p); + if (p->mm) mmput(p->mm); - } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0d8943bc9f..2b97e8f0460 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -54,13 +54,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; - if (new_val != old_val) { - if (new_val == OOM_SCORE_ADJ_MIN) - atomic_inc(¤t->mm->oom_disable_count); - else if (old_val == OOM_SCORE_ADJ_MIN) - atomic_dec(¤t->mm->oom_disable_count); - current->signal->oom_score_adj = new_val; - } + current->signal->oom_score_adj = new_val; spin_unlock_irq(&sighand->siglock); return old_val; @@ -172,16 +166,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, if (!p) return 0; - /* - * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN - * so the entire heuristic doesn't need to be executed for something - * that cannot be killed. - */ - if (atomic_read(&p->mm->oom_disable_count)) { - task_unlock(p); - return 0; - } - /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. @@ -451,6 +435,9 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) for_each_process(q) if (q->mm == mm && !same_thread_group(q, p) && !(q->flags & PF_KTHREAD)) { + if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; + task_lock(q); /* Protect ->comm from prctl() */ pr_err("Kill process %d (%s) sharing same memory\n", task_pid_nr(q), q->comm); @@ -727,7 +714,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && - current->mm && !atomic_read(¤t->mm->oom_disable_count)) { + current->mm) { /* * oom_kill_process() needs tasklist_lock held. If it returns * non-zero, current could not be killed so we must fallback to -- cgit v1.2.3-70-g09d2