From 3ad2f3fbb961429d2aa627465ae4829758bc7e07 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 3 Feb 2010 08:01:28 +0800 Subject: tree-wide: Assorted spelling fixes In particular, several occurances of funny versions of 'success', 'unknown', 'therefore', 'acknowledge', 'argument', 'achieve', 'address', 'beginning', 'desirable', 'separate' and 'necessary' are fixed. Signed-off-by: Daniel Mack Cc: Joe Perches Cc: Junio C Hamano Signed-off-by: Jiri Kosina --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index abdfacc5865..a70957b138e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1533,7 +1533,7 @@ struct task_struct { struct list_head *scm_work_list; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* Index of current stored adress in ret_stack */ + /* Index of current stored address in ret_stack */ int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; -- cgit v1.2.3-70-g09d2 From db1466b3e1bd1727375cdbfcbea4bcce2f860f61 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 07:46:56 -0800 Subject: rcu: Use wrapper function instead of exporting tasklist_lock Lockdep-RCU commit d11c563d exported tasklist_lock, which is not a good thing. This patch instead exports a function that uses lockdep to check whether tasklist_lock is held. Suggested-by: Christoph Hellwig Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: Christoph Hellwig LKML-Reference: <1267631219-8713-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/cred.h | 2 +- include/linux/sched.h | 4 ++++ kernel/exit.c | 2 +- kernel/fork.c | 9 ++++++++- kernel/pid.c | 4 +++- 5 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/cred.h b/include/linux/cred.h index 4db09f89b63..52507c3e138 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -280,7 +280,7 @@ static inline void put_cred(const struct cred *_cred) * task or by holding tasklist_lock to prevent it from being unlinked. */ #define __task_cred(task) \ - ((const struct cred *)(rcu_dereference_check((task)->real_cred, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)))) + ((const struct cred *)(rcu_dereference_check((task)->real_cred, rcu_read_lock_held() || lockdep_tasklist_lock_is_held()))) /** * get_task_cred - Get another task's objective credentials diff --git a/include/linux/sched.h b/include/linux/sched.h index 0eef87b58ea..a47af2064dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -258,6 +258,10 @@ extern spinlock_t mmlist_lock; struct task_struct; +#ifdef CONFIG_PROVE_RCU +extern int lockdep_tasklist_lock_is_held(void); +#endif /* #ifdef CONFIG_PROVE_RCU */ + extern void sched_init(void); extern void sched_init_smp(void); extern asmlinkage void schedule_tail(struct task_struct *prev); diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf..fed3a4db6f0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference_check(tsk->sighand, rcu_read_lock_held() || - lockdep_is_held(&tasklist_lock)); + lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356..8691c540a47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -EXPORT_SYMBOL_GPL(tasklist_lock); + +#ifdef CONFIG_PROVE_RCU +int lockdep_tasklist_lock_is_held(void) +{ + return lockdep_is_held(&tasklist_lock); +} +EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); +#endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { diff --git a/kernel/pid.c b/kernel/pid.c index b08e697cd83..b6064405f36 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); + first = rcu_dereference_check(pid->tasks[type].first, + rcu_read_lock_held() || + lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pids[(type)].node); } -- cgit v1.2.3-70-g09d2 From d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:39 -0800 Subject: mm: clean up mm_counter Presently, per-mm statistics counter is defined by macro in sched.h This patch modifies it to - defined in mm.h as inlinf functions - use array instead of macro's name creation. This patch is for reducing patch size in future patch to modify implementation of per-mm counter. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +- include/linux/mm.h | 104 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 33 ++++++++++----- include/linux/sched.h | 54 ------------------------ kernel/fork.c | 3 +- kernel/tsacct.c | 1 + mm/filemap_xip.c | 2 +- mm/fremap.c | 2 +- mm/memory.c | 56 +++++++++++++++---------- mm/oom_kill.c | 4 +- mm/rmap.c | 10 ++--- mm/swapfile.c | 2 +- 12 files changed, 174 insertions(+), 101 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f277c4a111c..37558127601 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -65,11 +65,11 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - *shared = get_mm_counter(mm, file_rss); + *shared = get_mm_counter(mm, MM_FILEPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; - *resident = *shared + get_mm_counter(mm, anon_rss); + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 90957f14195..2124cdb2d1d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -870,6 +870,110 @@ extern int mprotect_fixup(struct vm_area_struct *vma, */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +/* + * per-process(per-mm_struct) statistics. + */ +#if USE_SPLIT_PTLOCKS +/* + * The mm counters are not protected by its page_table_lock, + * so must be incremented atomically. + */ +static inline void set_mm_counter(struct mm_struct *mm, int member, long value) +{ + atomic_long_set(&mm->rss_stat.count[member], value); +} + +static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]); +} + +static inline void add_mm_counter(struct mm_struct *mm, int member, long value) +{ + atomic_long_add(value, &mm->rss_stat.count[member]); +} + +static inline void inc_mm_counter(struct mm_struct *mm, int member) +{ + atomic_long_inc(&mm->rss_stat.count[member]); +} + +static inline void dec_mm_counter(struct mm_struct *mm, int member) +{ + atomic_long_dec(&mm->rss_stat.count[member]); +} + +#else /* !USE_SPLIT_PTLOCKS */ +/* + * The mm counters are protected by its page_table_lock, + * so can be incremented directly. + */ +static inline void set_mm_counter(struct mm_struct *mm, int member, long value) +{ + mm->rss_stat.count[member] = value; +} + +static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + return mm->rss_stat.count[member]; +} + +static inline void add_mm_counter(struct mm_struct *mm, int member, long value) +{ + mm->rss_stat.count[member] += value; +} + +static inline void inc_mm_counter(struct mm_struct *mm, int member) +{ + mm->rss_stat.count[member]++; +} + +static inline void dec_mm_counter(struct mm_struct *mm, int member) +{ + mm->rss_stat.count[member]--; +} + +#endif /* !USE_SPLIT_PTLOCKS */ + +static inline unsigned long get_mm_rss(struct mm_struct *mm) +{ + return get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_ANONPAGES); +} + +static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) +{ + return max(mm->hiwater_rss, get_mm_rss(mm)); +} + +static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) +{ + return max(mm->hiwater_vm, mm->total_vm); +} + +static inline void update_hiwater_rss(struct mm_struct *mm) +{ + unsigned long _rss = get_mm_rss(mm); + + if ((mm)->hiwater_rss < _rss) + (mm)->hiwater_rss = _rss; +} + +static inline void update_hiwater_vm(struct mm_struct *mm) +{ + if (mm->hiwater_vm < mm->total_vm) + mm->hiwater_vm = mm->total_vm; +} + +static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, + struct mm_struct *mm) +{ + unsigned long hiwater_rss = get_mm_hiwater_rss(mm); + + if (*maxrss < hiwater_rss) + *maxrss = hiwater_rss; +} + /* * A callback you can register to apply pressure to ageable caches. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36f96271306..e1ca64be667 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -24,12 +24,6 @@ struct address_space; #define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) -#if USE_SPLIT_PTLOCKS -typedef atomic_long_t mm_counter_t; -#else /* !USE_SPLIT_PTLOCKS */ -typedef unsigned long mm_counter_t; -#endif /* !USE_SPLIT_PTLOCKS */ - /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -201,6 +195,22 @@ struct core_state { struct completion startup; }; +enum { + MM_FILEPAGES, + MM_ANONPAGES, + NR_MM_COUNTERS +}; + +#if USE_SPLIT_PTLOCKS +struct mm_rss_stat { + atomic_long_t count[NR_MM_COUNTERS]; +}; +#else /* !USE_SPLIT_PTLOCKS */ +struct mm_rss_stat { + unsigned long count[NR_MM_COUNTERS]; +}; +#endif /* !USE_SPLIT_PTLOCKS */ + struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -227,11 +237,6 @@ struct mm_struct { * by mmlist_lock */ - /* Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. - */ - mm_counter_t _file_rss; - mm_counter_t _anon_rss; unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ @@ -244,6 +249,12 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + /* + * Special counters, in some configurations protected by the + * page_table_lock, in other configurations by being atomic. + */ + struct mm_rss_stat rss_stat; + struct linux_binfmt *binfmt; cpumask_t cpu_vm_mask; diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b1753f7e48..cbeafa49a53 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -396,60 +396,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif -#if USE_SPLIT_PTLOCKS -/* - * The mm counters are not protected by its page_table_lock, - * so must be incremented atomically. - */ -#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) -#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) -#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) -#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) -#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) - -#else /* !USE_SPLIT_PTLOCKS */ -/* - * The mm counters are protected by its page_table_lock, - * so can be incremented directly. - */ -#define set_mm_counter(mm, member, value) (mm)->_##member = (value) -#define get_mm_counter(mm, member) ((mm)->_##member) -#define add_mm_counter(mm, member, value) (mm)->_##member += (value) -#define inc_mm_counter(mm, member) (mm)->_##member++ -#define dec_mm_counter(mm, member) (mm)->_##member-- - -#endif /* !USE_SPLIT_PTLOCKS */ - -#define get_mm_rss(mm) \ - (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) -#define update_hiwater_rss(mm) do { \ - unsigned long _rss = get_mm_rss(mm); \ - if ((mm)->hiwater_rss < _rss) \ - (mm)->hiwater_rss = _rss; \ -} while (0) -#define update_hiwater_vm(mm) do { \ - if ((mm)->hiwater_vm < (mm)->total_vm) \ - (mm)->hiwater_vm = (mm)->total_vm; \ -} while (0) - -static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) -{ - return max(mm->hiwater_rss, get_mm_rss(mm)); -} - -static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, - struct mm_struct *mm) -{ - unsigned long hiwater_rss = get_mm_hiwater_rss(mm); - - if (*maxrss < hiwater_rss) - *maxrss = hiwater_rss; -} - -static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) -{ - return max(mm->hiwater_vm, mm->total_vm); -} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356..7616bcf107b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); + memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048ed..0a67e041edf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * fill in basic accounting fields diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 1888b2d71bb..78b94f0b6d5 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -194,7 +194,7 @@ retry: flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); page_cache_release(page); diff --git a/mm/fremap.c b/mm/fremap.c index b6ec85abbb3..46f5dacf90a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, page_remove_rmap(page); page_cache_release(page); update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); } } else { if (!pte_file(pte)) diff --git a/mm/memory.c b/mm/memory.c index 72fb5f39bcc..c5767847880 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -121,6 +121,7 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but @@ -376,12 +377,18 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) return 0; } -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +static inline void init_rss_vec(int *rss) { - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); } /* @@ -632,7 +639,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - rss[PageAnon(page)]++; + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; } out_set_pte: @@ -648,11 +658,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; again: - rss[1] = rss[0] = 0; + init_rss_vec(rss); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; @@ -688,7 +699,7 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(orig_src_pte); - add_mm_rss(dst_mm, rss[0], rss[1]); + add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -816,8 +827,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm = tlb->mm; pte_t *pte; spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; + int rss[NR_MM_COUNTERS]; + + init_rss_vec(rss); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -863,14 +875,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - anon_rss--; + rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); - file_rss--; + rss[MM_FILEPAGES]--; } page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) @@ -893,7 +905,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - add_mm_rss(mm, file_rss, anon_rss); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -1527,7 +1539,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, file_rss); + inc_mm_counter(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2163,11 +2175,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2604,7 +2616,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2688,7 +2700,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); @@ -2842,10 +2854,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, file_rss); + inc_mm_counter(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 237050478f2..35755a4156d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -401,8 +401,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(p), p->comm, K(p->mm->total_vm), - K(get_mm_counter(p->mm, anon_rss)), - K(get_mm_counter(p->mm, file_rss))); + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); /* diff --git a/mm/rmap.c b/mm/rmap.c index 278cd277bde..73d0472884c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -815,9 +815,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { if (PageAnon(page)) - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { @@ -839,7 +839,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration @@ -857,7 +857,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); page_remove_rmap(page); page_cache_release(page); @@ -996,7 +996,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c0585b1641..893984946a2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -840,7 +840,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - inc_mm_counter(vma->vm_mm, anon_rss); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); -- cgit v1.2.3-70-g09d2 From 34e55232e59f7b19050267a05ff1226e5cd122a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:40 -0800 Subject: mm: avoid false sharing of mm_counter Considering the nature of per mm stats, it's the shared object among threads and can be a cache-miss point in the page fault path. This patch adds per-thread cache for mm_counter. RSS value will be counted into a struct in task_struct and synchronized with mm's one at events. Now, in this patch, the event is the number of calls to handle_mm_fault. Per-thread value is added to mm at each 64 calls. rough estimation with small benchmark on parallel thread (2threads) shows [before] 4.5 cache-miss/faults [after] 4.0 cache-miss/faults Anyway, the most contended object is mmap_sem if the number of threads grows. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 6 +++ fs/exec.c | 1 + include/linux/mm.h | 8 ++-- include/linux/mm_types.h | 6 +++ include/linux/sched.h | 4 +- kernel/exit.c | 3 +- mm/memory.c | 94 ++++++++++++++++++++++++++++++++++---- 7 files changed, 107 insertions(+), 15 deletions(-) (limited to 'include/linux/sched.h') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a..e418f3d8f42 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file contains details information about the process itself. Its fields are explained in Table 1-4. +(for SMP CONFIG users) +For making accounting scalable, RSS related information are handled in +asynchronous manner and the vaule may not be very precise. To see a precise +snapshot of a moment, you can see /proc//smaps file and scan page table. +It's slow but very precise. + Table 1-2: Contents of the statm files (as of 2.6.30-rc7) .............................................................................. Field Content diff --git a/fs/exec.c b/fs/exec.c index cce6bbdbdbb..ea7861727ef 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(tsk, old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 2124cdb2d1d..8e580c07d17 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -873,7 +873,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, /* * per-process(per-mm_struct) statistics. */ -#if USE_SPLIT_PTLOCKS +#if defined(SPLIT_RSS_COUNTING) /* * The mm counters are not protected by its page_table_lock, * so must be incremented atomically. @@ -883,10 +883,7 @@ static inline void set_mm_counter(struct mm_struct *mm, int member, long value) atomic_long_set(&mm->rss_stat.count[member], value); } -static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) -{ - return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]); -} +unsigned long get_mm_counter(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { @@ -974,6 +971,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, *maxrss = hiwater_rss; } +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); /* * A callback you can register to apply pressure to ageable caches. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e1ca64be667..21861239ab0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -202,9 +202,15 @@ enum { }; #if USE_SPLIT_PTLOCKS +#define SPLIT_RSS_COUNTING struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; }; +/* per-thread cached information, */ +struct task_rss_stat { + int events; /* for synchronization threshold */ + int count[NR_MM_COUNTERS]; +}; #else /* !USE_SPLIT_PTLOCKS */ struct mm_rss_stat { unsigned long count[NR_MM_COUNTERS]; diff --git a/include/linux/sched.h b/include/linux/sched.h index cbeafa49a53..46c6f8d5dc0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1220,7 +1220,9 @@ struct task_struct { struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; - +#if defined(SPLIT_RSS_COUNTING) + struct task_rss_stat rss_stat; +#endif /* task state */ int exit_state; int exit_code, exit_signal; diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf..10d3c5d5ae4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - + /* sync mm's RSS info before statistics gathering */ + sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/mm/memory.c b/mm/memory.c index c5767847880..a4597614f18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -122,6 +122,79 @@ static int __init init_zero_pfn(void) core_initcall(init_zero_pfn); +#if defined(SPLIT_RSS_COUNTING) + +void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (task->rss_stat.count[i]) { + add_mm_counter(mm, i, task->rss_stat.count[i]); + task->rss_stat.count[i] = 0; + } + } + task->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + __sync_task_rss_stat(task, task->mm); +} + +unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + long val = 0; + + /* + * Don't use task->mm here...for avoiding to use task_get_mm().. + * The caller must guarantee task->mm is not invalid. + */ + val = atomic_long_read(&mm->rss_stat.count[member]); + /* + * counter is updated in asynchronous manner and may go to minus. + * But it's never be expected number for users. + */ + if (val < 0) + return 0; + return (unsigned long)val; +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ + __sync_task_rss_stat(task, mm); +} +#else + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task) +{ +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ +} +#endif + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but @@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; + if (current->mm == mm) + sync_mm_rss(current, mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); @@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2175,11 +2250,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, MM_FILEPAGES); - inc_mm_counter(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); @@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; @@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -- cgit v1.2.3-70-g09d2 From 93c59907c6f247d09239135caecf294a106a2ae0 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:03 -0800 Subject: copy_signal() cleanup: clean thread_group_cputime_init() Remove unneeded initializations in thread_group_cputime_init() and in posix_cpu_timers_init_group(). They are useless after kmem_cache_zalloc() was used in copy_signal(). Signed-off-by: Veaceslav Falico Acked-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 -- kernel/fork.c | 11 ----------- 2 files changed, 13 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 46c6f8d5dc0..ca635c12848 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2391,9 +2391,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); static inline void thread_group_cputime_init(struct signal_struct *sig) { - sig->cputimer.cputime = INIT_CPUTIME; spin_lock_init(&sig->cputimer.lock); - sig->cputimer.running = 0; } static inline void thread_group_cputime_free(struct signal_struct *sig) diff --git a/kernel/fork.c b/kernel/fork.c index ce2666f84d8..1beb6c303c4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -833,17 +833,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) /* Thread group counters. */ thread_group_cputime_init(sig); - /* Expiration times and increments. */ - sig->it[CPUCLOCK_PROF].expires = cputime_zero; - sig->it[CPUCLOCK_PROF].incr = cputime_zero; - sig->it[CPUCLOCK_VIRT].expires = cputime_zero; - sig->it[CPUCLOCK_VIRT].incr = cputime_zero; - - /* Cached expiration times. */ - sig->cputime_expires.prof_exp = cputime_zero; - sig->cputime_expires.virt_exp = cputime_zero; - sig->cputime_expires.sched_exp = 0; - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); -- cgit v1.2.3-70-g09d2