Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/core.c	3
-rw-r--r--	kernel/sched/fair.c	165
-rw-r--r--	kernel/sched/sched.h	5
3 files changed, 160 insertions, 13 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1fe59da280e..51092d5cc64 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1733,6 +1733,9 @@ static void __sched_fork(struct task_struct *p)
 	p->numa_work.next = &p->numa_work;
 	p->numa_faults = NULL;
 	p->numa_faults_buffer = NULL;
+
+	INIT_LIST_HEAD(&p->numa_entry);
+	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbe0f628efa..85565053a6e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -888,6 +888,17 @@ static unsigned int task_scan_max(struct task_struct *p)
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
 
+struct numa_group {
+	atomic_t refcount;
+
+	spinlock_t lock; /* nr_tasks, tasks */
+	int nr_tasks;
+	struct list_head task_list;
+
+	struct rcu_head rcu;
+	atomic_long_t faults[0];
+};
+
 static inline int task_faults_idx(int nid, int priv)
 {
 	return 2 * nid + priv;
@@ -1182,7 +1193,10 @@ static void task_numa_placement(struct task_struct *p)
 		int priv, i;
 
 		for (priv = 0; priv < 2; priv++) {
+			long diff;
+
 			i = task_faults_idx(nid, priv);
+			diff = -p->numa_faults[i];
 
 			/* Decay existing window, copy faults since last scan */
 			p->numa_faults[i] >>= 1;
@@ -1190,6 +1204,11 @@ static void task_numa_placement(struct task_struct *p)
 			p->numa_faults_buffer[i] = 0;
 
 			faults += p->numa_faults[i];
+			diff += p->numa_faults[i];
+			if (p->numa_group) {
+				/* safe because we can only change our own group */
+				atomic_long_add(diff, &p->numa_group->faults[i]);
+			}
 		}
 
 		if (faults > max_faults) {
@@ -1207,6 +1226,131 @@ static void task_numa_placement(struct task_struct *p)
 	}
 }
 
+static inline int get_numa_group(struct numa_group *grp)
+{
+	return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+	if (atomic_dec_and_test(&grp->refcount))
+		kfree_rcu(grp, rcu);
+}
+
+static void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+	if (l1 > l2)
+		swap(l1, l2);
+
+	spin_lock(l1);
+	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid)
+{
+	struct numa_group *grp, *my_grp;
+	struct task_struct *tsk;
+	bool join = false;
+	int cpu = cpupid_to_cpu(cpupid);
+	int i;
+
+	if (unlikely(!p->numa_group)) {
+		unsigned int size = sizeof(struct numa_group) +
+				    2*nr_node_ids*sizeof(atomic_long_t);
+
+		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+		if (!grp)
+			return;
+
+		atomic_set(&grp->refcount, 1);
+		spin_lock_init(&grp->lock);
+		INIT_LIST_HEAD(&grp->task_list);
+
+		for (i = 0; i < 2*nr_node_ids; i++)
+			atomic_long_set(&grp->faults[i], p->numa_faults[i]);
+
+		list_add(&p->numa_entry, &grp->task_list);
+		grp->nr_tasks++;
+		rcu_assign_pointer(p->numa_group, grp);
+	}
+
+	rcu_read_lock();
+	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+	if (!cpupid_match_pid(tsk, cpupid))
+		goto unlock;
+
+	grp = rcu_dereference(tsk->numa_group);
+	if (!grp)
+		goto unlock;
+
+	my_grp = p->numa_group;
+	if (grp == my_grp)
+		goto unlock;
+
+	/*
+	 * Only join the other group if its bigger; if we're the bigger group,
+	 * the other task will join us.
+	 */
+	if (my_grp->nr_tasks > grp->nr_tasks)
+		goto unlock;
+
+	/*
+	 * Tie-break on the grp address.
+	 */
+	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+		goto unlock;
+
+	if (!get_numa_group(grp))
+		goto unlock;
+
+	join = true;
+
+unlock:
+	rcu_read_unlock();
+
+	if (!join)
+		return;
+
+	for (i = 0; i < 2*nr_node_ids; i++) {
+		atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
+		atomic_long_add(p->numa_faults[i], &grp->faults[i]);
+	}
+
+	double_lock(&my_grp->lock, &grp->lock);
+
+	list_move(&p->numa_entry, &grp->task_list);
+	my_grp->nr_tasks--;
+	grp->nr_tasks++;
+
+	spin_unlock(&my_grp->lock);
+	spin_unlock(&grp->lock);
+
+	rcu_assign_pointer(p->numa_group, grp);
+
+	put_numa_group(my_grp);
+}
+
+void task_numa_free(struct task_struct *p)
+{
+	struct numa_group *grp = p->numa_group;
+	int i;
+
+	if (grp) {
+		for (i = 0; i < 2*nr_node_ids; i++)
+			atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
+
+		spin_lock(&grp->lock);
+		list_del(&p->numa_entry);
+		grp->nr_tasks--;
+		spin_unlock(&grp->lock);
+		rcu_assign_pointer(p->numa_group, NULL);
+		put_numa_group(grp);
+	}
+
+	kfree(p->numa_faults);
+}
+
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
@@ -1222,15 +1366,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated)
 	if (!p->mm)
 		return;
 
-	/*
-	 * First accesses are treated as private, otherwise consider accesses
-	 * to be private if the accessing pid has not changed
-	 */
-	if (!cpupid_pid_unset(last_cpupid))
-		priv = ((p->pid & LAST__PID_MASK) == cpupid_to_pid(last_cpupid));
-	else
-		priv = 1;
-
 	/* Allocate buffer to track faults on a per-node basis */
 	if (unlikely(!p->numa_faults)) {
 		int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
@@ -1245,6 +1380,18 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated)
 	}
 
 	/*
+	 * First accesses are treated as private, otherwise consider accesses
+	 * to be private if the accessing pid has not changed
+	 */
+	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+		priv = 1;
+	} else {
+		priv = cpupid_match_pid(p, last_cpupid);
+		if (!priv)
+			task_numa_group(p, last_cpupid);
+	}
+
+	/*
 	 * If pages are properly placed (did not migrate) then scan slower.
 	 * This is reset periodically in case of phase changes
 	 */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 691e96964dc..8037b10a256 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -559,10 +559,7 @@ static inline u64 rq_clock_task(struct rq *rq)
 #ifdef CONFIG_NUMA_BALANCING
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
-static inline void task_numa_free(struct task_struct *p)
-{
-	kfree(p->numa_faults);
-}
+extern void task_numa_free(struct task_struct *p);
 #else /* CONFIG_NUMA_BALANCING */
 static inline void task_numa_free(struct task_struct *p)
 {
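
The interplay between the new group accounting in task_numa_placement() and the join rule in task_numa_group() is easier to see outside the kernel. Below is a minimal, hypothetical user-space sketch, not kernel code: plain longs stand in for atomic_long_t, there is no RCU or locking, and the names numa_group_sketch, placement_update and should_join are invented for illustration. It shows the two rules the patch adds: the per-task fault window decays by half each scan and only the net change is folded into the group totals, and on a shared fault the smaller group joins the bigger one, with an address tie-break so only one side of any pair ever moves.

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES 2

/* Stand-in for struct numa_group: plain longs instead of atomic_long_t. */
struct numa_group_sketch {
	int nr_tasks;
	long faults[2 * NR_NODES];
};

/* Same flattened [node][priv] indexing the patch uses. */
static int task_faults_idx(int nid, int priv)
{
	return 2 * nid + priv;
}

/*
 * Mirrors the accounting added to task_numa_placement(): decay the task's
 * window by half, fold in the faults seen since the last scan, and add only
 * the net change (diff) to the group's totals.
 */
static void placement_update(long *task_faults, long *buffer,
			     struct numa_group_sketch *grp, int nid, int priv)
{
	int i = task_faults_idx(nid, priv);
	long diff = -task_faults[i];

	task_faults[i] >>= 1;
	task_faults[i] += buffer[i];
	buffer[i] = 0;

	diff += task_faults[i];
	if (grp)
		grp->faults[i] += diff;
}

/*
 * Mirrors the join rule in task_numa_group(): only move to the other group
 * if it is bigger, with the group address as tie-break, so for any pair of
 * groups exactly one side ever decides to move.
 */
static bool should_join(struct numa_group_sketch *mine,
			struct numa_group_sketch *other)
{
	if (mine->nr_tasks > other->nr_tasks)
		return false;
	if (mine->nr_tasks == other->nr_tasks && mine > other)
		return false;
	return true;
}

int main(void)
{
	struct numa_group_sketch a = { .nr_tasks = 3, .faults = { 8 } };
	struct numa_group_sketch b = { .nr_tasks = 1 };
	long faults[2 * NR_NODES] = { 8 };	/* current window, node 0 private */
	long buffer[2 * NR_NODES] = { 10 };	/* faults since last scan */

	placement_update(faults, buffer, &a, 0, 0);
	printf("task window %ld, group total %ld\n", faults[0], a.faults[0]);

	printf("b joins a: %s\n", should_join(&b, &a) ? "yes" : "no");
	printf("a joins b: %s\n", should_join(&a, &b) ? "yes" : "no");
	return 0;
}

Two properties of the patch carry over to the sketch: because only the owning task updates its own fault slots, propagating just the diff keeps the group totals consistent without taking the group lock (the "safe because we can only change our own group" comment), and the size-then-address ordering of the join decision means two tasks that fault on each other's pages cannot both try to migrate groups at once.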