diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 112 |
1 files changed, 87 insertions, 25 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d26a16e4543..66237ff8b01 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p) sched_setnuma(p, env.dst_nid); + /* + * Reset the scan period if the task is being rescheduled on an + * alternative node to recheck if the tasks is now properly placed. + */ + p->numa_scan_period = task_scan_min(p); + if (env.best_task == NULL) { int ret = migrate_task_to(p, env.best_cpu); return ret; @@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p) p->numa_migrate_retry = jiffies + HZ*5; } +/* + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS + * increments. The more local the fault statistics are, the higher the scan + * period will be for the next scan window. If local/remote ratio is below + * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the + * scan period will decrease + */ +#define NUMA_PERIOD_SLOTS 10 +#define NUMA_PERIOD_THRESHOLD 3 + +/* + * Increase the scan period (slow down scanning) if the majority of + * our memory is already on our local node, or if the majority of + * the page accesses are shared with other processes. + * Otherwise, decrease the scan period. + */ +static void update_task_scan_period(struct task_struct *p, + unsigned long shared, unsigned long private) +{ + unsigned int period_slot; + int ratio; + int diff; + + unsigned long remote = p->numa_faults_locality[0]; + unsigned long local = p->numa_faults_locality[1]; + + /* + * If there were no record hinting faults then either the task is + * completely idle or all activity is areas that are not of interest + * to automatic numa balancing. Scan slower + */ + if (local + shared == 0) { + p->numa_scan_period = min(p->numa_scan_period_max, + p->numa_scan_period << 1); + + p->mm->numa_next_scan = jiffies + + msecs_to_jiffies(p->numa_scan_period); + + return; + } + + /* + * Prepare to scale scan period relative to the current period. + * == NUMA_PERIOD_THRESHOLD scan period stays the same + * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) + * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) + */ + period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); + ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + if (ratio >= NUMA_PERIOD_THRESHOLD) { + int slot = ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else { + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; + + /* + * Scale scan rate increases based on sharing. There is an + * inverse relationship between the degree of sharing and + * the adjustment made to the scanning period. Broadly + * speaking the intent is that there is little point + * scanning faster if shared accesses dominate as it may + * simply bounce migrations uselessly + */ + period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + } + + p->numa_scan_period = clamp(p->numa_scan_period + diff, + task_scan_min(p), task_scan_max(p)); + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; + unsigned long fault_types[2] = { 0, 0 }; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p) /* Decay existing window, copy faults since last scan */ p->numa_faults[i] >>= 1; p->numa_faults[i] += p->numa_faults_buffer[i]; + fault_types[priv] += p->numa_faults_buffer[i]; p->numa_faults_buffer[i] = 0; faults += p->numa_faults[i]; @@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p) } } + update_task_scan_period(p, fault_types[0], fault_types[1]); + if (p->numa_group) { /* * If the preferred task and group nids are different, @@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) BUG_ON(p->numa_faults_buffer); p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); p->total_numa_faults = 0; + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } /* @@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } - /* - * If pages are properly placed (did not migrate) then scan slower. - * This is reset periodically in case of phase changes - */ - if (!migrated) { - /* Initialise if necessary */ - if (!p->numa_scan_period_max) - p->numa_scan_period_max = task_scan_max(p); - - p->numa_scan_period = min(p->numa_scan_period_max, - p->numa_scan_period + 10); - } - task_numa_placement(p); /* Retry task to preferred node migration if it previously failed */ @@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) p->numa_pages_migrated += pages; p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -1702,18 +1776,6 @@ void task_numa_work(struct callback_head *work) out: /* - * If the whole process was scanned without updates then no NUMA - * hinting faults are being recorded and scan rate should be lower. - */ - if (mm->numa_scan_offset == 0 && !nr_pte_updates) { - p->numa_scan_period = min(p->numa_scan_period_max, - p->numa_scan_period << 1); - - next_scan = now + msecs_to_jiffies(p->numa_scan_period); - mm->numa_next_scan = next_scan; - } - - /* * It is possible to reach the end of the VMA list but the last few * VMAs are not guaranteed to the vma_migratable. If they are not, we * would find the !migratable VMA on the next scan but not reset the |