From 10b777246c6953100099af1870d35c8b24d49b12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Nov 2007 22:39:37 +0100 Subject: sched: fix vslice vslice was missing a factor NICE_0_LOAD, as weight is in weight*NICE_0_LOAD units. the effect of this bug was larger initial slices and thus latency-noisier forks. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 01859f662ab..62b057603f0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -259,6 +259,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) { u64 vslice = __sched_period(nr_running); + vslice *= NICE_0_LOAD; do_div(vslice, rq_weight); return vslice; -- cgit v1.2.3-70-g09d2 From 2cb8600e6be4281e381d39e44de4359e46333e23 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Nov 2007 22:39:37 +0100 Subject: sched: documentation: place_entity() comments Add a few comments to place_entity(). No code changed. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 62b057603f0..8763bee6b66 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -473,19 +473,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) vruntime += sched_vslice(cfs_rq)/2; + /* + * The 'current' period is already promised to the current tasks, + * however the extra weight of the new task will slow them down a + * little, place the new task so that it fits in the slot that + * stays open at the end. + */ if (initial && sched_feat(START_DEBIT)) vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { + /* sleeps upto a single latency don't count. */ if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && task_of(se)->policy != SCHED_BATCH) vruntime -= sysctl_sched_latency; - vruntime = max_t(s64, vruntime, se->vruntime); + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se->vruntime, vruntime); } se->vruntime = vruntime; - } static void -- cgit v1.2.3-70-g09d2 From b2be5e96dc0b5a179cf4cb98e65cfb605752ca26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Nov 2007 22:39:37 +0100 Subject: sched: reintroduce the sched_min_granularity tunable we lost the sched_min_granularity tunable to a clever optimization that uses the sched_latency/min_granularity ratio - but the ratio is quite unintuitive to users and can also crash the kernel if the ratio is set to 0. So reintroduce the min_granularity tunable, while keeping the ratio maintained internally. no functionality changed. [ mingo@elte.hu: some fixlets. ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 +++++- kernel/sched_debug.c | 2 +- kernel/sched_fair.c | 35 ++++++++++++++++++++++++++++------- kernel/sysctl.c | 11 +++++++---- 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 155d7438f7a..5457b6234e1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1460,12 +1460,16 @@ extern void sched_idle_next(void); #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_nr_latency; +extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; + +int sched_nr_latency_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, + loff_t *ppos); #endif extern unsigned int sysctl_sched_compat_yield; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 415e5c38554..ca198a797bf 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -211,7 +211,7 @@ static int sched_debug_show(struct seq_file *m, void *v) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); - PN(sysctl_sched_nr_latency); + PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); PN(sysctl_sched_batch_wakeup_granularity); PN(sysctl_sched_child_runs_first); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8763bee6b66..c495dcf7031 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,16 +35,21 @@ const_debug unsigned int sysctl_sched_latency = 20000000ULL; /* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. + * Minimal preemption granularity for CPU-bound tasks: + * (default: 1 msec, units: nanoseconds) */ -const_debug unsigned int sysctl_sched_child_runs_first = 1; +const_debug unsigned int sysctl_sched_min_granularity = 1000000ULL; /* - * Minimal preemption granularity for CPU-bound tasks: - * (default: 2 msec, units: nanoseconds) + * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -const_debug unsigned int sysctl_sched_nr_latency = 20; +const_debug unsigned int sched_nr_latency = 20; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +const_debug unsigned int sysctl_sched_child_runs_first = 1; /* * sys_sched_yield() compat mode @@ -212,6 +217,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +#ifdef CONFIG_SCHED_DEBUG +int sched_nr_latency_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, + sysctl_sched_min_granularity); + + return 0; +} +#endif /* * The idea is to set a period in which each task runs once. @@ -224,7 +245,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) static u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; - unsigned long nr_latency = sysctl_sched_nr_latency; + unsigned long nr_latency = sched_nr_latency; if (unlikely(nr_running > nr_latency)) { period *= nr_running; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3b4efbe2644..6e3b63c0685 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -235,11 +235,14 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_nr_latency", - .data = &sysctl_sched_nr_latency, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_nr_latency_handler, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, }, { .ctl_name = CTL_UNNUMBERED, @@ -247,7 +250,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &sched_nr_latency_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, -- cgit v1.2.3-70-g09d2 From 9a41785cc43d88397f787a651ed7286a33f8462f Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 9 Nov 2007 22:39:37 +0100 Subject: sched: fix delay accounting regression Fix the delay accounting regression introduced by commit 75d4ef16a6aa84f708188bada182315f80aab6fa. rq no longer has sched_info data associated with it. task_struct sched_info structure is used by delay accounting to provide back statistics to user space. also remove direct use of sched_clock() (which is not a valid thing to do anymore) and use rq->clock instead. Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index ef1a7df80ea..630178e53bb 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -127,7 +127,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_set(var, val) do { } while (0) #endif -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -155,7 +155,7 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long long now = sched_clock(), delta = 0; + unsigned long long now = task_rq(t)->clock, delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -186,7 +186,7 @@ static inline void sched_info_queued(struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = sched_clock(); + t->sched_info.last_queued = task_rq(t)->clock; } /* @@ -195,7 +195,8 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - unsigned long long delta = sched_clock() - t->sched_info.last_arrival; + unsigned long long delta = task_rq(t)->clock - + t->sched_info.last_arrival; t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); @@ -231,5 +232,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ -- cgit v1.2.3-70-g09d2 From fa13a5a1f25f671d084d8884be96fc48d9b68275 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: sched: restore deterministic CPU accounting on powerpc Since powerpc started using CONFIG_GENERIC_CLOCKEVENTS, the deterministic CPU accounting (CONFIG_VIRT_CPU_ACCOUNTING) has been broken on powerpc, because we end up counting user time twice: once in timer_interrupt() and once in update_process_times(). This fixes the problem by pulling the code in update_process_times that updates utime and stime into a separate function called account_process_tick. If CONFIG_VIRT_CPU_ACCOUNTING is not defined, there is a version of account_process_tick in kernel/timer.c that simply accounts a whole tick to either utime or stime as before. If CONFIG_VIRT_CPU_ACCOUNTING is defined, then arch code gets to implement account_process_tick. This also lets us simplify the s390 code a bit; it means that the s390 timer interrupt can now call update_process_times even when CONFIG_VIRT_CPU_ACCOUNTING is turned on, and can just implement a suitable account_process_tick(). account_process_tick() now takes the task_struct * as an argument. Tested both with and without CONFIG_VIRT_CPU_ACCOUNTING. Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/time.c | 25 +------------------------ arch/s390/kernel/time.c | 4 ---- arch/s390/kernel/vtime.c | 8 +------- include/linux/sched.h | 1 + kernel/timer.c | 21 ++++++++++++++------- 6 files changed, 18 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b9d88374f14..41e13f4cc6e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -350,7 +350,7 @@ struct task_struct *__switch_to(struct task_struct *prev, local_irq_save(flags); account_system_vtime(current); - account_process_vtime(current); + account_process_tick(current, 0); calculate_steal_time(); last = _switch(old_thread, new_thread); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 9eb3284deac..a70dfb76d0a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -259,7 +259,7 @@ void account_system_vtime(struct task_struct *tsk) * user and system time records. * Must be called with interrupts disabled. */ -void account_process_vtime(struct task_struct *tsk) +void account_process_tick(struct task_struct *tsk, int user_tick) { cputime_t utime, utimescaled; @@ -274,18 +274,6 @@ void account_process_vtime(struct task_struct *tsk) account_user_time_scaled(tsk, utimescaled); } -static void account_process_time(struct pt_regs *regs) -{ - int cpu = smp_processor_id(); - - account_process_vtime(current); - run_local_timers(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_mode(regs)); - scheduler_tick(); - run_posix_cpu_timers(current); -} - /* * Stuff for accounting stolen time. */ @@ -375,7 +363,6 @@ static void snapshot_purr(void) #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ #define calc_cputime_factors() -#define account_process_time(regs) update_process_times(user_mode(regs)) #define calculate_steal_time() do { } while (0) #endif @@ -599,16 +586,6 @@ void timer_interrupt(struct pt_regs * regs) get_lppaca()->int_dword.fields.decr_int = 0; #endif - /* - * We cannot disable the decrementer, so in the period - * between this cpu's being marked offline in cpu_online_map - * and calling stop-self, it is taking timer interrupts. - * Avoid calling into the scheduler rebalancing code if this - * is the case. - */ - if (!cpu_is_offline(cpu)) - account_process_time(regs); - if (evt->event_handler) evt->event_handler(evt); else diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index a963fe81359..22b800ce212 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -145,12 +145,8 @@ void account_ticks(u64 time) do_timer(ticks); #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING - account_tick_vtime(current); -#else while (ticks--) update_process_times(user_mode(get_irq_regs())); -#endif s390_do_profile(); } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 84ff78de6ba..c5f05b3fb2c 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -32,7 +32,7 @@ static DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer); * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void account_tick_vtime(struct task_struct *tsk) +void account_process_tick(struct task_struct *tsk, int user_tick) { cputime_t cputime; __u64 timer, clock; @@ -64,12 +64,6 @@ void account_tick_vtime(struct task_struct *tsk) S390_lowcore.steal_clock -= cputime << 12; account_steal_time(tsk, cputime); } - - run_local_timers(); - if (rcu_pending(smp_processor_id())) - rcu_check_callbacks(smp_processor_id(), rcu_user_flag); - scheduler_tick(); - run_posix_cpu_timers(tsk); } /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 5457b6234e1..951759e30c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -254,6 +254,7 @@ long io_schedule_timeout(long timeout); extern void cpu_init (void); extern void trap_init(void); +extern void account_process_tick(struct task_struct *task, int user); extern void update_process_times(int user); extern void scheduler_tick(void); diff --git a/kernel/timer.c b/kernel/timer.c index 00e44e2afd6..a05817c021d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -817,6 +817,19 @@ unsigned long next_timer_interrupt(void) #endif +#ifndef CONFIG_VIRT_CPU_ACCOUNTING +void account_process_tick(struct task_struct *p, int user_tick) +{ + if (user_tick) { + account_user_time(p, jiffies_to_cputime(1)); + account_user_time_scaled(p, jiffies_to_cputime(1)); + } else { + account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); + account_system_time_scaled(p, jiffies_to_cputime(1)); + } +} +#endif + /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. @@ -827,13 +840,7 @@ void update_process_times(int user_tick) int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. */ - if (user_tick) { - account_user_time(p, jiffies_to_cputime(1)); - account_user_time_scaled(p, jiffies_to_cputime(1)); - } else { - account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); - account_system_time_scaled(p, jiffies_to_cputime(1)); - } + account_process_tick(p, user_tick); run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); -- cgit v1.2.3-70-g09d2 From 19978ca610946ed57c071bad63f8f6642ca1298b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: sched: reintroduce SMP tunings again Yanmin Zhang reported an aim7 regression and bisected it down to: | commit 38ad464d410dadceda1563f36bdb0be7fe4c8938 | Author: Ingo Molnar | Date: Mon Oct 15 17:00:02 2007 +0200 | | sched: uniform tunings | | use the same defaults on both UP and SMP. fix this by reintroducing similar SMP tunings again. This resolves the regression. (also update the comments to match the ilog2(nr_cpus) tuning effect) Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++++++++++++++++ kernel/sched_fair.c | 18 +++++++++--------- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3f6bd111290..69cae271c63 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4992,6 +4992,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static inline void sched_init_granularity(void) +{ + unsigned int factor = 1 + ilog2(num_online_cpus()); + const unsigned long limit = 200000000; + + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; + + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_wakeup_granularity *= factor; + sysctl_sched_batch_wakeup_granularity *= factor; +} + #ifdef CONFIG_SMP /* * This is how migration works: @@ -6688,10 +6714,12 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); + sched_init_granularity(); } #else void __init sched_init_smp(void) { + sched_init_granularity(); } #endif /* CONFIG_SMP */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c495dcf7031..7264814ba62 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -22,7 +22,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 20ms, units: nanoseconds) + * (default: 20ms * ilog(ncpus), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -32,18 +32,18 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -const_debug unsigned int sysctl_sched_latency = 20000000ULL; +unsigned int sysctl_sched_latency = 20000000ULL; /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec, units: nanoseconds) + * (default: 1 msec * ilog(ncpus), units: nanoseconds) */ -const_debug unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -const_debug unsigned int sched_nr_latency = 20; +unsigned int sched_nr_latency = 20; /* * After fork, child runs first. (default) If set to 0 then @@ -61,23 +61,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_BATCH wake-up granularity. - * (default: 10 msec, units: nanoseconds) + * (default: 10 msec * ilog(ncpus), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec, units: nanoseconds) + * (default: 10 msec * ilog(ncpus), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 10000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -- cgit v1.2.3-70-g09d2 From d6322faf296ab5bbb597f8b0abcb50153754cd08 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: sched: cleanup, use NSEC_PER_MSEC and NSEC_PER_SEC 1) hardcoded 1000000000 value is used five times in places where NSEC_PER_SEC might be more readable. 2) A conversion from nsec to msec uses the hardcoded 1000000 value, which is a candidate for NSEC_PER_MSEC. no code changed: text data bss dec hex filename 44359 3326 36 47721 ba69 sched.o.before 44359 3326 36 47721 ba69 sched.o.after Signed-off-by: Eric Dumazet Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- kernel/sysctl.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 69cae271c63..387258cdd0b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,7 +75,7 @@ */ unsigned long long __attribute__((weak)) sched_clock(void) { - return (unsigned long long)jiffies * (1000000000 / HZ); + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } /* @@ -99,8 +99,8 @@ unsigned long long __attribute__((weak)) sched_clock(void) /* * Some helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT @@ -7256,7 +7256,7 @@ static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft) spin_unlock_irqrestore(&cpu_rq(i)->lock, flags); } /* Convert from ns to ms */ - do_div(res, 1000000); + do_div(res, NSEC_PER_MSEC); return res; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6e3b63c0685..adddf682d4c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -226,9 +226,9 @@ static struct ctl_table root_table[] = { #ifdef CONFIG_SCHED_DEBUG static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ -static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ +static unsigned long max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ -static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ +static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { -- cgit v1.2.3-70-g09d2 From 4e2947f12516d13446d6ffa1d9e4fbd33b1636fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: x86: make ipi_handler() always defined prepare for up_smp_call_function() to ensure that the 'func' pointer is unused. (which is related to a KVM build fix) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 9abbdf7562c..3b20613325d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -139,13 +139,12 @@ struct set_mtrr_data { mtrr_type smp_type; }; -#ifdef CONFIG_SMP - static void ipi_handler(void *info) /* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. [RETURNS] Nothing. */ { +#ifdef CONFIG_SMP struct set_mtrr_data *data = info; unsigned long flags; @@ -168,9 +167,8 @@ static void ipi_handler(void *info) atomic_dec(&data->count); local_irq_restore(flags); -} - #endif +} static inline int types_compatible(mtrr_type type1, mtrr_type type2) { return type1 == MTRR_TYPE_UNCACHABLE || -- cgit v1.2.3-70-g09d2 From 0492007ed9b53f6a2a2f983910d0fe7c97b09822 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: x86: make nmi_cpu_busy() always defined nmi_cpu_busy() must be available on !SMP too. this is in preparation to a smp_call_function_mask() fix. Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index f803ed0ed1c..600fd404e44 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); static int endflag __initdata = 0; -#ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all * CPUs during the test make them busy. */ static __init void nmi_cpu_busy(void *data) { +#ifdef CONFIG_SMP local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, @@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data) care if they get somewhat less cycles. */ while (endflag == 0) mb(); -} #endif +} static int __init check_nmi_watchdog(void) { -- cgit v1.2.3-70-g09d2 From a5fbb6d1064be885d2a6b82f625186753cf74848 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:38 +0100 Subject: KVM: fix !SMP build error fix a !SMP build error: drivers/kvm/kvm_main.c: In function 'kvm_flush_remote_tlbs': drivers/kvm/kvm_main.c:220: error: implicit declaration of function 'smp_call_function_mask' (and also avoid unused function warning related to up_smp_call_function() not making use of the 'func' parameter.) Signed-off-by: Ingo Molnar --- include/linux/smp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 259a13c3bd9..c25e66bcecf 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -84,11 +84,12 @@ void smp_prepare_boot_cpu(void); * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 -static inline int up_smp_call_function(void) +static inline int up_smp_call_function(void (*func)(void *), void *info) { return 0; } -#define smp_call_function(func,info,retry,wait) (up_smp_call_function()) +#define smp_call_function(func, info, retry, wait) \ + (up_smp_call_function(func, info)) #define on_each_cpu(func,info,retry,wait) \ ({ \ local_irq_disable(); \ @@ -107,6 +108,8 @@ static inline void smp_send_reschedule(int cpu) { } local_irq_enable(); \ 0; \ }) +#define smp_call_function_mask(mask, func, info, wait) \ + (up_smp_call_function(func, info)) #endif /* !SMP */ -- cgit v1.2.3-70-g09d2 From 52d3da1ad4f442cec877fbeb83902707b56da0cf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: turn off PREEMPT_RESTRICT PREEMPT_RESTRICT was a method aimed at reducing the amount of wakeup related preemption. It has a disadvantage though, it can prevent legitimate wakeups if a task is 'unlucky' to be hit too early by a tick that clears peer_preempt. Now that the wakeup preemption has been cleaned up we dont seem to have excessive preemptions anymore, so this feature can be turned off. (and removed in the next patch) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 387258cdd0b..4b23dfb4c80 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -469,7 +469,7 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_TREE_AVG * 0 | SCHED_FEAT_APPROX_AVG * 0 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_PREEMPT_RESTRICT * 1; + SCHED_FEAT_PREEMPT_RESTRICT * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) -- cgit v1.2.3-70-g09d2 From 3e3e13f399ac8060a20d14d210a28dc02dda372e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: remove PREEMPT_RESTRICT remove PREEMPT_RESTRICT. (this is a separate commit so that any regression related to the removal itself is bisectable) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 4 +--- kernel/sched_fair.c | 10 ++-------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 951759e30c0..93fd30d6dac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -863,7 +863,6 @@ struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; - int peer_preempt; u64 exec_start; u64 sum_exec_runtime; diff --git a/kernel/sched.c b/kernel/sched.c index 4b23dfb4c80..2a107e4ad5e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -460,7 +460,6 @@ enum { SCHED_FEAT_TREE_AVG = 4, SCHED_FEAT_APPROX_AVG = 8, SCHED_FEAT_WAKEUP_PREEMPT = 16, - SCHED_FEAT_PREEMPT_RESTRICT = 32, }; const_debug unsigned int sysctl_sched_features = @@ -468,8 +467,7 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_TREE_AVG * 0 | SCHED_FEAT_APPROX_AVG * 0 | - SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_PREEMPT_RESTRICT * 0; + SCHED_FEAT_WAKEUP_PREEMPT * 1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7264814ba62..fbcb426029d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -546,7 +546,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { - se->peer_preempt = 0; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -574,10 +573,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime || - (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) + if (delta_exec > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); - curr->peer_preempt = 0; } static void @@ -867,9 +864,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) gran = calc_delta_fair(gran, &se->load); if (delta > gran) { - int now = !sched_feat(PREEMPT_RESTRICT); - - if (now || p->prio < curr->prio || !se->peer_preempt++) + if (p->prio < curr->prio) resched_task(curr); } } @@ -1083,7 +1078,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) swap(curr->vruntime, se->vruntime); } - se->peer_preempt = 0; enqueue_task_fair(rq, p, 0); resched_task(rq->curr); } -- cgit v1.2.3-70-g09d2 From 8bc6767acb3236e0345e99cf198168e60e7ae456 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: wakeup preemption fix wakeup preemption fix: do not make it dependent on p->prio. Preemption purely depends on ->vruntime. This improves preemption in mixed-nice-level workloads. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fbcb426029d..a3badf52bba 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -863,10 +863,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se->load.weight != NICE_0_LOAD)) gran = calc_delta_fair(gran, &se->load); - if (delta > gran) { - if (p->prio < curr->prio) - resched_task(curr); - } + if (delta > gran) + resched_task(curr); } } -- cgit v1.2.3-70-g09d2 From 77d9cc44b543fa831169e54c495ad06ef3a0c726 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: clean up the wakeup preempt check clean up the wakeup preemption check. No code changed: text data bss dec hex filename 44227 3326 36 47589 b9e5 sched.o.before 44227 3326 36 47589 b9e5 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a3badf52bba..d558716a9ad 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -852,20 +852,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(p->policy == SCHED_BATCH)) return; - if (sched_feat(WAKEUP_PREEMPT)) { - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); - } - - delta = se->vruntime - pse->vruntime; - gran = sysctl_sched_wakeup_granularity; - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + if (!sched_feat(WAKEUP_PREEMPT)) + return; - if (delta > gran) - resched_task(curr); + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); } + + delta = se->vruntime - pse->vruntime; + gran = sysctl_sched_wakeup_granularity; + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); + + if (delta > gran) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.2.3-70-g09d2 From 502d26b524d8980f3ed80d9aec398e85671a8160 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: clean up the wakeup preempt check, #2 clean up the preemption check to not use unnecessary 64-bit variables. This improves code size: text data bss dec hex filename 44227 3326 36 47589 b9e5 sched.o.before 44201 3326 36 47563 b9cb sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d558716a9ad..6c361472cc7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -837,7 +837,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - s64 delta, gran; + unsigned long gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -860,12 +860,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) pse = parent_entity(pse); } - delta = se->vruntime - pse->vruntime; gran = sysctl_sched_wakeup_granularity; if (unlikely(se->load.weight != NICE_0_LOAD)) gran = calc_delta_fair(gran, &se->load); - if (delta > gran) + if (pse->vruntime + gran < se->vruntime) resched_task(curr); } -- cgit v1.2.3-70-g09d2 From 3c90e6e99b08f01d5684a3a07cceae6a543e4fa8 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: fix copy_namespace() <-> sched_fork() dependency in do_fork Sukadev Bhattiprolu reported a kernel crash with control groups. There are couple of problems discovered by Suka's test: - The test requires the cgroup filesystem to be mounted with atleast the cpu and ns options (i.e both namespace and cpu controllers are active in the same hierarchy). # mkdir /dev/cpuctl # mount -t cgroup -ocpu,ns none cpuctl (or simply) # mount -t cgroup none cpuctl -> Will activate all controllers in same hierarchy. - The test invokes clone() with CLONE_NEWNS set. This causes a a new child to be created, also a new group (do_fork->copy_namespaces->ns_cgroup_clone-> cgroup_clone) and the child is attached to the new group (cgroup_clone-> attach_task->sched_move_task). At this point in time, the child's scheduler related fields are uninitialized (including its on_rq field, which it has inherited from parent). As a result sched_move_task thinks its on runqueue, when it isn't. As a solution to this problem, I moved sched_fork() call, which initializes scheduler related fields on a new task, before copy_namespaces(). I am not sure though whether moving up will cause other side-effects. Do you see any issue? - The second problem exposed by this test is that task_new_fair() assumes that parent and child will be part of the same group (which needn't be as this test shows). As a result, cfs_rq->curr can be NULL for the child. The solution is to test for curr pointer being NULL in task_new_fair(). With the patch below, I could run ns_exec() fine w/o a crash. Reported-by: Sukadev Bhattiprolu Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- kernel/fork.c | 6 +++--- kernel/sched_fair.c | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 28a74015198..8ca1a14cdc8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1123,6 +1123,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->blocked_on = NULL; /* not blocked yet */ #endif + /* Perform scheduler related setup. Assign this task to a CPU. */ + sched_fork(p, clone_flags); + if ((retval = security_task_alloc(p))) goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) @@ -1212,9 +1215,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->ptrace_children); INIT_LIST_HEAD(&p->ptrace_list); - /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p, clone_flags); - /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible * on the tasklist. */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c361472cc7..d3c03070872 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1067,8 +1067,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) update_curr(cfs_rq); place_entity(cfs_rq, se, 1); + /* 'curr' will be NULL if the child belongs to a different group */ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr->vruntime < se->vruntime) { + curr && curr->vruntime < se->vruntime) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. -- cgit v1.2.3-70-g09d2 From b82d9fdd848abfbe7263a4ecd9bbb55e575100a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: avoid large irq-latencies in smp-balancing SMP balancing is done with IRQs disabled and can iterate the full rq. When rqs are large this can cause large irq-latencies. Limit the nr of iterations on each run. This fixes a scheduling latency regression reported by the -rt folks. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Tested-by: Gregory Haskins Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 15 ++++++++++----- kernel/sysctl.c | 8 ++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 93fd30d6dac..2cc789fef71 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1466,6 +1466,7 @@ extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_nr_migrate; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index 2a107e4ad5e..e195a422941 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -471,6 +471,12 @@ const_debug unsigned int sysctl_sched_features = #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): @@ -2235,7 +2241,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct rq_iterator *iterator) { - int pulled = 0, pinned = 0, skip_for_load; + int loops = 0, pulled = 0, pinned = 0, skip_for_load; struct task_struct *p; long rem_load_move = max_load_move; @@ -2249,10 +2255,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, */ p = iterator->start(iterator->arg); next: - if (!p) + if (!p || loops++ > sysctl_sched_nr_migrate) goto out; /* - * To help distribute high priority tasks accross CPUs we don't + * To help distribute high priority tasks across CPUs we don't * skip a task if it will be the highest priority task (i.e. smallest * prio value) on its new queue regardless of its load weight */ @@ -2269,8 +2275,7 @@ next: rem_load_move -= p->se.load.weight; /* - * We only want to steal up to the prescribed number of tasks - * and the prescribed amount of weighted load. + * We only want to steal up to the prescribed amount of weighted load. */ if (rem_load_move > 0) { if (p->prio < *this_best_prio) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index adddf682d4c..3a1744fed2b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -301,6 +301,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_nr_migrate", + .data = &sysctl_sched_nr_migrate, + .maxlen = sizeof(unsigned int), + .mode = 644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3-70-g09d2 From e6fe6649b4ec11aa3075e394b4d8743eebe1f64c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 9 Nov 2007 22:39:39 +0100 Subject: sched: proper prototype for kernel/sched.c:migration_init() This patch adds a proper prototype for migration_init() in include/linux/sched.h Since there's no point in always returning 0 to a caller that doesn't check the return value it also changes the function to return void. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++++ init/main.c | 4 +--- kernel/sched.c | 4 +--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2cc789fef71..ee800e7a70d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1988,6 +1988,14 @@ static inline void inc_syscw(struct task_struct *tsk) } #endif +#ifdef CONFIG_SMP +void migration_init(void); +#else +static inline void migration_init(void) +{ +} +#endif + #endif /* __KERNEL__ */ #endif diff --git a/init/main.c b/init/main.c index f605a969ea6..80b04b6c515 100644 --- a/init/main.c +++ b/init/main.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -747,11 +748,8 @@ __setup("nosoftlockup", nosoftlockup_setup); static void __init do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); -#ifdef CONFIG_SMP - extern int migration_init(void); migration_init(); -#endif spawn_ksoftirqd(); if (!nosoftlockup) spawn_softlockup_task(); diff --git a/kernel/sched.c b/kernel/sched.c index e195a422941..b18f231a487 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5650,7 +5650,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -int __init migration_init(void) +void __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -5660,8 +5660,6 @@ int __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - - return 0; } #endif -- cgit v1.2.3-70-g09d2