diff options
author | Ingo Molnar <mingo@elte.hu> | 2011-12-20 20:32:03 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-12-20 20:32:11 +0100 |
commit | d87f69a16eb2cb96459117b06949a560679002e4 (patch) | |
tree | bbb55b2bd2b6c9e8bd4067aa3279783cb6bd9028 /kernel | |
parent | 124ba9403318d834ef21bcd899c22c870708d2c4 (diff) | |
parent | 384703b8e6cd4c8ef08512e596024e028c91c339 (diff) |
Merge commit 'v3.2-rc6' into perf/core
Merge reason: Update with the latest fixes.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup_freezer.c | 11 | ||||
-rw-r--r-- | kernel/events/core.c | 4 | ||||
-rw-r--r-- | kernel/fork.c | 5 | ||||
-rw-r--r-- | kernel/hrtimer.c | 6 | ||||
-rw-r--r-- | kernel/irq/manage.c | 7 | ||||
-rw-r--r-- | kernel/irq/spurious.c | 6 | ||||
-rw-r--r-- | kernel/lockdep.c | 8 | ||||
-rw-r--r-- | kernel/power/hibernate.c | 37 | ||||
-rw-r--r-- | kernel/power/main.c | 3 | ||||
-rw-r--r-- | kernel/printk.c | 3 | ||||
-rw-r--r-- | kernel/sched.c | 17 | ||||
-rw-r--r-- | kernel/sched_fair.c | 159 | ||||
-rw-r--r-- | kernel/sched_features.h | 1 | ||||
-rw-r--r-- | kernel/sched_rt.c | 3 | ||||
-rw-r--r-- | kernel/time/alarmtimer.c | 2 | ||||
-rw-r--r-- | kernel/time/clockevents.c | 1 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 62 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 2 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 92 | ||||
-rw-r--r-- | kernel/timer.c | 2 |
20 files changed, 352 insertions, 79 deletions
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5e828a2ca8e..213c0351dad 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss, kfree(cgroup_freezer(cgroup)); } +/* task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); +} + /* * The call to cgroup_lock() in the freezer.state write method prevents * a write to that file racing against an attach, and hence the @@ -231,7 +238,7 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (frozen(task)) + if (is_task_frozen_enough(task)) nfrozen++; } @@ -284,7 +291,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) continue; - if (frozen(task)) + if (is_task_frozen_enough(task)) continue; if (!freezing(task) && !freezer_should_skip(task)) num_cant_freeze_now++; diff --git a/kernel/events/core.c b/kernel/events/core.c index 3a3b1a18f49..2f8f3f103cb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2179,11 +2179,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); - if (ctx->nr_events) cpuctx->task_ctx = ctx; + perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); + perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); diff --git a/kernel/fork.c b/kernel/fork.c index ba0d1726132..da4a6a10d08 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -162,7 +162,6 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { - prop_local_destroy_single(&tsk->dirties); account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); @@ -274,10 +273,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack = ti; - err = prop_local_init_single(&tsk->dirties); - if (err) - goto out; - setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 422e567eecf..ae34bf51682 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { + struct timerqueue_node *next_timer; if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; - if (&timer->node == timerqueue_getnext(&base->active)) { + next_timer = timerqueue_getnext(&base->active); + timerqueue_del(&base->active, &timer->node); + if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ if (reprogram && hrtimer_hres_active()) { @@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - timerqueue_del(&base->active, &timer->node); if (!timerqueue_getnext(&base->active)) base->cpu_base->active_bases &= ~(1 << base->index); out: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 67ce837ae52..1da999f5e74 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -623,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) static int irq_wait_for_interrupt(struct irqaction *action) { + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { @@ -632,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action) return 0; } schedule(); + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return -1; } @@ -1596,7 +1599,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return -ENOMEM; action->handler = handler; - action->flags = IRQF_PERCPU; + action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index aa57d5da18c..dc813a948be 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) */ action = desc->action; if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER) || !action->next) + (action->flags & __IRQF_TIMER) || + (action->handler(irq, action->dev_id) == IRQ_HANDLED) || + !action->next) goto out; /* Already running on another processor */ @@ -115,7 +117,7 @@ static int misrouted_irq(int irq) struct irq_desc *desc; int i, ok = 0; - if (atomic_inc_return(&irq_poll_active) == 1) + if (atomic_inc_return(&irq_poll_active) != 1) goto out; irq_poll_cpu = smp_processor_id(); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index d2fab46a1c9..24f176c9fc9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -44,6 +44,7 @@ #include <linux/stringify.h> #include <linux/bitops.h> #include <linux/gfp.h> +#include <linux/kmemcheck.h> #include <asm/sections.h> @@ -2944,7 +2945,12 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - memset(lock, 0, sizeof(*lock)); + int i; + + kmemcheck_mark_initialized(lock, sizeof(*lock)); + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b4511b6d3ef..a6b0503574e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -55,6 +55,8 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; +static bool freezer_test_done; + static const struct platform_hibernation_ops *hibernation_ops; /** @@ -345,11 +347,24 @@ int hibernation_snapshot(int platform_mode) error = freeze_kernel_threads(); if (error) - goto Close; + goto Cleanup; + + if (hibernation_test(TEST_FREEZER) || + hibernation_testmode(HIBERNATION_TESTPROC)) { + + /* + * Indicate to the caller that we are returning due to a + * successful freezer test. + */ + freezer_test_done = true; + goto Cleanup; + } error = dpm_prepare(PMSG_FREEZE); - if (error) - goto Complete_devices; + if (error) { + dpm_complete(msg); + goto Cleanup; + } suspend_console(); pm_restrict_gfp_mask(); @@ -378,8 +393,6 @@ int hibernation_snapshot(int platform_mode) pm_restore_gfp_mask(); resume_console(); - - Complete_devices: dpm_complete(msg); Close: @@ -389,6 +402,10 @@ int hibernation_snapshot(int platform_mode) Recover_platform: platform_recover(platform_mode); goto Resume_devices; + + Cleanup: + swsusp_free(); + goto Close; } /** @@ -641,15 +658,13 @@ int hibernate(void) if (error) goto Finish; - if (hibernation_test(TEST_FREEZER)) - goto Thaw; - - if (hibernation_testmode(HIBERNATION_TESTPROC)) - goto Thaw; - error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error) goto Thaw; + if (freezer_test_done) { + freezer_test_done = false; + goto Thaw; + } if (in_suspend) { unsigned int flags = 0; diff --git a/kernel/power/main.c b/kernel/power/main.c index 71f49fe4377..36e0f0903c3 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -290,13 +290,14 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) break; } - if (state < PM_SUSPEND_MAX && *s) + if (state < PM_SUSPEND_MAX && *s) { error = enter_state(state); if (error) { suspend_stats.fail++; dpm_save_failed_errno(error); } else suspend_stats.success++; + } #endif Exit: diff --git a/kernel/printk.c b/kernel/printk.c index 1455a0d4eed..7982a0a841e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1293,10 +1293,11 @@ again: raw_spin_lock(&logbuf_lock); if (con_start != log_end) retry = 1; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + if (retry && console_trylock()) goto again; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) wake_up_klogd(); } diff --git a/kernel/sched.c b/kernel/sched.c index 0e9344a71be..d6b149ccf92 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> +#include <linux/init_task.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -4810,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not * interruptible. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) @@ -4824,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits for completion of a specific task to be signaled. It is * interruptible. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_interruptible(struct completion *x) { @@ -4841,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_interruptible_timeout(struct completion *x, @@ -4856,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); * * This waits to be signaled for completion of a specific task. It can be * interrupted by a kill signal. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_killable(struct completion *x) { @@ -4874,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable); * This waits for either a completion of a specific task to be * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_killable_timeout(struct completion *x, @@ -6099,6 +6113,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); +#if defined(CONFIG_SMP) + sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif } /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5c9e67923b7..a78ed2736ba 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +{ + long tg_weight; + + /* + * Use this CPU's actual weight instead of the last load_contribution + * to gain a more accurate current total weight. See + * update_cfs_rq_load_contribution(). + */ + tg_weight = atomic_read(&tg->load_weight); + tg_weight -= cfs_rq->load_contribution; + tg_weight += cfs_rq->load.weight; + + return tg_weight; +} + static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long load_weight, load, shares; + long tg_weight, load, shares; + tg_weight = calc_tg_weight(tg, cfs_rq); load = cfs_rq->load.weight; - load_weight = atomic_read(&tg->load_weight); - load_weight += load; - load_weight -= cfs_rq->load_contribution; - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; + if (tg_weight) + shares /= tg_weight; if (shares < MIN_SHARES) shares = MIN_SHARES; @@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. + * + * Calculate the effective load difference if @wl is added (subtracted) to @tg + * on this @cpu and results in a total addition (subtraction) of @wg to the + * total group weight. + * + * Given a runqueue weight distribution (rw_i) we can compute a shares + * distribution (s_i) using: + * + * s_i = rw_i / \Sum rw_j (1) + * + * Suppose we have 4 CPUs and our @tg is a direct child of the root group and + * has 7 equal weight tasks, distributed as below (rw_i), with the resulting + * shares distribution (s_i): + * + * rw_i = { 2, 4, 1, 0 } + * s_i = { 2/7, 4/7, 1/7, 0 } + * + * As per wake_affine() we're interested in the load of two CPUs (the CPU the + * task used to run on and the CPU the waker is running on), we need to + * compute the effect of waking a task on either CPU and, in case of a sync + * wakeup, compute the effect of the current task going to sleep. + * + * So for a change of @wl to the local @cpu with an overall group weight change + * of @wl we can compute the new shares distribution (s'_i) using: + * + * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) + * + * Suppose we're interested in CPUs 0 and 1, and want to compute the load + * differences in waking a task to CPU 0. The additional task changes the + * weight and shares distributions like: + * + * rw'_i = { 3, 4, 1, 0 } + * s'_i = { 3/8, 4/8, 1/8, 0 } + * + * We can then compute the difference in effective weight by using: + * + * dw_i = S * (s'_i - s_i) (3) + * + * Where 'S' is the group weight as seen by its parent. + * + * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) + * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - + * 4/7) times the weight of the group. */ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - if (!tg->parent) + if (!tg->parent) /* the trivial, non-cgroup case */ return wl; for_each_sched_entity(se) { - long lw, w; + long w, W; tg = se->my_q->tg; - w = se->my_q->load.weight; - /* use this cpu's instantaneous contribution */ - lw = atomic_read(&tg->load_weight); - lw -= se->my_q->load_contribution; - lw += w + wg; + /* + * W = @wg + \Sum rw_j + */ + W = wg + calc_tg_weight(tg, se->my_q); - wl += w; + /* + * w = rw_i + @wl + */ + w = se->my_q->load.weight + wl; - if (lw > 0 && wl < lw) - wl = (wl * tg->shares) / lw; + /* + * wl = S * s'_i; see (2) + */ + if (W > 0 && w < W) + wl = (w * tg->shares) / W; else wl = tg->shares; - /* zero point is MIN_SHARES */ + /* + * Per the above, wl is the new se->load.weight value; since + * those are clipped to [MIN_SHARES, ...) do so now. See + * calc_cfs_shares(). + */ if (wl < MIN_SHARES) wl = MIN_SHARES; + + /* + * wl = dw_i = S * (s'_i - s_i); see (3) + */ wl -= se->load.weight; + + /* + * Recursively apply this logic to all parent groups to compute + * the final effective load change on the root group. Since + * only the @tg group gets extra weight, all parent groups can + * only redistribute existing shares. @wl is the shift in shares + * resulting from this level per the above. + */ wg = 0; } @@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - int i; + struct sched_group *sg; + int i, smt = 0; /* * If the task is going to be woken-up on this cpu and if it is @@ -2269,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target) * Otherwise, iterate the domains and find an elegible idle cpu. */ rcu_read_lock(); +again: for_each_domain(target, sd) { - if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) - break; + if (!smt && (sd->flags & SD_SHARE_CPUPOWER)) + continue; - for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { - target = i; - break; + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) { + if (!smt) { + smt = 1; + goto again; } + break; } - /* - * Lets stop looking for an idle sibling when we reached - * the domain that spans the current cpu and prev_cpu. - */ - if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && - cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) - break; + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; + + for_each_cpu(i, sched_group_cpus(sg)) { + if (!idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; +next: + sg = sg->next; + } while (sg != sd->groups); } +done: rcu_read_unlock(); return target; @@ -3511,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, } /** - * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu diff --git a/kernel/sched_features.h b/kernel/sched_features.h index efa0a7b75dd..84802245abd 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -67,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1) SCHED_FEAT(TTWU_QUEUE, 1) SCHED_FEAT(FORCE_SD_OVERLAP, 0) +SCHED_FEAT(RT_RUNTIME_SHARE, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 056cbd2e2a2..583a1368afe 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -560,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; + if (!sched_feat(RT_RUNTIME_SHARE)) + return more; + if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c436e790b21..8a46f5d6450 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -195,7 +195,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) struct alarm *alarm; ktime_t expired = next->expires; - if (expired.tv64 >= now.tv64) + if (expired.tv64 > now.tv64) break; alarm = container_of(next, struct alarm, node); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ecd6ba36d6..c4eb71c8b2e 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -387,6 +387,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { + old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cf52fda2e09..da2f760e780 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -492,6 +492,22 @@ void clocksource_touch_watchdog(void) } /** + * clocksource_max_adjustment- Returns max adjustment amount + * @cs: Pointer to clocksource + * + */ +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ + u64 ret; + /* + * We won't try to correct for more then 11% adjustments (110,000 ppm), + */ + ret = (u64)cs->mult * 11; + do_div(ret,100); + return (u32)ret; +} + +/** * clocksource_max_deferment - Returns max time the clocksource can be deferred * @cs: Pointer to clocksource * @@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) /* * Calculate the maximum number of cycles that we can pass to the * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/cs->mult which - * is equivalent to the below. - * max_cycles < (2^63)/cs->mult - * max_cycles < 2^(log2((2^63)/cs->mult)) - * max_cycles < 2^(log2(2^63) - log2(cs->mult)) - * max_cycles < 2^(63 - log2(cs->mult)) - * max_cycles < 1 << (63 - log2(cs->mult)) + * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) + * which is equivalent to the below. + * max_cycles < (2^63)/(cs->mult + cs->maxadj) + * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) + * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) + * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) + * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) * Please note that we add 1 to the result of the log2 to account for * any rounding errors, ensure the above inequality is satisfied and * no overflow will occur. */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); /* * The actual maximum number of cycles we can defer the clocksource is * determined by the minimum of max_cycles and cs->mask. + * Note: Here we subtract the maxadj to make sure we don't sleep for + * too long if there's a large negative adjustment. */ max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, + cs->shift); /* * To ensure that the clocksource does not wrap whilst we are idle, @@ -529,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) * note a margin of 12.5% is used because this can be computed with * a shift, versus say 10% which would require division. */ - return max_nsecs - (max_nsecs >> 5); + return max_nsecs - (max_nsecs >> 3); } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs) void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; - /* * Calc the maximum number of seconds which we can run before * wrapping around. For clocksources which have a mask > 32bit @@ -651,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) * ~ 0.06ppm granularity for NTP. We apply the same 12.5% * margin as we do in clocksource_max_deferment() */ - sec = (cs->mask - (cs->mask >> 5)); + sec = (cs->mask - (cs->mask >> 3)); do_div(sec, freq); do_div(sec, scale); if (!sec) @@ -661,6 +679,20 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); + + /* + * for clocksources that have large mults, to avoid overflow. + * Since mult may be adjusted by ntp, add an safety extra margin + * + */ + cs->maxadj = clocksource_max_adjustment(cs); + while ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult)) { + cs->mult >>= 1; + cs->shift--; + cs->maxadj = clocksource_max_adjustment(cs); + } + cs->max_idle_ns = clocksource_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -701,6 +733,12 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); */ int clocksource_register(struct clocksource *cs) { + /* calculate max adjustment for given mult/shift */ + cs->maxadj = clocksource_max_adjustment(cs); + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + /* calculate max idle time permitted for this clocksource */ cs->max_idle_ns = clocksource_max_deferment(cs); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f954282d9a8..fd4a7b1625a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; - clockevents_exchange_device(NULL, dev); + clockevents_exchange_device(tick_broadcast_device.evtdev, dev); tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(dev); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e850..237841378c0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -249,6 +249,8 @@ ktime_t ktime_get(void) secs = xtime.tv_sec + wall_to_monotonic.tv_sec; nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); } while (read_seqretry(&xtime_lock, seq)); /* @@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts) *ts = xtime; tomono = wall_to_monotonic; nsecs = timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); } while (read_seqretry(&xtime_lock, seq)); @@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset) s64 error, interval = timekeeper.cycle_interval; int adj; + /* + * The point of this is to check if the error is greater then half + * an interval. + * + * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. + * + * Note we subtract one in the shift, so that error is really error*2. + * This "saves" dividing(shifting) intererval twice, but keeps the + * (error > interval) comparision as still measuring if error is + * larger then half an interval. + * + * Note: It does not "save" on aggrivation when reading the code. + */ error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { + /* + * We now divide error by 4(via shift), which checks if + * the error is greater then twice the interval. + * If it is greater, we need a bigadjust, if its smaller, + * we can adjust by 1. + */ error >>= 2; + /* + * XXX - In update_wall_time, we round up to the next + * nanosecond, and store the amount rounded up into + * the error. This causes the likely below to be unlikely. + * + * The properfix is to avoid rounding up by using + * the high precision timekeeper.xtime_nsec instead of + * xtime.tv_nsec everywhere. Fixing this will take some + * time. + */ if (likely(error <= interval)) adj = 1; else adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { + /* See comment above, this is just switched for the negative */ error >>= 2; if (likely(error >= -interval)) { adj = -1; @@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset) offset = -offset; } else adj = timekeeping_bigadjust(error, &interval, &offset); - } else + } else /* No adjustment needed */ return; + WARN_ONCE(timekeeper.clock->maxadj && + (timekeeper.mult + adj > timekeeper.clock->mult + + timekeeper.clock->maxadj), + "Adjusting %s more then 11%% (%ld vs %ld)\n", + timekeeper.clock->name, (long)timekeeper.mult + adj, + (long)timekeeper.clock->mult + + timekeeper.clock->maxadj); + /* + * So the following can be confusing. + * + * To keep things simple, lets assume adj == 1 for now. + * + * When adj != 1, remember that the interval and offset values + * have been appropriately scaled so the math is the same. + * + * The basic idea here is that we're increasing the multiplier + * by one, this causes the xtime_interval to be incremented by + * one cycle_interval. This is because: + * xtime_interval = cycle_interval * mult + * So if mult is being incremented by one: + * xtime_interval = cycle_interval * (mult + 1) + * Its the same as: + * xtime_interval = (cycle_interval * mult) + cycle_interval + * Which can be shortened to: + * xtime_interval += cycle_interval + * + * So offset stores the non-accumulated cycles. Thus the current + * time (in shifted nanoseconds) is: + * now = (offset * adj) + xtime_nsec + * Now, even though we're adjusting the clock frequency, we have + * to keep time consistent. In other words, we can't jump back + * in time, and we also want to avoid jumping forward in time. + * + * So given the same offset value, we need the time to be the same + * both before and after the freq adjustment. + * now = (offset * adj_1) + xtime_nsec_1 + * now = (offset * adj_2) + xtime_nsec_2 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_2) + xtime_nsec_2 + * And we know: + * adj_2 = adj_1 + 1 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * (adj_1+1)) + xtime_nsec_2 + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_1) + offset + xtime_nsec_2 + * Canceling the sides: + * xtime_nsec_1 = offset + xtime_nsec_2 + * Which gives us: + * xtime_nsec_2 = xtime_nsec_1 - offset + * Which simplfies to: + * xtime_nsec -= offset + * + * XXX - TODO: Doc ntp_error calculation. + */ timekeeper.mult += adj; timekeeper.xtime_interval += interval; timekeeper.xtime_nsec -= offset; diff --git a/kernel/timer.c b/kernel/timer.c index dbaa62422b1..9c3c62b0c4b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1368,7 +1368,7 @@ SYSCALL_DEFINE0(getppid) int pid; rcu_read_lock(); - pid = task_tgid_vnr(current->real_parent); + pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; |