diff options
Diffstat (limited to 'kernel/rcutree_plugin.h')
-rw-r--r-- | kernel/rcutree_plugin.h | 289 |
1 files changed, 231 insertions, 58 deletions
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4b9b9f8a418..8bb35d73e1f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; + int empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. - * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, + * so we must take a snapshot of the expedited state. */ + empty_exp_now = !rcu_preempted_readers_exp(rnp); if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report("preempt_rcu", rnp->gpnum, @@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ - if (!empty_exp && !rcu_preempted_readers_exp(rnp)) - rcu_report_exp_rnp(&rcu_preempt_state, rnp); + if (!empty_exp && empty_exp_now) + rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); } else { local_irq_restore(flags); } @@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * + * Most callers will set the "wake" flag, but the task initiating the + * expedited grace period need not wake itself. + * * Caller must hold sync_rcu_preempt_exp_mutex. 
*/ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { unsigned long flags; unsigned long mask; @@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - wake_up(&sync_rcu_preempt_exp_wq); + if (wake) + wake_up(&sync_rcu_preempt_exp_wq); break; } mask = rnp->grpmask; @@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) must_wait = 1; } if (!must_wait) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } /* @@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { - return; } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -static struct lock_class_key rcu_boost_class; - /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); - /* Avoid lockdep false positives. This rt_mutex is its own thing. */ - lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, - "rcu_boost_mutex"); t->rcu_boost_mutex = &mtx; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. 
*/ - return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; + return ACCESS_ONCE(rnp->exp_tasks) != NULL || + ACCESS_ONCE(rnp->boost_tasks) != NULL; } /* @@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void) } /* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return __get_cpu_var(rcu_cpu_kthread_task) == current; +} + +/* * Set the affinity of the boost kthread. The CPU-hotplug locks are * held, so no one should be messing with the existence of the boost * kthread. @@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void) WARN_ON_ONCE(1); } +static bool rcu_is_callbacks_kthread(void) +{ + return false; +} + static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } @@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void) * grace period works for us. */ get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started) - 1; + snap = atomic_read(&sync_sched_expedited_started); smp_mb(); /* ensure read is before try_stop_cpus(). */ } @@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); * 1 if so. This function is part of the RCU implementation; it is -not- * an exported member of the RCU API. * - * Because we have preemptible RCU, just check whether this CPU needs - * any flavor of RCU. Do not chew up lots of CPU cycles with preemption - * disabled in a most-likely vain attempt to cause RCU not to need this CPU. + * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs + * any flavor of RCU. */ int rcu_needs_cpu(int cpu) { - return rcu_needs_cpu_quick_check(cpu); + return rcu_cpu_has_callbacks(cpu); +} + +/* + * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. + */ +static void rcu_prepare_for_idle_init(int cpu) +{ +} + +/* + * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up + * after it. 
+ */ +static void rcu_cleanup_after_idle(int cpu) +{ +} + +/* + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, + * is nothing. + */ +static void rcu_prepare_for_idle(int cpu) +{ } #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#define RCU_NEEDS_CPU_FLUSHES 5 +/* + * This code is invoked when a CPU goes idle, at which point we want + * to have the CPU do everything required for RCU so that it can enter + * the energy-efficient dyntick-idle mode. This is handled by a + * state machine implemented by rcu_prepare_for_idle() below. + * + * The following three preprocessor symbols control this state machine: + * + * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt + * to satisfy RCU. Beyond this point, it is better to incur a periodic + * scheduling-clock interrupt than to loop through the state machine + * at full power. + * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are + * optional if RCU does not need anything immediately from this + * CPU, even if this CPU still has RCU callbacks queued. The first + * times through the state machine are mandatory: we need to give + * the state machine a chance to communicate a quiescent state + * to the RCU core. + * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted + * to sleep in dyntick-idle mode with RCU callbacks pending. This + * is sized to be roughly one RCU grace period. Those energy-efficiency + * benchmarkers who might otherwise be tempted to set this to a large + * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your + * system. And if you are -that- concerned about energy efficiency, + * just power the system down and be done with it! + * + * The values below work well in practice. If future workloads require + * adjustment, they can be converted into kernel config parameters, though + * making the state machine smarter might be a better option. + */ +#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. 
*/ +#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ +#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ + static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); +static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); +static ktime_t rcu_idle_gp_wait; /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + */ +int rcu_needs_cpu(int cpu) +{ + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) + return 0; + /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ + return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; +} + +/* + * Timer handler used to force CPU to start pushing its remaining RCU + * callbacks in the case where it entered dyntick-idle mode with callbacks + * pending. The handler doesn't really need to do anything because the + * real work is done upon re-entry to idle, or by the next scheduling-clock + * interrupt should idle not be re-entered. + */ +static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +{ + trace_rcu_prep_idle("Timer"); + return HRTIMER_NORESTART; +} + +/* + * Initialize the timer used to pull CPUs out of dyntick-idle mode. 
+ */ +static void rcu_prepare_for_idle_init(int cpu) +{ + static int firsttime = 1; + struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + + hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtp->function = rcu_idle_gp_timer_func; + if (firsttime) { + unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); + + rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); + firsttime = 0; + } +} + +/* + * Clean up for exit from idle. Because we are exiting from idle, there + * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * do nothing if this timer is not active, so just cancel it unconditionally. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); +} + +/* + * Check to see if any RCU-related work can be done by the current CPU, + * and if so, schedule a softirq to get it done. This function is part + * of the RCU implementation; it is -not- an exported member of the RCU API. * - * Because we are not supporting preemptible RCU, attempt to accelerate - * any current grace periods so that RCU no longer needs this CPU, but - * only if all other CPUs are already in dynticks-idle mode. This will - * allow the CPU cores to be powered down immediately, as opposed to after - * waiting many milliseconds for grace periods to elapse. + * The idea is for the current CPU to clear out all work required by the + * RCU core for the current grace period, so that this CPU can be permitted + * to enter dyntick-idle mode. In some cases, it will need to be awakened + * at the end of the grace period by whatever CPU ends the grace period. + * This allows CPUs to go dyntick-idle more quickly, and to reduce the + * number of wakeups by a modest integer factor. * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked * later. 
The per-cpu rcu_dyntick_drain variable controls the sequencing. + * + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu) +static void rcu_prepare_for_idle(int cpu) { - int c = 0; - int snap; - int thatcpu; - - /* Check for being in the holdoff period. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) - return rcu_needs_cpu_quick_check(cpu); - - /* Don't bother unless we are the last non-dyntick-idle CPU. */ - for_each_online_cpu(thatcpu) { - if (thatcpu == cpu) - continue; - snap = atomic_add_return(0, &per_cpu(rcu_dynticks, - thatcpu).dynticks); - smp_mb(); /* Order sampling of snap with end of grace period. */ - if ((snap & 0x1) != 0) { - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - return rcu_needs_cpu_quick_check(cpu); - } + unsigned long flags; + + local_irq_save(flags); + + /* + * If there are no callbacks on this CPU, enter dyntick-idle mode. + * Also reset state to avoid prejudicing later attempts. + */ + if (!rcu_cpu_has_callbacks(cpu)) { + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + per_cpu(rcu_dyntick_drain, cpu) = 0; + local_irq_restore(flags); + trace_rcu_prep_idle("No callbacks"); + return; + } + + /* + * If in holdoff mode, just return. We will presumably have + * refrained from disabling the scheduling-clock tick. + */ + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + local_irq_restore(flags); + trace_rcu_prep_idle("In holdoff"); + return; } /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; + per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; + } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + !rcu_pending(cpu)) { + /* Can we go dyntick-idle despite still having callbacks? 
*/ + trace_rcu_prep_idle("Dyntick with callbacks"); + per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_wait, HRTIMER_MODE_REL); + return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - return rcu_needs_cpu_quick_check(cpu); + local_irq_restore(flags); + trace_rcu_prep_idle("Begin holdoff"); + invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + return; } - /* Do one step pushing remaining RCU callbacks through. */ + /* + * Do one step of pushing the remaining RCU callbacks through + * the RCU core state machine. + */ +#ifdef CONFIG_TREE_PREEMPT_RCU + if (per_cpu(rcu_preempt_data, cpu).nxtlist) { + local_irq_restore(flags); + rcu_preempt_qs(cpu); + force_quiescent_state(&rcu_preempt_state, 0); + local_irq_save(flags); + } +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { + local_irq_restore(flags); rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); - c = c || per_cpu(rcu_sched_data, cpu).nxtlist; + local_irq_save(flags); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { + local_irq_restore(flags); rcu_bh_qs(cpu); force_quiescent_state(&rcu_bh_state, 0); - c = c || per_cpu(rcu_bh_data, cpu).nxtlist; + local_irq_save(flags); } - /* If RCU callbacks are still pending, RCU still needs this CPU. */ - if (c) + /* + * If RCU callbacks are still pending, RCU still needs this CPU. + * So try forcing the callbacks through the grace period. + */ + if (rcu_cpu_has_callbacks(cpu)) { + local_irq_restore(flags); + trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - return c; + } else { + local_irq_restore(flags); + trace_rcu_prep_idle("Callbacks drained"); + } } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |