diff options
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 97 |
1 files changed, 82 insertions, 15 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd081987a8e..b3d116cd072 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { + struct rcu_state *rsp; + struct rcu_data *rdp; + trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = @@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + do_nocb_deferred_wakeup(rdp); + } rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ @@ -411,11 +418,12 @@ static void rcu_eqs_enter(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; - else + rcu_eqs_enter_common(rdtp, oldval, user); + } else { rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_eqs_enter_common(rdtp, oldval, user); + } } /** @@ -533,11 +541,12 @@ static void rcu_eqs_exit(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) + if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else + } else { rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); + rcu_eqs_exit_common(rdtp, oldval, user); + } } /** @@ -716,7 +725,7 @@ bool rcu_lockdep_current_cpu_online(void) bool ret; if (in_nmi()) - return 1; + return true; preempt_disable(); rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; @@ -755,6 +764,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, } /* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + +/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -812,16 +827,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, */ rcu_kick_nohz_cpu(rdp->cpu); + /* + * Alternatively, the CPU might be running in the kernel + * for an extended period of time without a quiescent state. + * Attempt to force the CPU through the scheduler to gain the + * needed quiescent state, but only if the grace period has gone + * on for an uncommonly long time. If there are many stuck CPUs, + * we will beat on the first one until it gets unstuck, then move + * to the next. Only do this for the primary flavor of RCU. + */ + if (rdp->rsp == rcu_state && + ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { + rdp->rsp->jiffies_resched += 5; + resched_cpu(rdp->cpu); + } + return 0; } static void record_gp_stall_check_time(struct rcu_state *rsp) { unsigned long j = ACCESS_ONCE(jiffies); + unsigned long j1; rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ - rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); + j1 = rcu_jiffies_till_stall_check(); + rsp->jiffies_stall = j + j1; + rsp->jiffies_resched = j + j1 / 2; } /* @@ -1133,8 +1166,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) + if (rnp != rnp_root) { raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); + } /* * Get a new grace-period number. If there really is no grace @@ -1354,6 +1389,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); return; } + smp_mb__after_unlock_lock(); __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1368,6 +1404,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); if (rsp->gp_flags == 0) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); @@ -1409,6 +1446,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1463,6 +1501,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rsp->gp_flags &= ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } @@ -1480,6 +1519,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1505,16 +1545,19 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) __note_gp_changes(rsp, rnp, rdp); + /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ @@ -1553,6 +1596,7 @@ static int __noreturn rcu_gp_kthread(void *arg) wait_event_interruptible(rsp->gp_wq, ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); + /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; cond_resched(); @@ -1582,6 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg) (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); + /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) @@ -1749,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); WARN_ON_ONCE(rnp_c->qsmask); } @@ -1778,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum) { @@ -1901,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * Adopt the RCU callbacks from the specified rcu_state structure's * orphanage. The caller must hold the ->orphan_lock. */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ - if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) + if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) return; /* Do the accounting first. */ @@ -1986,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp); + rcu_adopt_orphan_cbs(rsp, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) @@ -2202,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp, cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (!rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2231,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rnp = rcu_get_root(rsp); if (rnp->qsmask == 0) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ } } @@ -2263,6 +2313,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* Reached the root of the rcu_node tree, acquire lock. */ raw_spin_lock_irqsave(&rnp_old->lock, flags); + smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -2303,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* If there are callbacks ready, invoke them. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) invoke_rcu_callbacks(rsp, rdp); + + /* Do any needed deferred wakeups of rcuo kthreads. */ + do_nocb_deferred_wakeup(rdp); } /* @@ -2378,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_node *rnp_root = rcu_get_root(rsp); raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); } else { @@ -2437,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (cpu != -1) rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy); + offline = !__call_rcu_nocb(rdp, head, lazy, flags); WARN_ON_ONCE(offline); /* _call_rcu() is illegal on offline CPU; leak the callback. */ local_irq_restore(flags); @@ -2757,6 +2812,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); + /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ + if (rcu_nohz_full_cpu(rsp)) + return 0; + /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { @@ -2790,6 +2849,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } + /* Does this CPU need a deferred NOCB wakeup? */ + if (rcu_nocb_need_deferred_wakeup(rdp)) { + rdp->n_rp_nocb_defer_wakeup++; + return 1; + } + /* nothing to do */ rdp->n_rp_need_nothing++; return 0; @@ -3214,9 +3279,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = rcu_num_lvls - 1; i > 0; i--) + rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -3346,6 +3411,8 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + rcu_fanout_leaf, nr_cpu_ids); /* * Compute number of nodes that can be handled an rcu_node tree |