From fe15d706cfc1cb321dbe2329b04b5ca185edff60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jan 2012 13:30:33 -0800 Subject: rcu: Add lockdep-RCU checks for simple self-deadlock It is illegal to have a grace period within a same-flavor RCU read-side critical section, so this commit adds lockdep-RCU checks to splat when such abuse is encountered. This commit does not detect more elaborate RCU deadlock situations. These situations might be a job for lockdep enhancements. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8bb35d73e1f..3680b6b35bf 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -688,6 +688,10 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; wait_rcu_gp(call_rcu); -- cgit v1.2.3-70-g09d2 From 486e259340fc4c60474f2c14703e3b3634bb58ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Jan 2012 14:11:30 -0800 Subject: rcu: Avoid waking up CPUs having only kfree_rcu() callbacks When CONFIG_RCU_FAST_NO_HZ is enabled, RCU will allow a given CPU to enter dyntick-idle mode even if it still has RCU callbacks queued. RCU avoids system hangs in this case by scheduling a timer for several jiffies in the future. However, if all of the callbacks on that CPU are from kfree_rcu(), there is no reason to wake the CPU up, as it is not a problem to defer freeing of memory. This commit therefore tracks the number of callbacks on a given CPU that are from kfree_rcu(), and avoids scheduling the timer if all of a given CPU's callbacks are from kfree_rcu(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- include/linux/rcutiny.h | 6 ++++ include/linux/rcutree.h | 2 ++ include/trace/events/rcu.h | 63 ++++++++++++++++++++++-------------- kernel/rcu.h | 4 ++- kernel/rcutiny.c | 4 +-- kernel/rcutree.c | 29 ++++++++++------- kernel/rcutree.h | 3 +- kernel/rcutree_plugin.h | 79 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcutree_trace.c | 8 ++--- 10 files changed, 153 insertions(+), 47 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 81c04f4348e..a67d5f1072e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -841,7 +841,7 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset) /* See the kfree_rcu() header comment. 
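For concreteness, the self-deadlock that the rcu_lockdep_assert() added in synchronize_rcu() above is meant to catch looks roughly like the following sketch; the function name is invented for the example, and the pattern is shown only because it is the bug being detected:

        #include <linux/rcupdate.h>

        static void broken_updater(void)
        {
                rcu_read_lock();        /* begin RCU read-side critical section */

                /*
                 * Illegal: synchronize_rcu() must wait for all pre-existing
                 * RCU readers, and this task is itself such a reader, so it
                 * ends up waiting on itself.  With CONFIG_PROVE_RCU=y the
                 * new assertion reports this at the point of the call,
                 * making the bug visible before the resulting hang.
                 */
                synchronize_rcu();

                rcu_read_unlock();
        }
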
*/ BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); - call_rcu(head, (rcu_callback)offset); + kfree_call_rcu(head, (rcu_callback)offset); } /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 00b7a5e493d..51bf29c8148 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -83,6 +83,12 @@ static inline void synchronize_sched_expedited(void) synchronize_sched(); } +static inline void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + call_rcu(head, func); +} + #ifdef CONFIG_TINY_RCU static inline void rcu_preempt_note_context_switch(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 73e7195f999..73892483fd0 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -61,6 +61,8 @@ extern void synchronize_rcu_bh(void); extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); +void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + static inline void synchronize_rcu_bh_expedited(void) { synchronize_sched_expedited(); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d2d88bed891..337099783f3 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -313,19 +313,22 @@ TRACE_EVENT(rcu_prep_idle, /* * Tracepoint for the registration of a single RCU callback function. * The first argument is the type of RCU, the second argument is - * a pointer to the RCU callback itself, and the third element is the - * new RCU callback queue length for the current CPU. + * a pointer to the RCU callback itself, the third element is the + * number of lazy callbacks queued, and the fourth element is the + * total number of callbacks queued. */ TRACE_EVENT(rcu_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen), + TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen_lazy, + long qlen), - TP_ARGS(rcuname, rhp, qlen), + TP_ARGS(rcuname, rhp, qlen_lazy, qlen), TP_STRUCT__entry( __field(char *, rcuname) __field(void *, rhp) __field(void *, func) + __field(long, qlen_lazy) __field(long, qlen) ), @@ -333,11 +336,13 @@ TRACE_EVENT(rcu_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->func = rhp->func; + __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%pf %ld", - __entry->rcuname, __entry->rhp, __entry->func, __entry->qlen) + TP_printk("%s rhp=%p func=%pf %ld/%ld", + __entry->rcuname, __entry->rhp, __entry->func, + __entry->qlen_lazy, __entry->qlen) ); /* @@ -345,20 +350,21 @@ TRACE_EVENT(rcu_callback, * kfree() form. The first argument is the RCU type, the second argument * is a pointer to the RCU callback, the third argument is the offset * of the callback within the enclosing RCU-protected data structure, - * and the fourth argument is the new RCU callback queue length for the - * current CPU. + * the fourth argument is the number of lazy callbacks queued, and the + * fifth argument is the total number of callbacks queued. 
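For reference, the caller-side idiom that ends up on this lazy path is plain kfree_rcu(); a minimal sketch (structure and function names invented for the example) is:

        #include <linux/rcupdate.h>

        struct foo {                    /* hypothetical RCU-protected object */
                int data;
                struct rcu_head rcu;    /* storage for the deferred free */
        };

        static void retire_foo(struct foo *fp)
        {
                /*
                 * kfree_rcu() records the offset of ->rcu within struct foo
                 * in place of a callback function and queues the rcu_head.
                 * After this series, on the tree variants that queuing goes
                 * through kfree_call_rcu() and is counted in ->qlen_lazy,
                 * so an otherwise-idle CPU need not be awakened merely to
                 * run kfree().
                 */
                kfree_rcu(fp, rcu);
        }

Such callbacks are "lazy" in the sense that deferring them only delays freeing of memory, never the execution of arbitrary code.
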
*/ TRACE_EVENT(rcu_kfree_callback, TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen), + long qlen_lazy, long qlen), - TP_ARGS(rcuname, rhp, offset, qlen), + TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen), TP_STRUCT__entry( __field(char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) + __field(long, qlen_lazy) __field(long, qlen) ), @@ -366,41 +372,45 @@ TRACE_EVENT(rcu_kfree_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->offset = offset; + __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%ld %ld", + TP_printk("%s rhp=%p func=%ld %ld/%ld", __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen) + __entry->qlen_lazy, __entry->qlen) ); /* * Tracepoint for marking the beginning rcu_do_batch, performed to start * RCU callback invocation. The first argument is the RCU flavor, - * the second is the total number of callbacks (including those that - * are not yet ready to be invoked), and the third argument is the - * current RCU-callback batch limit. + * the second is the number of lazy callbacks queued, the third is + * the total number of callbacks queued, and the fourth argument is + * the current RCU-callback batch limit. */ TRACE_EVENT(rcu_batch_start, - TP_PROTO(char *rcuname, long qlen, int blimit), + TP_PROTO(char *rcuname, long qlen_lazy, long qlen, int blimit), - TP_ARGS(rcuname, qlen, blimit), + TP_ARGS(rcuname, qlen_lazy, qlen, blimit), TP_STRUCT__entry( __field(char *, rcuname) + __field(long, qlen_lazy) __field(long, qlen) __field(int, blimit) ), TP_fast_assign( __entry->rcuname = rcuname; + __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; __entry->blimit = blimit; ), - TP_printk("%s CBs=%ld bl=%d", - __entry->rcuname, __entry->qlen, __entry->blimit) + TP_printk("%s CBs=%ld/%ld bl=%d", + __entry->rcuname, __entry->qlen_lazy, __entry->qlen, + __entry->blimit) ); /* @@ -531,16 +541,21 @@ TRACE_EVENT(rcu_torture_read, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) -#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, qsmask) do { } while (0) +#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ + qsmask) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) -#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) +#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ + grplo, grphi, gp_tasks) do { } \ + while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) #define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) #define trace_rcu_prep_idle(reason) do { } while (0) -#define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) -#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) -#define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) +#define trace_rcu_callback(rcuname, rhp, qlen_lazy, qlen) do { } while (0) +#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen_lazy, qlen) \ + do { } while (0) +#define trace_rcu_batch_start(rcuname, qlen_lazy, qlen, blimit) \ + do { } while (0) #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) #define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, 
iit, risk) \ diff --git a/kernel/rcu.h b/kernel/rcu.h index aa88baab5f7..a074b0b43fc 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -76,16 +76,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) extern void kfree(const void *); -static inline void __rcu_reclaim(char *rn, struct rcu_head *head) +static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; if (__is_kfree_rcu_offset(offset)) { RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); + return 1; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); + return 0; } } diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 8e00d461911..4eb34fcc2a7 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -258,7 +258,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* If no RCU callbacks ready to invoke, just return. */ if (&rcp->rcucblist == rcp->donetail) { - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, ACCESS_ONCE(rcp->rcucblist), need_resched(), @@ -269,7 +269,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 570f7530f4b..acf2d67ad2f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1261,6 +1261,7 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen_lazy += rdp->qlen_lazy; receive_rdp->qlen += rdp->qlen; receive_rdp->n_cbs_adopted += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; @@ -1268,6 +1269,7 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; rdp->qlen = 0; } @@ -1368,11 +1370,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count; + int bl, count, count_lazy; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, 0, 0); + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -1385,7 +1387,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ local_irq_save(flags); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen, bl); + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); list = rdp->nxtlist; rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1396,12 +1398,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); /* Invoke callbacks. */ - count = 0; + count = count_lazy = 0; while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - __rcu_reclaim(rsp->name, list); + if (__rcu_reclaim(rsp->name, list)) + count_lazy++; list = next; /* Stop only if limit reached and CPU has something to do. 
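The reason __rcu_reclaim() can classify callbacks this way is that __kfree_rcu() stores a small structure offset in ->func instead of a real function pointer. The following is a simplified sketch of the decode step, not the exact kernel helpers, assuming the usual "offsets are far smaller than any code address" test:

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        static bool is_kfree_offset(unsigned long offset)
        {
                return offset < 4096;   /* real callback addresses are much larger */
        }

        static bool reclaim_one(struct rcu_head *head)
        {
                unsigned long offset = (unsigned long)head->func;

                if (is_kfree_offset(offset)) {
                        kfree((void *)head - offset);   /* lazy: only frees memory */
                        return true;
                }
                head->func(head);                       /* non-lazy: runs arbitrary code */
                return false;
        }

This is why rcu_do_batch() above can count the lazy callbacks it invokes simply by looking at __rcu_reclaim()'s return value.
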
*/ if (++count >= bl && @@ -1416,6 +1419,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ + rdp->qlen_lazy -= count_lazy; rdp->qlen -= count; rdp->n_cbs_invoked += count; if (list != NULL) { @@ -1702,7 +1706,7 @@ static void invoke_rcu_core(void) static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp) + struct rcu_state *rsp, bool lazy) { unsigned long flags; struct rcu_data *rdp; @@ -1727,12 +1731,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; + if (lazy) + rdp->qlen_lazy++; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen); + rdp->qlen_lazy, rdp->qlen); else - trace_rcu_callback(rsp->name, head, rdp->qlen); + trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { @@ -1779,16 +1785,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state); + __call_rcu(head, func, &rcu_sched_state, 0); } EXPORT_SYMBOL_GPL(call_rcu_sched); /* - * Queue an RCU for invocation after a quicker grace period. + * Queue an RCU callback for invocation after a quicker grace period. */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_bh_state); + __call_rcu(head, func, &rcu_bh_state, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -2036,6 +2042,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index fddff92d667..af2af3cc5e6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -265,7 +265,8 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; - long qlen; /* # of queued callbacks */ + long qlen_lazy; /* # of lazy queued callbacks */ + long qlen; /* # of queued callbacks, incl lazy */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3680b6b35bf..7adf232bb66 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -671,10 +671,24 @@ static void rcu_preempt_do_callbacks(void) */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state); + __call_rcu(head, func, &rcu_preempt_state, 0); } EXPORT_SYMBOL_GPL(call_rcu); +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). 
+ */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_preempt_state, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -1064,6 +1078,22 @@ static void rcu_preempt_process_callbacks(void) { } +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + * + * Because there is no preemptible RCU, we use RCU-sched instead. + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_state, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + /* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptible RCU does not exist, map to rcu-sched. @@ -2051,6 +2081,48 @@ int rcu_needs_cpu(int cpu) return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; } +/* + * Does the specified flavor of RCU have non-lazy callbacks pending on + * the specified CPU? Both RCU flavor and CPU are specified by the + * rcu_data structure. + */ +static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) +{ + return rdp->qlen != rdp->qlen_lazy; +} + +#ifdef CONFIG_TREE_PREEMPT_RCU + +/* + * Are there non-lazy RCU-preempt callbacks? (There cannot be if there + * is no RCU-preempt in the kernel.) + */ +static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + + return __rcu_cpu_has_nonlazy_callbacks(rdp); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +{ + return 0; +} + +#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/* + * Does any flavor of RCU have non-lazy callbacks on the specified CPU? + */ +static bool rcu_cpu_has_nonlazy_callbacks(int cpu) +{ + return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || + __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || + rcu_preempt_cpu_has_nonlazy_callbacks(cpu); +} + /* * Timer handler used to force CPU to start pushing its remaining RCU * callbacks in the case where it entered dyntick-idle mode with callbacks @@ -2149,8 +2221,9 @@ static void rcu_prepare_for_idle(int cpu) trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_wait, HRTIMER_MODE_REL); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. 
*/ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 654cfe67f0d..db0987c1e1b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -73,8 +73,8 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c", - rdp->qlen, + seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", + rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -145,7 +145,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, + seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); + seq_puts(m, "\"of\",\"ri\",\"qll\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ -- cgit v1.2.3-70-g09d2 From e5601400081651060a59bd1f45f2821bb8e97f95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Jan 2012 11:03:57 -0800 Subject: rcu: Simplify offline processing Move ->qsmaskinit and blkd_tasks[] manipulation to the CPU_DYING notifier. This simplifies the code by eliminating a potential deadlock and by reducing the responsibilities of force_quiescent_state(). Also rename functions to make their connection to the CPU-hotplug stages explicit. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 160 +++++++++++++++++++++++------------------------- kernel/rcutree.h | 4 +- kernel/rcutree_plugin.h | 25 ++++---- 3 files changed, 90 insertions(+), 99 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index acf2d67ad2f..575f91d03f0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -943,6 +943,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * in preparation for detecting the next grace period. The caller must hold * the root node's ->lock, which is released before return. Hard irqs must * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function. This can happen when the dying CPU reports its + * quiescent state. */ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) @@ -1245,118 +1249,101 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) /* * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Synchronization is not required because this function executes - * in stop_machine() context. + * Also record a quiescent state for this CPU for the current grace period. + * Synchronization and interrupt disabling are not required because + * this function executes in stop_machine() context. Therefore, cleanup + * operations that might block must be done later from the CPU_DEAD + * notifier. + * + * Note that the outgoing CPU's bit has already been cleared in the + * cpu_online_mask. 
This allows us to randomly pick a callback + * destination from the bits set in that mask. */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { + unsigned long flags; int i; - /* current DYING CPU is cleared in the cpu_online_mask */ + unsigned long mask; + int need_report; int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); + struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */ + + /* Move callbacks to some other CPU. */ + if (rdp->nxtlist != NULL) { + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + receive_rdp->nxttail[RCU_NEXT_TAIL] = + rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen_lazy += rdp->qlen_lazy; + receive_rdp->qlen += rdp->qlen; + receive_rdp->n_cbs_adopted += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; + rdp->qlen = 0; + } - if (rdp->nxtlist == NULL) - return; /* irqs disabled, so comparison is stable. */ - - *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->qlen_lazy += rdp->qlen_lazy; - receive_rdp->qlen += rdp->qlen; - receive_rdp->n_cbs_adopted += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen_lazy = 0; - rdp->qlen = 0; -} - -/* - * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy - * and move all callbacks from the outgoing CPU to the current one. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. - */ -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - int need_report = 0; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp; - - rcu_stop_cpu_kthread(cpu); - - /* Exclude any attempts to start a new grace period. */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ + /* Record a quiescent state for the dying CPU. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - !!(rnp->qsmask & mask), + "cpuofl"); + rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); + /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ + + /* + * Remove the dying CPU from the bitmasks in the rcu_node + * hierarchy. Because we are in stop_machine() context, we + * automatically exclude ->onofflock critical sections. + */ do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irqsave(&rnp->lock, flags); rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ - else - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; } if (rnp == rdp->mynode) { - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp, true); } else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); mask = rnp->grpmask; rnp = rnp->parent; } while (rnp != NULL); - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->onofflock - * held leads to deadlock. - */ - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); - rcu_node_kthread_setaffinity(rnp, -1); } /* - * Remove the specified CPU from the RCU hierarchy and move any pending - * callbacks that it might have to the current CPU. This code assumes - * that at least one CPU in the system will remain running at all times. - * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no other + * CPU can be attempting to update rcu_cpu_kthread_task. */ -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - __rcu_offline_cpu(cpu, &rcu_sched_state); - __rcu_offline_cpu(cpu, &rcu_bh_state); - rcu_preempt_offline_cpu(cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + rcu_stop_cpu_kthread(cpu); + rcu_node_kthread_setaffinity(rnp, -1); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -1725,6 +1712,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ @@ -2155,16 +2143,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. 
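The rcu_cleanup_dying_cpu()/rcu_cleanup_dead_cpu() split above mirrors the generic CPU-hotplug notifier stages: CPU_DYING callbacks run on the outgoing CPU inside stop_machine() with interrupts disabled, while CPU_DEAD callbacks run afterwards from process context on a surviving CPU. A generic, purely hypothetical notifier skeleton makes the constraint explicit:

        #include <linux/cpu.h>
        #include <linux/notifier.h>

        static int __cpuinit example_cpu_notify(struct notifier_block *self,
                                                unsigned long action, void *hcpu)
        {
                /* hcpu carries the CPU number, cast from (void *). */
                switch (action) {
                case CPU_DYING:
                case CPU_DYING_FROZEN:
                        /*
                         * Outgoing CPU, stop_machine() context, irqs off:
                         * only non-blocking work is allowed, such as handing
                         * per-CPU state (here, RCU callbacks) to a survivor.
                         */
                        break;
                case CPU_DEAD:
                case CPU_DEAD_FROZEN:
                        /*
                         * Later, process context on another CPU: blocking
                         * cleanup such as stopping kthreads or reworking
                         * affinities belongs here.
                         */
                        break;
                default:
                        break;
                }
                return NOTIFY_OK;
        }

A real notifier of this kind would be registered with register_cpu_notifier(); in this series the corresponding RCU hooks are invoked from the existing rcu_cpu_notify().
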
*/ - rcu_send_cbs_to_online(&rcu_bh_state); - rcu_send_cbs_to_online(&rcu_sched_state); - rcu_preempt_send_cbs_to_online(); + rcu_cleanup_dying_cpu(&rcu_bh_state); + rcu_cleanup_dying_cpu(&rcu_sched_state); + rcu_preempt_cleanup_dying_cpu(); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_offline_cpu(cpu); + rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); + rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); + rcu_preempt_cleanup_dead_cpu(cpu); break; default: break; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index af2af3cc5e6..05e03675439 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -439,8 +439,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_cleanup_dead_cpu(int cpu); static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); @@ -451,7 +451,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_send_cbs_to_online(void); +static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7adf232bb66..eeb2cc6b865 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -618,16 +618,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, return retval; } +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + /* * Do CPU-offline processing for preemptible RCU. */ -static void rcu_preempt_offline_cpu(int cpu) +static void rcu_preempt_cleanup_dead_cpu(int cpu) { - __rcu_offline_cpu(cpu, &rcu_preempt_state); + rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, @@ -912,11 +912,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptible RCU's callbacks from dying CPU to other online CPU. + * Move preemptible RCU's callbacks from dying CPU to other online CPU + * and record a quiescent state. */ -static void rcu_preempt_send_cbs_to_online(void) +static void rcu_preempt_cleanup_dying_cpu(void) { - rcu_send_cbs_to_online(&rcu_preempt_state); + rcu_cleanup_dying_cpu(&rcu_preempt_state); } /* @@ -1052,16 +1053,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, return 0; } +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + /* * Because preemptible RCU does not exist, it never needs CPU-offline * processing. */ -static void rcu_preempt_offline_cpu(int cpu) +static void rcu_preempt_cleanup_dead_cpu(int cpu) { } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1153,9 +1154,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Because there is no preemptible RCU, there are no callbacks to move. 
+ * Because there is no preemptible RCU, there is no cleanup to do. */ -static void rcu_preempt_send_cbs_to_online(void) +static void rcu_preempt_cleanup_dying_cpu(void) { } -- cgit v1.2.3-70-g09d2 From 778d250a29224795e6320b58928bafa6b6104a06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jan 2012 14:13:24 -0800 Subject: rcu: Limit lazy-callback duration Currently, a given CPU is permitted to remain in dyntick-idle mode indefinitely if it has only lazy RCU callbacks queued. This is vulnerable to corner cases in NUMA systems, so limit the time to six seconds by default. (Currently controlled by a cpp macro.) Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index eeb2cc6b865..f8e6dcc9186 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2050,6 +2050,9 @@ static void rcu_prepare_for_idle(int cpu) * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your * system. And if you are -that- concerned about energy efficiency, * just power the system down and be done with it! + * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is + * permitted to sleep in dyntick-idle mode with only lazy RCU + * callbacks pending. Setting this too high can OOM your system. * * The values below work well in practice. If future workloads require * adjustment, they can be converted into kernel config parameters, though @@ -2058,11 +2061,13 @@ static void rcu_prepare_for_idle(int cpu) #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ +#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; +static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ +static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -2151,6 +2156,8 @@ static void rcu_prepare_for_idle_init(int cpu) unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); + upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); + rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); firsttime = 0; } } @@ -2225,6 +2232,9 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_nonlazy_callbacks(cpu)) hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), rcu_idle_gp_wait, HRTIMER_MODE_REL); + else + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ -- cgit v1.2.3-70-g09d2 From 30fbcc90b02187c55c57ff0ecf57cecbd487d694 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jan 2012 11:01:14 -0800 Subject: rcu: Clean up straggling rcu_preempt_needs_cpu() name The recent updates to RCU_CPU_FAST_NO_HZ have an rcu_needs_cpu() that does more than just check for callbacks, so get the name for rcu_preempt_needs_cpu() consistent with that change, now calling it rcu_preempt_cpu_has_callbacks(). Signed-off-by: Paul E. 
McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree.h | 2 +- kernel/rcutree_plugin.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 49bb363a083..0569ba11e35 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1958,7 +1958,7 @@ static int rcu_cpu_has_callbacks(int cpu) /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_needs_cpu(cpu); + rcu_preempt_cpu_has_callbacks(cpu); } static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 05e03675439..58c9fc3bc82 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -449,7 +449,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int rcu_preempt_pending(int cpu); -static int rcu_preempt_needs_cpu(int cpu); +static int rcu_preempt_cpu_has_callbacks(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f8e6dcc9186..297d561c3e4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -887,9 +887,9 @@ static int rcu_preempt_pending(int cpu) } /* - * Does preemptible RCU need the CPU to stay out of dynticks mode? + * Does preemptible RCU have callbacks on this CPU? */ -static int rcu_preempt_needs_cpu(int cpu) +static int rcu_preempt_cpu_has_callbacks(int cpu) { return !!per_cpu(rcu_preempt_data, cpu).nxtlist; } @@ -1128,9 +1128,9 @@ static int rcu_preempt_pending(int cpu) } /* - * Because preemptible RCU does not exist, it never needs any CPU. + * Because preemptible RCU does not exist, it never has callbacks */ -static int rcu_preempt_needs_cpu(int cpu) +static int rcu_preempt_cpu_has_callbacks(int cpu) { return 0; } -- cgit v1.2.3-70-g09d2 From 27565d64a4e564e72c22d8c91a3cfcb9442383e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jan 2012 19:35:08 -0800 Subject: rcu: Remove #ifdef CONFIG_SMP from TREE_RCU Now that both TINY_RCU and TINY_PREEMPT_RCU have been in place for awhile, it is time to remove UP support from TREE_RCU, which is what this commit does. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 19 ------------------- kernel/rcutree_plugin.h | 12 ------------ 2 files changed, 31 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 195bf1fb59e..61adb351d2c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -301,8 +301,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) return &rsp->node[0]; } -#ifdef CONFIG_SMP - /* * If the specified CPU is offline, tell the caller that it is in * a quiescent state. Otherwise, whack it with a reschedule IPI. @@ -339,8 +337,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 0; } -#endif /* #ifdef CONFIG_SMP */ - /* * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle * @@ -606,8 +602,6 @@ int rcu_is_cpu_rrupt_from_idle(void) return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } -#ifdef CONFIG_SMP - /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. 
Return 1 if this CPU @@ -651,8 +645,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return rcu_implicit_offline_qs(rdp); } -#endif /* #ifdef CONFIG_SMP */ - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; @@ -1517,8 +1509,6 @@ void rcu_check_callbacks(int cpu, int user) trace_rcu_utilization("End scheduler-tick"); } -#ifdef CONFIG_SMP - /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. @@ -1641,15 +1631,6 @@ unlock_fqs_ret: trace_rcu_utilization("End fqs"); } -#else /* #ifdef CONFIG_SMP */ - -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) -{ - set_need_resched(); -} - -#endif /* #else #ifdef CONFIG_SMP */ - /* * This does the RCU core processing work for the specified rcu_state * and rcu_data structures. This may be called only from the CPU to diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 297d561c3e4..0e74e1c6333 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1858,16 +1858,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#ifndef CONFIG_SMP - -void synchronize_sched_expedited(void) -{ - cond_resched(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); @@ -1982,8 +1972,6 @@ void synchronize_sched_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -#endif /* #else #ifndef CONFIG_SMP */ - #if !defined(CONFIG_RCU_FAST_NO_HZ) /* -- cgit v1.2.3-70-g09d2 From a858af2875fb291d0f4b0a4419fefbf03c2379c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Jan 2012 13:29:10 -0800 Subject: rcu: Print scheduling-clock information on RCU CPU stall-warning messages There have been situations where RCU CPU stall warnings were caused by issues in scheduling-clock timer initialization. To make it easier to track these down, this commit causes the RCU CPU stall-warning messages to print out the number of scheduling-clock interrupts taken in the current grace period for each stalled CPU. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 33 +++++++---- kernel/rcutree.h | 11 ++++ kernel/rcutree_plugin.h | 150 +++++++++++++++++++++++++++++++++++++++++++++++- lib/Kconfig.debug | 14 +++++ 4 files changed, 194 insertions(+), 14 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index cfdab9898e3..dccd2f78db4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -689,12 +689,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) return; } rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; - - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - ndetected = rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -702,8 +696,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. 
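When working on these messages it helps to be able to provoke them on purpose. One deliberately buggy pattern, for test kernels only and with names invented for the example, is to spin inside an RCU read-side critical section for longer than the configured stall timeout:

        #include <linux/jiffies.h>
        #include <linux/rcupdate.h>
        #include <linux/sched.h>

        /* Test-only: hold up a grace period long enough to trigger a stall splat. */
        static void force_rcu_stall(unsigned long msecs)
        {
                unsigned long end = jiffies + msecs_to_jiffies(msecs);

                rcu_read_lock();
                while (time_before(jiffies, end))
                        cpu_relax();            /* never report a quiescent state */
                rcu_read_unlock();
        }

Calling this with a value comfortably above CONFIG_RCU_CPU_STALL_TIMEOUT (on the order of a minute by default in kernels of this vintage) will normally exercise print_cpu_stall() or, when another CPU detects the overdue grace period, print_other_cpu_stall(), assuming a grace period is started while the loop runs.
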
*/ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", + printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", rsp->name); + print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); ndetected += rcu_print_task_stall(rnp); @@ -712,11 +707,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) { - printk(" %d", rnp->grplo + cpu); + print_cpu_stall_info(rsp, rnp->grplo + cpu); ndetected++; } } - printk("} (detected by %d, t=%ld jiffies)\n", + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + ndetected = rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + print_cpu_stall_info_end(); + printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); @@ -740,8 +746,11 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", - rsp->name, smp_processor_id(), jiffies - rsp->gp_start); + printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); + print_cpu_stall_info_begin(); + print_cpu_stall_info(rsp, smp_processor_id()); + print_cpu_stall_info_end(); + printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); if (!trigger_all_cpu_backtrace()) dump_stack(); @@ -831,6 +840,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rdp->passed_quiesce = 0; } else rdp->qs_pending = 0; + zero_cpu_stall_ticks(rdp); } } @@ -1496,6 +1506,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { trace_rcu_utilization("Start scheduler-tick"); + increment_cpu_stall_ticks(); if (user || rcu_is_cpu_rrupt_from_idle()) { /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 0328a537846..e2ac8ee415b 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -239,6 +239,12 @@ struct rcu_data { bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned long ticks_this_gp; /* The number of scheduling-clock */ + /* ticks this CPU has handled */ + /* during and after the last grace */ + /* period it is aware of. 
*/ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ /* 2) batch handling */ /* @@ -466,5 +472,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); +static void print_cpu_stall_info_begin(void); +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); +static void print_cpu_stall_info_end(void); +static void zero_cpu_stall_ticks(struct rcu_data *rdp); +static void increment_cpu_stall_ticks(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e74e1c6333..aa93b074bb2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -63,7 +63,10 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); #endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); + printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); +#endif +#if defined(CONFIG_RCU_CPU_STALL_INFO) + printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); @@ -490,6 +493,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ + printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); +} + +static void rcu_print_task_stall_end(void) +{ + printk(KERN_CONT "\n"); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ +} + +static void rcu_print_task_stall_end(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -501,12 +529,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp) if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; + rcu_print_task_stall_begin(rnp); t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - printk(" P%d", t->pid); + printk(KERN_CONT " P%d", t->pid); ndetected++; } + rcu_print_task_stall_end(); return ndetected; } @@ -2004,7 +2034,7 @@ static void rcu_cleanup_after_idle(int cpu) } /* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, * is nothing. */ static void rcu_prepare_for_idle(int cpu) @@ -2273,3 +2303,117 @@ static void rcu_prepare_for_idle(int cpu) } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +#ifdef CONFIG_RCU_CPU_STALL_INFO + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + + sprintf(cp, "drain=%d %c timer=%lld", + per_cpu(rcu_dyntick_drain, cpu), + per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + hrtimer_active(hrtp) + ? 
ktime_to_us(hrtimer_get_remaining(hrtp)) + : -1); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* Initiate the stall-info list. */ +static void print_cpu_stall_info_begin(void) +{ + printk(KERN_CONT "\n"); +} + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period + * (flavor specified by rsp), then print the number of scheduling + * clock interrupts the CPU has taken during the time that it has + * been aware. Otherwise, print the number of RCU grace periods + * that this CPU is ignorant of, for example, "1" if the CPU was + * aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ + char fast_no_hz[72]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = rdp->dynticks; + char *ticks_title; + unsigned long ticks_value; + + if (rsp->gpnum == rdp->gpnum) { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } else { + ticks_title = "GPs behind"; + ticks_value = rsp->gpnum - rdp->gpnum; + } + print_cpu_stall_fast_no_hz(fast_no_hz, cpu); + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", + cpu, ticks_value, ticks_title, + atomic_read(&rdtp->dynticks) & 0xfff, + rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + fast_no_hz); +} + +/* Terminate the stall-info list. */ +static void print_cpu_stall_info_end(void) +{ + printk(KERN_ERR "\t"); +} + +/* Zero ->ticks_this_gp for all flavors of RCU. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; +} + +/* Increment ->ticks_this_gp for all flavors of RCU. */ +static void increment_cpu_stall_ticks(void) +{ + __get_cpu_var(rcu_sched_data).ticks_this_gp++; + __get_cpu_var(rcu_bh_data).ticks_this_gp++; +#ifdef CONFIG_TREE_PREEMPT_RCU + __get_cpu_var(rcu_preempt_data).ticks_this_gp++; +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void print_cpu_stall_info_begin(void) +{ + printk(KERN_CONT " {"); +} + +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ + printk(KERN_CONT " %d", cpu); +} + +static void print_cpu_stall_info_end(void) +{ + printk(KERN_CONT "} "); +} + +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ +} + +static void increment_cpu_stall_ticks(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3cc419d05f2..d27a2aa3e81 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -927,6 +927,20 @@ config RCU_CPU_STALL_VERBOSE Say Y if you want to enable such checks. +config RCU_CPU_STALL_INFO + bool "Print additional diagnostics on RCU CPU stall" + depends on (TREE_RCU || TREE_PREEMPT_RCU) && DEBUG_KERNEL + default n + help + For each stalled CPU that is aware of the current RCU grace + period, print out additional per-CPU diagnostic information + regarding scheduling-clock ticks, idle state, and, + for RCU_FAST_NO_HZ kernels, idle-entry state. + + Say N if you are unsure. + + Say Y if you want to enable such diagnostics. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL -- cgit v1.2.3-70-g09d2 From c0d6d01bffdce19fa19baad6cb8cc3eed7bfd6f5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 23 Jan 2012 12:41:26 -0800 Subject: rcu: Check for illegal use of RCU from offlined CPUs Although it is legal to use RCU during early boot, it is anything but legal to use RCU at runtime from an offlined CPU. After all, RCU explicitly ignores offlined CPUs. This commit therefore adds checks for runtime use of RCU from offlined CPUs. These checks are not perfect, in particular, they can be subverted through use of things like rcu_dereference_raw(). Note that it is not possible to put checks in rcu_read_lock() and friends due to the fact that these primitives are used in code that might be used under either RCU or lock-based protection, which means that checking rcu_read_lock() gets you fat piles of false positives. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 19 +++++++++++++++++++ include/linux/srcu.h | 11 +++++++---- kernel/rcupdate.c | 5 +++++ kernel/rcutree.c | 29 +++++++++++++++++++++++++++++ kernel/rcutree_plugin.h | 1 + 5 files changed, 61 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f409529ff35..146d37d3177 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -226,6 +226,15 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) +bool rcu_lockdep_current_cpu_online(void); +#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ +static inline bool rcu_lockdep_current_cpu_online(void) +{ + return 1; +} +#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #ifdef CONFIG_PROVE_RCU @@ -270,6 +279,9 @@ extern int debug_lockdep_rcu_enabled(void); * occur in the same context, for example, it is illegal to invoke * rcu_read_unlock() in process context if the matching rcu_read_lock() * was invoked from within an irq handler. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. */ static inline int rcu_read_lock_held(void) { @@ -277,6 +289,8 @@ static inline int rcu_read_lock_held(void) return 1; if (rcu_is_cpu_idle()) return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; return lock_is_held(&rcu_lock_map); } @@ -313,6 +327,9 @@ extern int rcu_read_lock_bh_held(void); * notice an extended quiescent state to other CPUs that started a grace * period. Otherwise we would delay any grace period as long as we run in * the idle task. + * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. */ #ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) @@ -323,6 +340,8 @@ static inline int rcu_read_lock_sched_held(void) return 1; if (rcu_is_cpu_idle()) return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e1b005918bb..9a323728e60 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -99,15 +99,18 @@ long srcu_batches_completed(struct srcu_struct *sp); * power mode. This way we can notice an extended quiescent state to * other CPUs that started a grace period. Otherwise we would delay any * grace period as long as we run in the idle task. 
+ * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. */ static inline int srcu_read_lock_held(struct srcu_struct *sp) { - if (rcu_is_cpu_idle()) - return 0; - if (!debug_lockdep_rcu_enabled()) return 1; - + if (rcu_is_cpu_idle()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; return lock_is_held(&sp->dep_map); } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2bc4e135ff2..a86f1741cc2 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. */ int rcu_read_lock_bh_held(void) { @@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void) return 1; if (rcu_is_cpu_idle()) return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dccd2f78db4..bcf7db2f2fd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -591,6 +591,35 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Is the current CPU online? Disable preemption to avoid false positives + * that could otherwise happen due to the current CPU number being sampled, + * this task being preempted, its old CPU being taken offline, resuming + * on some other CPU, then determining that its old CPU is now offline. + * It is OK to use RCU on an offline processor during initial boot, hence + * the check for rcu_scheduler_fully_active. + * + * Disable checking if in an NMI handler because we cannot safely report + * errors from NMI handlers anyway. + */ +bool rcu_lockdep_current_cpu_online(void) +{ + bool ret; + + if (in_nmi()) + return 1; + preempt_disable(); + ret = cpu_online(smp_processor_id()) || + !rcu_scheduler_fully_active; + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #endif /* #ifdef CONFIG_PROVE_RCU */ /** diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index aa93b074bb2..cecea84f4f3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1946,6 +1946,7 @@ void synchronize_sched_expedited(void) /* Note that atomic_inc_return() implies full memory barrier. */ firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); get_online_cpus(); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); /* * Each pass through the following loop attempts to force a -- cgit v1.2.3-70-g09d2 From 3d3b7db0a22085cfc05c3318b9874f7fb8266d18 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jan 2012 17:05:46 -0800 Subject: rcu: Move synchronize_sched_expedited() to rcutree.c Now that TREE_RCU and TREE_PREEMPT_RCU no longer do anything different for the single-CPU case, there is no need for multiple definitions of synchronize_sched_expedited(). It is no longer in any sense a plug-in, so move it from kernel/rcutree_plugin.h to kernel/rcutree.c. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree_plugin.h | 116 ----------------------------------------------- 2 files changed, 117 insertions(+), 116 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index bcf7db2f2fd..05470d4caba 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -50,6 +50,8 @@ #include #include #include +#include +#include #include "rcutree.h" #include @@ -1918,6 +1920,121 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period. We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ + int firstsnap, s, snap, trycount = 0; + + /* Note that atomic_inc_return() implies full memory barrier. */ + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + get_online_cpus(); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + + /* No joy, try again later. Or just synchronize_sched(). 
*/ + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + + /* Check to see if someone else did our work for us. */ + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We subtract + * 1 to get the same token that the last incrementer got. + * We retry after they started, so our grace period works + * for them, and they started after our first try, so their + * grace period works for us. + */ + get_online_cpus(); + snap = atomic_read(&sync_sched_expedited_started); + smp_mb(); /* ensure read is before try_stop_cpus(). */ + } + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did beat us to the punch. + */ + do { + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + smp_mb(); /* ensure test happens before caller kfree */ + break; + } + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index cecea84f4f3..98ce17cf1fb 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,7 +25,6 @@ */ #include -#include #define RCU_KTHREAD_PRIO 1 @@ -1888,121 +1887,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); -static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ - return 0; -} - -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. - * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. - * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. 
We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). - */ -void synchronize_sched_expedited(void) -{ - int firstsnap, s, snap, trycount = 0; - - /* Note that atomic_inc_return() implies full memory barrier. */ - firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); - get_online_cpus(); - WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); - - /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. - */ - while (try_stop_cpus(cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) - udelay(trycount * num_online_cpus()); - else { - synchronize_sched(); - return; - } - - /* Check to see if someone else did our work for us. */ - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { - smp_mb(); /* ensure test happens before caller kfree */ - return; - } - - /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We subtract - * 1 to get the same token that the last incrementer got. - * We retry after they started, so our grace period works - * for them, and they started after our first try, so their - * grace period works for us. - */ - get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started); - smp_mb(); /* ensure read is before try_stop_cpus(). */ - } - - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did beat us to the punch. - */ - do { - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { - smp_mb(); /* ensure test happens before caller kfree */ - break; - } - } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); - - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - #if !defined(CONFIG_RCU_FAST_NO_HZ) /* -- cgit v1.2.3-70-g09d2 From c0cfbbb0d4fca14b828a7635a59784adfba8989d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jan 2012 17:23:35 -0800 Subject: rcu: No interrupt disabling for rcu_prepare_for_idle() The rcu_prepare_for_idle() function is always called with interrupts disabled, so there is no reason to disable interrupts again within rcu_prepare_for_idle(). Therefore, this commit removes all of the interrupt disabling, also removing a latent disabling-unbalance bug. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_plugin.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 98ce17cf1fb..5e25ee327cc 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2096,10 +2096,6 @@ static void rcu_cleanup_after_idle(int cpu) */ static void rcu_prepare_for_idle(int cpu) { - unsigned long flags; - - local_irq_save(flags); - /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. @@ -2107,7 +2103,6 @@ static void rcu_prepare_for_idle(int cpu) if (!rcu_cpu_has_callbacks(cpu)) { per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; per_cpu(rcu_dyntick_drain, cpu) = 0; - local_irq_restore(flags); trace_rcu_prep_idle("No callbacks"); return; } @@ -2117,7 +2112,6 @@ static void rcu_prepare_for_idle(int cpu) * refrained from disabling the scheduling-clock tick. */ if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { - local_irq_restore(flags); trace_rcu_prep_idle("In holdoff"); return; } @@ -2142,7 +2136,6 @@ static void rcu_prepare_for_idle(int cpu) } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - local_irq_restore(flags); trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; @@ -2154,23 +2147,17 @@ static void rcu_prepare_for_idle(int cpu) */ #ifdef CONFIG_TREE_PREEMPT_RCU if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_preempt_qs(cpu); force_quiescent_state(&rcu_preempt_state, 0); - local_irq_save(flags); } #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); - local_irq_save(flags); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_bh_qs(cpu); force_quiescent_state(&rcu_bh_state, 0); - local_irq_save(flags); } /* @@ -2178,13 +2165,10 @@ static void rcu_prepare_for_idle(int cpu) * So try forcing the callbacks through the grace period. */ if (rcu_cpu_has_callbacks(cpu)) { - local_irq_restore(flags); trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else { - local_irq_restore(flags); + } else trace_rcu_prep_idle("Callbacks drained"); - } } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -- cgit v1.2.3-70-g09d2 From 2036d94a7b61ca5032ce90f2bda06afec0fe713e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Jan 2012 17:02:47 -0800 Subject: rcu: Rework detection of use of RCU by offline CPUs Because newly offlined CPUs continue executing after completing the CPU_DYING notifiers, they legitimately enter the scheduler and use RCU while appearing to be offline. This calls for a more sophisticated approach as follows: 1. RCU marks the CPU online during the CPU_UP_PREPARE phase. 2. RCU marks the CPU offline during the CPU_DEAD phase. 3. Diagnostics regarding use of read-side RCU by offline CPUs use RCU's accounting rather than the cpu_online_map. (Note that __call_rcu() still uses cpu_online_map to detect illegal invocations within CPU_DYING notifiers.) 4. Offline CPUs are prevented from hanging the system by force_quiescent_state(), which pays attention to cpu_online_map. 
Some additional work (in a later commit) will be needed to guarantee that force_quiescent_state() waits a full jiffy before assuming that a CPU is offline, for example, when called from idle entry. (This commit also makes the one-jiffy wait explicit, since the old-style implicit wait can now be defeated by RCU_FAST_NO_HZ and by rcutorture.) This approach avoids the false positives encountered when attempting to use more exact classification of CPU online/offline state. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 36 +++++++------- kernel/rcutree.c | 113 ++++++++++++++++++++++++++------------------ kernel/rcutree.h | 1 - kernel/rcutree_plugin.h | 2 +- kernel/rcutree_trace.c | 6 +-- 5 files changed, 87 insertions(+), 71 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 49587abfc2f..f6f15ce3990 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -33,23 +33,23 @@ rcu/rcuboost: The output of "cat rcu/rcudata" looks as follows: rcu_sched: - 0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ri=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 - 1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ri=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 - 2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ri=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 - 3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ri=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 - 4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ri=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 - 5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ri=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 - 6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ri=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 - 7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ri=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 + 0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 + 1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 + 2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 + 3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 + 4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 + 5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 + 6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 + 7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 rcu_bh: - 0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 - 1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ri=1 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0 - 2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0 - 3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ri=1 ql=0 qs=.... 
kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0 - 4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0 - 5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ri=1 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0 - 6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0 - 7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0 + 0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 + 1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0 + 2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0 + 3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0 + 4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0 + 5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0 + 6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0 + 7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0 The first section lists the rcu_data structures for rcu_sched, the second for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an @@ -119,10 +119,6 @@ o "of" is the number of times that some other CPU has forced a CPU is offline when it is really alive and kicking) is a fatal error, so it makes sense to err conservatively. -o "ri" is the number of times that RCU has seen fit to send a - reschedule IPI to this CPU in order to get it to report a - quiescent state. - o "ql" is the number of RCU callbacks currently residing on this CPU. This is the total number of callbacks, regardless of what state they are in (new, waiting for grace period to diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 05470d4caba..708469a0686 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -320,25 +320,18 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static int rcu_implicit_offline_qs(struct rcu_data *rdp) { /* - * If the CPU is offline, it is in a quiescent state. We can - * trust its state not to change because interrupts are disabled. + * If the CPU is offline for more than a jiffy, it is in a quiescent + * state. We can trust its state not to change because interrupts + * are disabled. The reason for the jiffy's worth of slack is to + * handle CPUs initializing on the way up and finding their way + * to the idle loop on the way down. */ - if (cpu_is_offline(rdp->cpu)) { + if (cpu_is_offline(rdp->cpu) && + ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); rdp->offline_fqs++; return 1; } - - /* - * The CPU is online, so send it a reschedule IPI. This forces - * it through the scheduler, and (inefficiently) also handles cases - * where idle loops fail to inform RCU about the CPU being idle. - */ - if (rdp->cpu != smp_processor_id()) - smp_send_reschedule(rdp->cpu); - else - set_need_resched(); - rdp->resched_ipi++; return 0; } @@ -601,19 +594,33 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); * this task being preempted, its old CPU being taken offline, resuming * on some other CPU, then determining that its old CPU is now offline. 
* It is OK to use RCU on an offline processor during initial boot, hence - * the check for rcu_scheduler_fully_active. + * the check for rcu_scheduler_fully_active. Note also that it is OK + * for a CPU coming online to use RCU for one jiffy prior to marking itself + * online in the cpu_online_mask. Similarly, it is OK for a CPU going + * offline to continue to use RCU for one jiffy after marking itself + * offline in the cpu_online_mask. This leniency is necessary given the + * non-atomic nature of the online and offline processing, for example, + * the fact that a CPU enters the scheduler after completing the CPU_DYING + * notifiers. + * + * This is also why RCU internally marks CPUs online during the + * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. * * Disable checking if in an NMI handler because we cannot safely report * errors from NMI handlers anyway. */ bool rcu_lockdep_current_cpu_online(void) { + struct rcu_data *rdp; + struct rcu_node *rnp; bool ret; if (in_nmi()) return 1; preempt_disable(); - ret = cpu_online(smp_processor_id()) || + rdp = &__get_cpu_var(rcu_sched_data); + rnp = rdp->mynode; + ret = (rdp->grpmask & rnp->qsmaskinit) || !rcu_scheduler_fully_active; preempt_enable(); return ret; @@ -1308,14 +1315,12 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - unsigned long flags; int i; unsigned long mask; - int need_report; int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); - struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */ + RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ /* First, adjust the counts. */ if (rdp->nxtlist != NULL) { @@ -1381,32 +1386,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) "cpuofl"); rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ - - /* - * Remove the dying CPU from the bitmasks in the rcu_node - * hierarchy. Because we are in stop_machine() context, we - * automatically exclude ->onofflock critical sections. - */ - do { - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp == rdp->mynode) { - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); - } else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); } /* @@ -1417,11 +1396,53 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { + unsigned long flags; + unsigned long mask; + int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ + /* Adjust any no-longer-needed kthreads. */ rcu_stop_cpu_kthread(cpu); rcu_node_kthread_setaffinity(rnp, -1); + + /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ + + /* Exclude any attempts to start a new grace period. 
*/ + raw_spin_lock_irqsave(&rsp->onofflock, flags); + + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ + mask = rdp->grpmask; /* rnp->grplo is constant. */ + do { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit != 0) { + if (rnp != rdp->mynode) + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + break; + } + if (rnp == rdp->mynode) + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + else + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + mask = rnp->grpmask; + rnp = rnp->parent; + } while (rnp != NULL); + + /* + * We still hold the leaf rcu_node structure lock here, and + * irqs are still disabled. The reason for this subterfuge is + * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * held leads to deadlock. + */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + rnp = rdp->mynode; + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp, true); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e2ac8ee415b..cdd1be0a407 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -289,7 +289,6 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ unsigned long offline_fqs; /* Kicked due to being offline. */ - unsigned long resched_ipi; /* Sent a resched IPI. */ /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5e25ee327cc..07f880445d8 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -610,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * absolutely necessary, but this is a good performance/complexity * tradeoff. 
*/ - if (rcu_preempt_blocked_readers_cgp(rnp)) + if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) retval |= RCU_OFL_TASKS_NORM_GP; if (rcu_preempted_readers_exp(rnp)) retval |= RCU_OFL_TASKS_EXP_GP; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index db0987c1e1b..ed459edeff4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -72,7 +72,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); - seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, " of=%lu", rdp->offline_fqs); seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != @@ -144,7 +144,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); - seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, ",%lu", rdp->offline_fqs); seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], @@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"ri\",\"qll\",\"ql\",\"qs\""); + seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ -- cgit v1.2.3-70-g09d2 From 236fefafe5d3d34b78ed2ccf5510909716112326 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Jan 2012 14:00:41 -0800 Subject: rcu: Call out dangers of expedited RCU primitives The expedited RCU primitives can be quite useful, but they have some high costs as well. This commit updates and creates docbook comments calling out the costs, and updates the RCU documentation as well. Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 14 ++++++++++++++ include/linux/rcutree.h | 16 ++++++++++++++++ kernel/rcutree.c | 22 ++++++++++++++-------- kernel/rcutree_plugin.h | 20 ++++++++++++++++---- kernel/srcu.c | 27 +++++++++++++++++---------- 5 files changed, 77 insertions(+), 22 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index bff2d8be1e1..5c8d7496809 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -180,6 +180,20 @@ over a rather long period of time, but improvements are always welcome! operations that would not normally be undertaken while a real-time workload is running. + In particular, if you find yourself invoking one of the expedited + primitives repeatedly in a loop, please do everyone a favor: + Restructure your code so that it batches the updates, allowing + a single non-expedited primitive to cover the entire batch. + This will very likely be faster than the loop containing the + expedited primitive, and will be much much easier on the rest + of the system, especially to real-time workloads running on + the rest of the system. + + In addition, it is illegal to call the expedited forms from + a CPU-hotplug notifier, or while holding a lock that is acquired + by a CPU-hotplug notifier. Failing to observe this restriction + will result in deadlock. + 7. 
If the updater uses call_rcu() or synchronize_rcu(), then the corresponding readers must use rcu_read_lock() and rcu_read_unlock(). If the updater uses call_rcu_bh() or diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 73892483fd0..e8ee5dd0854 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -63,6 +63,22 @@ extern void synchronize_rcu_expedited(void); void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +/** + * synchronize_rcu_bh_expedited - Brute-force RCU-bh grace period + * + * Wait for an RCU-bh grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_rcu_bh_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_rcu_bh() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. + */ static inline void synchronize_rcu_bh_expedited(void) { synchronize_sched_expedited(); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 708469a0686..df0e3c1bb68 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1961,15 +1961,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data) return 0; } -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. * * This implementation can be thought of as an application of ticket * locking to RCU, with sync_sched_expedited_started and diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 07f880445d8..f7ceadf4986 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -835,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } -/* - * Wait for an rcu-preempt grace period, but expedite it. The basic idea - * is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. 
+/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blkd_tasks lists and wait for this list to drain. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. + * In fact, if you are using synchronize_rcu_expedited() in a loop, + * please restructure your code to batch your updates, and then Use a + * single synchronize_rcu() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. */ void synchronize_rcu_expedited(void) { diff --git a/kernel/srcu.c b/kernel/srcu.c index 3f99fa0e8ed..ba35f3a4a1f 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -286,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp) EXPORT_SYMBOL_GPL(synchronize_srcu); /** - * synchronize_srcu_expedited - like synchronize_srcu, but less patient + * synchronize_srcu_expedited - Brute-force SRCU grace period * @sp: srcu_struct with which to synchronize. * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. + * Wait for an SRCU grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_srcu_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_srcu() instead. * - * Note that it is illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; doing so - * will result in deadlock. However, it is perfectly legal to call - * synchronize_srcu_expedited() on one srcu_struct from some other - * srcu_struct's read-side critical section. + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. It is also illegal to call + * synchronize_srcu_expedited() from the corresponding SRCU read-side + * critical section; doing so will result in deadlock. However, it is + * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct + * from some other srcu_struct's read-side critical section, as long as + * the resulting graph of srcu_structs is acyclic. */ void synchronize_srcu_expedited(struct srcu_struct *sp) { -- cgit v1.2.3-70-g09d2 From c3ce910b1456a45fa88959af3735bd6b285e54af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Feb 2012 10:12:54 -0800 Subject: rcu: Eliminate softirq-mediated RCU_FAST_NO_HZ idle-entry loop If a softirq is pending, the current CPU has RCU callbacks pending, and RCU does not immediately need anything from this CPU, then the current code resets the RCU_FAST_NO_HZ state machine. 
This means that upon exit from the subsequent softirq handler, RCU_FAST_NO_HZ will try really hard to force RCU into dyntick-idle mode. And if the same conditions hold after a few tries (determined by RCU_IDLE_OPT_FLUSHES), the same situation can repeat, possibly endlessly. This scenario is not particularly good for battery lifetime. This commit therefore suppresses the early exit from the RCU_FAST_NO_HZ state machine in the case where there is a softirq pending. This change forces the state machine to retain its memory, and to enter holdoff if this condition persists. Reported-by: "Abou Gazala, Neven M" Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f7ceadf4986..392a65136a7 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2133,7 +2133,8 @@ static void rcu_prepare_for_idle(int cpu) /* First time through, initialize the counter. */ per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && - !rcu_pending(cpu)) { + !rcu_pending(cpu) && + !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; -- cgit v1.2.3-70-g09d2 From 696a02cc16b182dd78b1f395ae336f449cc90f11 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 Feb 2012 21:59:33 -0800 Subject: rcu: Hold off RCU_FAST_NO_HZ after timer posted This commit handles workloads that transition quickly between idle and non-idle, and where the CPU's callbacks cannot be invoked, but where RCU does not have anything immediate for the CPU to do. Without this patch, the RCU_FAST_NO_HZ code can be invoked repeatedly on each entry to idle. The commit sets the per-CPU rcu_dyntick_holdoff variable to hold off further attempts for a tick. Reported-by: "Abou Gazala, Neven M" Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 392a65136a7..c023464816b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2138,7 +2138,7 @@ static void rcu_prepare_for_idle(int cpu) /* Can we go dyntick-idle despite still having callbacks? */ trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), rcu_idle_gp_wait, HRTIMER_MODE_REL); -- cgit v1.2.3-70-g09d2
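The last two patches above both revolve around the same small mechanism: once a CPU has made one attempt to enter dyntick-idle while it still has callbacks queued, record the current jiffies value in a per-CPU holdoff variable and decline further RCU_FAST_NO_HZ work until the tick advances. The following is a minimal userspace sketch of that holdoff idea only, not the kernel code itself: it omits the drain counter, the non-lazy-callback test, and the hrtimer, and the identifiers (fake_jiffies, dyntick_holdoff, prepare_for_idle) are hypothetical stand-ins rather than the kernel's names.

/*
 * Illustrative userspace sketch (NOT kernel code) of the per-tick holdoff
 * used by the final commit above: after one attempt to enter dyntick-idle
 * with callbacks pending, remember the current tick and refuse further
 * attempts until the tick advances.  All names here are made up for the
 * example.
 */
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

static unsigned long fake_jiffies;            /* stand-in for the jiffies counter */
static unsigned long dyntick_holdoff[NCPUS];  /* stand-in for a per-CPU holdoff variable */

/* Return true if this CPU may attempt dyntick-idle entry right now. */
static bool prepare_for_idle(int cpu, bool have_callbacks)
{
	if (!have_callbacks)
		return true;                      /* nothing queued: idle freely */
	if (dyntick_holdoff[cpu] == fake_jiffies)
		return false;                     /* already tried this tick: hold off */
	dyntick_holdoff[cpu] = fake_jiffies;      /* note the attempt for the rest of this tick */
	return true;                              /* one attempt per tick is allowed */
}

int main(void)
{
	/* Two back-to-back idle entries in the same tick: the second is held off. */
	printf("attempt 1: %d\n", prepare_for_idle(0, true));  /* prints 1 */
	printf("attempt 2: %d\n", prepare_for_idle(0, true));  /* prints 0 */
	fake_jiffies++;                                        /* tick advances */
	printf("attempt 3: %d\n", prepare_for_idle(0, true));  /* prints 1 */
	return 0;
}

Recording the current jiffies value (rather than jiffies - 1, as the earlier code did) is what makes the holdoff cover the remainder of the current tick, which is exactly the one-line change in the final patch.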