From a41bfeb2f8ed59410be7ca0f8fbc6138a758b746 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Fri, 12 Jul 2013 17:00:28 -0400 Subject: rcu: Simplify RCU_STATE_INITIALIZER() macro The RCU_STATE_INITIALIZER() macro is used only in the rcutree.c file as well as the rcutree_plugin.h file. It is passed as a rvalue to a variable of a similar name. A per_cpu variable is also created with a similar name as well. The uses of RCU_STATE_INITIALIZER() can be simplified to remove some of the duplicate code that is done. Currently the three users of this macro has this format: struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); Notice that "rcu_sched" is called three times. This is the same with the other two users. This can be condensed to just: RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); by moving the rest into the macro itself. This also opens the door to allow the RCU tracepoint strings and their addresses to be exported so that userspace tracing tools can translate the contents of the pointers of the RCU tracepoints. The change will allow for helper code to be placed in the RCU_STATE_INITIALIZER() macro to export the name that is used. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/rcutree_plugin.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 769e12e3151..6976a7dde87 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = - RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); -- cgit v1.2.3-70-g09d2 From f7f7bac9cb1c50783f15937a11743655a5756a36 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Fri, 12 Jul 2013 17:18:47 -0400 Subject: rcu: Have the RCU tracepoints use the tracepoint_string infrastructure Currently, RCU tracepoints save only a pointer to strings in the ring buffer. When displayed via the /sys/kernel/debug/tracing/trace file they are referenced like the printf "%s" that looks at the address in the ring buffer and prints out the string it points too. This requires that the strings are constant and persistent in the kernel. The problem with this is for tools like trace-cmd and perf that read the binary data from the buffers but have no access to the kernel memory to find out what string is represented by the address in the buffer. By using the tracepoint_string infrastructure, the RCU tracepoint strings can be exported such that userspace tools can map the addresses to the strings. # cat /sys/kernel/debug/tracing/printk_formats 0xffffffff81a4a0e8 : "rcu_preempt" 0xffffffff81a4a0f4 : "rcu_bh" 0xffffffff81a4a100 : "rcu_sched" 0xffffffff818437a0 : "cpuqs" 0xffffffff818437a6 : "rcu_sched" 0xffffffff818437a0 : "cpuqs" 0xffffffff818437b0 : "rcu_bh" 0xffffffff818437b7 : "Start context switch" 0xffffffff818437cc : "End context switch" 0xffffffff818437a0 : "cpuqs" [...] Now userspaces tools can display: rcu_utilization: Start context switch rcu_dyntick: Start 1 0 rcu_utilization: End context switch rcu_batch_start: rcu_preempt CBs=0/5 bl=10 rcu_dyntick: End 0 140000000000000 rcu_invoke_callback: rcu_preempt rhp=0xffff880071c0d600 func=proc_i_callback rcu_invoke_callback: rcu_preempt rhp=0xffff880077b5b230 func=__d_free rcu_dyntick: Start 140000000000000 0 rcu_invoke_callback: rcu_preempt rhp=0xffff880077563980 func=file_free_rcu rcu_batch_end: rcu_preempt CBs-invoked=3 idle=>c<>c<>c<>c< rcu_utilization: End RCU core rcu_grace_period: rcu_preempt 9741 start rcu_dyntick: Start 1 0 rcu_dyntick: End 0 140000000000000 rcu_dyntick: Start 140000000000000 0 Instead of: rcu_utilization: ffffffff81843110 rcu_future_grace_period: ffffffff81842f1d 9939 9939 9940 0 0 3 ffffffff81842f32 rcu_batch_start: ffffffff81842f1d CBs=0/4 bl=10 rcu_future_grace_period: ffffffff81842f1d 9939 9939 9940 0 0 3 ffffffff81842f3c rcu_grace_period: ffffffff81842f1d 9939 ffffffff81842f80 rcu_invoke_callback: ffffffff81842f1d rhp=0xffff88007888aac0 func=file_free_rcu rcu_grace_period: ffffffff81842f1d 9939 ffffffff81842f95 rcu_invoke_callback: ffffffff81842f1d rhp=0xffff88006aeb4600 func=proc_i_callback rcu_future_grace_period: ffffffff81842f1d 9939 9939 9940 0 0 3 ffffffff81842f32 rcu_future_grace_period: ffffffff81842f1d 9939 9939 9940 0 0 3 ffffffff81842f3c rcu_invoke_callback: ffffffff81842f1d rhp=0xffff880071cb9fc0 func=__d_free rcu_grace_period: ffffffff81842f1d 9939 ffffffff81842f80 rcu_invoke_callback: ffffffff81842f1d rhp=0xffff88007888ae80 func=file_free_rcu rcu_batch_end: ffffffff81842f1d CBs-invoked=4 idle=>c<>c<>c<>c< rcu_utilization: ffffffff8184311f Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/rcutree.c | 87 ++++++++++++++++++++++++++++++------------------- kernel/rcutree_plugin.h | 32 +++++++++--------- 2 files changed, 69 insertions(+), 50 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 97994a329d8..338f1d1c1c6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -53,18 +53,36 @@ #include <linux/delay.h> #include <linux/stop_machine.h> #include <linux/random.h> +#include <linux/ftrace_event.h> #include "rcutree.h" #include <trace/events/rcu.h> #include "rcu.h" +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x) tracepoint_string(x) + /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; +/* + * In order to export the rcu_state name to the tracing tools, it + * needs to be added in the __tracepoint_string section. + * This requires defining a separate variable tp_<sname>_varname + * that points to the string being used, and this will allow + * the tracing userspace tools to be able to decipher the string + * address to the matching string. + */ #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +static char sname##_varname[] = #sname; \ +static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ @@ -76,7 +94,7 @@ struct rcu_state sname##_state = { \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ - .name = #sname, \ + .name = sname##_varname, \ .abbr = sabbr, \ }; \ DEFINE_PER_CPU(struct rcu_data, sname##_data) @@ -176,7 +194,7 @@ void rcu_sched_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; } @@ -185,7 +203,7 @@ void rcu_bh_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; } @@ -196,10 +214,10 @@ void rcu_bh_qs(int cpu) */ void rcu_note_context_switch(int cpu) { - trace_rcu_utilization("Start context switch"); + trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); - trace_rcu_utilization("End context switch"); + trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -343,11 +361,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { - trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); + trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -477,7 +495,7 @@ void rcu_irq_exit(void) rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); if (rdtp->dynticks_nesting) - trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else rcu_eqs_enter_common(rdtp, oldval, true); local_irq_restore(flags); @@ -499,11 +517,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); - trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - trace_rcu_dyntick("Error on exit: not idle task", + trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -618,7 +636,7 @@ void rcu_irq_enter(void) rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); if (oldval) - trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(rdtp, oldval, true); local_irq_restore(flags); @@ -773,7 +791,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * of the current RCU grace period. */ if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; return 1; } @@ -793,7 +811,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; /* Grace period is not old enough. */ barrier(); if (cpu_is_offline(rdp->cpu)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; return 1; } @@ -1056,9 +1074,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * grace period is already marked as needed, return to the caller. */ c = rcu_cbs_completed(rdp->rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (rnp->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); return c; } @@ -1072,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) if (rnp->gpnum != rnp->completed || ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); return c; } @@ -1100,7 +1118,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * recorded, trace and leave. */ if (rnp_root->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); goto unlock_out; } @@ -1109,9 +1127,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) /* If a grace period is not already in progress, start one. */ if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { - trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); } unlock_out: @@ -1135,7 +1153,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); + trace_rcu_future_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1203,9 +1222,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) - trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); } /* @@ -1271,7 +1290,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } if (rdp->gpnum != rnp->gpnum) { @@ -1281,7 +1300,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc * go looking for one. */ rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); @@ -1324,7 +1343,7 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); @@ -1446,7 +1465,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ @@ -1855,7 +1874,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) RCU_TRACE(mask = rdp->grpmask); trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), - "cpuofl"); + TPS("cpuofl")); } /* @@ -2042,7 +2061,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { - trace_rcu_utilization("Start scheduler-tick"); + trace_rcu_utilization(TPS("Start scheduler-tick")); increment_cpu_stall_ticks(); if (user || rcu_is_cpu_rrupt_from_idle()) { @@ -2075,7 +2094,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) invoke_rcu_core(); - trace_rcu_utilization("End scheduler-tick"); + trace_rcu_utilization(TPS("End scheduler-tick")); } /* @@ -2206,10 +2225,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) if (cpu_is_offline(smp_processor_id())) return; - trace_rcu_utilization("Start RCU core"); + trace_rcu_utilization(TPS("Start RCU core")); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); - trace_rcu_utilization("End RCU core"); + trace_rcu_utilization(TPS("End RCU core")); } /* @@ -2950,7 +2969,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->completed = rnp->completed; rdp->passed_quiesce = 0; rdp->qs_pending = 0; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; @@ -2980,7 +2999,7 @@ static int rcu_cpu_notify(struct notifier_block *self, struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - trace_rcu_utilization("Start CPU hotplug"); + trace_rcu_utilization(TPS("Start CPU hotplug")); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -3009,7 +3028,7 @@ static int rcu_cpu_notify(struct notifier_block *self, default: break; } - trace_rcu_utilization("End CPU hotplug"); + trace_rcu_utilization(TPS("End CPU hotplug")); return NOTIFY_OK; } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6976a7dde87..dff86f53ee0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -167,7 +167,7 @@ static void rcu_preempt_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } @@ -386,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t) np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; - trace_rcu_unlock_preempted_task("rcu_preempt", + trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), rnp->gpnum, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; @@ -410,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t) */ empty_exp_now = !rcu_preempted_readers_exp(rnp); if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { - trace_rcu_quiescent_state_report("preempt_rcu", + trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, rnp->level, @@ -1248,12 +1248,12 @@ static int rcu_boost_kthread(void *arg) int spincnt = 0; int more2boost; - trace_rcu_utilization("Start boost kthread@init"); + trace_rcu_utilization(TPS("Start boost kthread@init")); for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; - trace_rcu_utilization("End boost kthread@rcu_wait"); + trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); - trace_rcu_utilization("Start boost kthread@rcu_wait"); + trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) @@ -1262,14 +1262,14 @@ static int rcu_boost_kthread(void *arg) spincnt = 0; if (spincnt > 10) { rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("End boost kthread@rcu_yield"); + trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); schedule_timeout_interruptible(2); - trace_rcu_utilization("Start boost kthread@rcu_yield"); + trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } } /* NOTREACHED */ - trace_rcu_utilization("End boost kthread@notreached"); + trace_rcu_utilization(TPS("End boost kthread@notreached")); return 0; } @@ -1417,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu) int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization("Start CPU kthread@rcu_wait"); + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; this_cpu_inc(rcu_cpu_kthread_loops); @@ -1429,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu) rcu_kthread_do_work(); local_bh_enable(); if (*workp == 0) { - trace_rcu_utilization("End CPU kthread@rcu_wait"); + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; } } *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("Start CPU kthread@rcu_yield"); + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); schedule_timeout_interruptible(2); - trace_rcu_utilization("End CPU kthread@rcu_yield"); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } @@ -2200,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_future_gp(rnp, rdp, c, "StartWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { wait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2208,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; flush_signals(current); - trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, "EndWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } -- cgit v1.2.3-70-g09d2 From 2333210b26cf7aaf48d71343029afb860103d9f9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Fri, 21 Jun 2013 12:34:33 -0700 Subject: nohz_full: Add rcu_dyntick data for scalable detection of all-idle state This commit adds fields to the rcu_dyntick structure that are used to detect idle CPUs. These new fields differ from the existing ones in that the existing ones consider a CPU executing in user mode to be idle, where the new ones consider CPUs executing in user mode to be busy. The handling of these new fields is otherwise quite similar to that for the exiting fields. This commit also adds the initialization required for these fields. So, why is usermode execution treated differently, with RCU considering it a quiescent state equivalent to idle, while in contrast the new full-system idle state detection considers usermode execution to be non-idle? It turns out that although one of RCU's quiescent states is usermode execution, it is not a full-system idle state. This is because the purpose of the full-system idle state is not RCU, but rather determining when accurate timekeeping can safely be disabled. Whenever accurate timekeeping is required in a CONFIG_NO_HZ_FULL kernel, at least one CPU must keep the scheduling-clock tick going. If even one CPU is executing in user mode, accurate timekeeping is requires, particularly for architectures where gettimeofday() and friends do not enter the kernel. Only when all CPUs are really and truly idle can accurate timekeeping be disabled, allowing all CPUs to turn off the scheduling clock interrupt, thus greatly improving energy efficiency. This naturally raises the question "Why is this code in RCU rather than in timekeeping?", and the answer is that RCU has the data and infrastructure to efficiently make this determination. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Reviewed-by: Josh Triplett <josh@joshtriplett.org> --- kernel/rcutree.c | 5 +++++ kernel/rcutree.h | 9 +++++++++ kernel/rcutree_plugin.h | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8807019138c..4f27b85d8c8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -224,6 +224,10 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, + .dynticks_idle = ATOMIC_INIT(1), +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ @@ -2904,6 +2908,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->blimit = blimit; init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cbdeac6cea9..52d1be108e7 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,6 +88,14 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + long long dynticks_idle_nesting; + /* irq/process nesting level from idle. */ + atomic_t dynticks_idle; /* Even value for idle, else odd. */ + /* "Idle" excludes userspace execution. */ + unsigned long dynticks_idle_jiffies; + /* End of last non-NMI non-idle period. */ +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; @@ -545,6 +553,7 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index dff86f53ee0..e5baccbd803 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2373,3 +2373,22 @@ static void rcu_kick_nohz_cpu(int cpu) smp_send_reschedule(cpu); #endif /* #ifdef CONFIG_NO_HZ_FULL */ } + + +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + +/* + * Initialize dynticks sysidle state for CPUs coming online. + */ +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ + rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; +} + +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -- cgit v1.2.3-70-g09d2 From eb348b898290da242e46df75ab0b9772003e08b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Fri, 21 Jun 2013 13:00:57 -0700 Subject: nohz_full: Add per-CPU idle-state tracking This commit adds the code that updates the rcu_dyntick structure's new fields to track the per-CPU idle state based on interrupts and transitions into and out of the idle loop (NMIs are ignored because NMI handlers cannot cleanly read out the time anyway). This code is similar to the code that maintains RCU's idea of per-CPU idleness, but differs in that RCU treats CPUs running in user mode as idle, where this new code does not. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Reviewed-by: Josh Triplett <josh@joshtriplett.org> --- kernel/rcutree.c | 4 +++ kernel/rcutree.h | 2 ++ kernel/rcutree_plugin.h | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4f27b85d8c8..b0d2cc3ea15 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -431,6 +431,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); + rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -481,6 +482,7 @@ void rcu_irq_exit(void) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else rcu_eqs_enter_common(rdtp, oldval, true); + rcu_sysidle_enter(rdtp, 1); local_irq_restore(flags); } @@ -549,6 +551,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); + rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -600,6 +603,7 @@ void rcu_irq_enter(void) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(rdtp, oldval, true); + rcu_sysidle_exit(rdtp, 1); local_irq_restore(flags); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 52d1be108e7..9dd8b177f1a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -553,6 +553,8 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index e5baccbd803..eab81da614b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2377,6 +2377,77 @@ static void rcu_kick_nohz_cpu(int cpu) #ifdef CONFIG_NO_HZ_FULL_SYSIDLE +/* + * Invoked to note exit from irq or task transition to idle. Note that + * usermode execution does -not- count as idle here! After all, we want + * to detect full-system idle states, not RCU quiescent states and grace + * periods. The caller must have disabled interrupts. + */ +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ + unsigned long j; + + /* Adjust nesting, check for fully idle. */ + if (irq) { + rdtp->dynticks_idle_nesting--; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + if (rdtp->dynticks_idle_nesting != 0) + return; /* Still not fully idle. */ + } else { + if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == + DYNTICK_TASK_NEST_VALUE) { + rdtp->dynticks_idle_nesting = 0; + } else { + rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + return; /* Still not fully idle. */ + } + } + + /* Record start of fully idle period. */ + j = jiffies; + ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); +} + +/* + * Invoked to note entry to irq or task transition from idle. Note that + * usermode execution does -not- count as idle here! The caller must + * have disabled interrupts. + */ +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ + /* Adjust nesting, check for already non-idle. */ + if (irq) { + rdtp->dynticks_idle_nesting++; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + if (rdtp->dynticks_idle_nesting != 1) + return; /* Already non-idle. */ + } else { + /* + * Allow for irq misnesting. Yes, it really is possible + * to enter an irq handler then never leave it, and maybe + * also vice versa. Handle both possibilities. + */ + if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { + rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + return; /* Already non-idle. */ + } else { + rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; + } + } + + /* Record end of idle period. */ + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); +} + /* * Initialize dynticks sysidle state for CPUs coming online. */ @@ -2387,6 +2458,14 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ +} + +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ +} + static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) { } -- cgit v1.2.3-70-g09d2 From d4bd54fbac2ea5c30eb976ca557e905f489d55f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Fri, 21 Jun 2013 14:51:40 -0700 Subject: nohz_full: Add full-system idle states and variables This commit adds control variables and states for full-system idle. The system will progress through the states in numerical order when the system is fully idle (other than the timekeeping CPU), and reset down to the initial state if any non-timekeeping CPU goes non-idle. The current state is kept in full_sysidle_state. One flavor of RCU will be in charge of driving the state machine, defined by rcu_sysidle_state. This should be the busiest flavor of RCU. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Reviewed-by: Josh Triplett <josh@joshtriplett.org> --- kernel/rcutree_plugin.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index eab81da614b..a7419ceb19a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2377,6 +2377,23 @@ static void rcu_kick_nohz_cpu(int cpu) #ifdef CONFIG_NO_HZ_FULL_SYSIDLE +/* + * Define RCU flavor that holds sysidle state. This needs to be the + * most active flavor of RCU. + */ +#ifdef CONFIG_PREEMPT_RCU +static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state; +#else /* #ifdef CONFIG_PREEMPT_RCU */ +static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state; +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +static int __maybe_unused full_sysidle_state; /* Current system-idle state. */ +#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ +#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ +#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ +#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ +#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ + /* * Invoked to note exit from irq or task transition to idle. Note that * usermode execution does -not- count as idle here! After all, we want -- cgit v1.2.3-70-g09d2 From 0edd1b1784cbdad55aca2c1293be018f53c0ab1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Fri, 21 Jun 2013 16:37:22 -0700 Subject: nohz_full: Add full-system-idle state machine This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. For large systems the state machine is driven out of RCU's force-quiescent-state logic, which provides good scalability at the price of millisecond-scale latencies on the transition to full-system-idle state. This is not so good for battery-powered systems, which are usually small enough that they don't need to care about scalability, but which do care deeply about energy efficiency. Small systems therefore drive the state machine directly out of the idle-entry code. The number of CPUs in a "small" system is defined by a new NO_HZ_FULL_SYSIDLE_SMALL Kconfig parameter, which defaults to 8. Note that this is a build-time definition. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> [ paulmck: Use true and false for boolean constants per Lai Jiangshan. ] Reviewed-by: Josh Triplett <josh@joshtriplett.org> [ paulmck: Simplify logic and provide better comments for memory barriers, based on review comments and questions by Lai Jiangshan. ] --- include/linux/rcupdate.h | 18 +++ kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 296 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/time/Kconfig | 27 +++++ 5 files changed, 355 insertions(+), 7 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 30bea9c2573..f1f1bc39346 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. */ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7b5be56d95a..eca70f4469c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -734,6 +734,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp->dynticks_snap & 0x1) == 0; } @@ -1373,11 +1374,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ @@ -2103,9 +2110,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp->qsmask & bit) != 0) { + if ((rnp->qsmaskinit & bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 9dd8b177f1a..6fd3659cf01 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a7419ceb19a..45ebba747af 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> -#include <linux/tick.h> +#include "time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -2382,12 +2382,12 @@ static void rcu_kick_nohz_cpu(int cpu) * most active flavor of RCU. */ #ifdef CONFIG_PREEMPT_RCU -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state; +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state; +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */ +static int full_sysidle_state; /* Current system-idle state. */ #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ @@ -2430,6 +2430,38 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); } +/* + * Unconditionally force exit from full system-idle state. This is + * invoked when a normal CPU exits idle, but must be called separately + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this + * is that the timekeeping CPU is permitted to take scheduling-clock + * interrupts while the system is in system-idle state, and of course + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock + * interrupt from any other type of interrupt. + */ +void rcu_sysidle_force_exit(void) +{ + int oldstate = ACCESS_ONCE(full_sysidle_state); + int newoldstate; + + /* + * Each pass through the following loop attempts to exit full + * system-idle state. If contention proves to be a problem, + * a trylock-based contention tree could be used here. + */ + while (oldstate > RCU_SYSIDLE_SHORT) { + newoldstate = cmpxchg(&full_sysidle_state, + oldstate, RCU_SYSIDLE_NOT); + if (oldstate == newoldstate && + oldstate == RCU_SYSIDLE_FULL_NOTED) { + rcu_kick_nohz_cpu(tick_do_timer_cpu); + return; /* We cleared it, done! */ + } + oldstate = newoldstate; + } + smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ +} + /* * Invoked to note entry to irq or task transition from idle. Note that * usermode execution does -not- count as idle here! The caller must @@ -2463,6 +2495,247 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) atomic_inc(&rdtp->dynticks_idle); smp_mb__after_atomic_inc(); WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); + + /* + * If we are the timekeeping CPU, we are permitted to be non-idle + * during a system-idle state. This must be the case, because + * the timekeeping CPU has to take scheduling-clock interrupts + * during the time that the system is transitioning to full + * system-idle state. This means that the timekeeping CPU must + * invoke rcu_sysidle_force_exit() directly if it does anything + * more than take a scheduling-clock interrupt. + */ + if (smp_processor_id() == tick_do_timer_cpu) + return; + + /* Update system-idle state: We are clearly no longer fully idle! */ + rcu_sysidle_force_exit(); +} + +/* + * Check to see if the current CPU is idle. Note that usermode execution + * does not count as idle. The caller must have disabled interrupts. + */ +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ + int cur; + unsigned long j; + struct rcu_dynticks *rdtp = rdp->dynticks; + + /* + * If some other CPU has already reported non-idle, if this is + * not the flavor of RCU that tracks sysidle state, or if this + * is an offline or the timekeeping CPU, nothing to do. + */ + if (!*isidle || rdp->rsp != rcu_sysidle_state || + cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) + return; + /* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */ + + /* Pick up current idle and NMI-nesting counter and check. */ + cur = atomic_read(&rdtp->dynticks_idle); + if (cur & 0x1) { + *isidle = false; /* We are not idle! */ + return; + } + smp_mb(); /* Read counters before timestamps. */ + + /* Pick up timestamps. */ + j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); + /* If this CPU entered idle more recently, update maxj timestamp. */ + if (ULONG_CMP_LT(*maxj, j)) + *maxj = j; +} + +/* + * Is this the flavor of RCU that is handling full-system idle? + */ +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return rsp == rcu_sysidle_state; +} + +/* + * Return a delay in jiffies based on the number of CPUs, rcu_node + * leaf fanout, and jiffies tick rate. The idea is to allow larger + * systems more time to transition to full-idle state in order to + * avoid the cache thrashing that otherwise occur on the state variable. + * Really small systems (less than a couple of tens of CPUs) should + * instead use a single global atomically incremented counter, and later + * versions of this will automatically reconfigure themselves accordingly. + */ +static unsigned long rcu_sysidle_delay(void) +{ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return 0; + return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); +} + +/* + * Advance the full-system-idle state. This is invoked when all of + * the non-timekeeping CPUs are idle. + */ +static void rcu_sysidle(unsigned long j) +{ + /* Check the current state. */ + switch (ACCESS_ONCE(full_sysidle_state)) { + case RCU_SYSIDLE_NOT: + + /* First time all are idle, so note a short idle period. */ + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; + break; + + case RCU_SYSIDLE_SHORT: + + /* + * Idle for a bit, time to advance to next state? + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); + break; + + case RCU_SYSIDLE_LONG: + + /* + * Do an additional check pass before advancing to full. + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); + break; + + default: + break; + } +} + +/* + * Found a non-idle non-timekeeping CPU, so kick the system-idle state + * back to the beginning. + */ +static void rcu_sysidle_cancel(void) +{ + smp_mb(); + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; +} + +/* + * Update the sysidle state based on the results of a force-quiescent-state + * scan of the CPUs' dyntick-idle state. + */ +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, + unsigned long maxj, bool gpkt) +{ + if (rsp != rcu_sysidle_state) + return; /* Wrong flavor, ignore. */ + if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return; /* Running state machine from timekeeping CPU. */ + if (isidle) + rcu_sysidle(maxj); /* More idle! */ + else + rcu_sysidle_cancel(); /* Idle is over. */ +} + +/* + * Wrapper for rcu_sysidle_report() when called from the grace-period + * kthread's context. + */ +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ + rcu_sysidle_report(rsp, isidle, maxj, true); +} + +/* Callback and function for forcing an RCU grace period. */ +struct rcu_sysidle_head { + struct rcu_head rh; + int inuse; +}; + +static void rcu_sysidle_cb(struct rcu_head *rhp) +{ + struct rcu_sysidle_head *rshp; + + /* + * The following memory barrier is needed to replace the + * memory barriers that would normally be in the memory + * allocator. + */ + smp_mb(); /* grace period precedes setting inuse. */ + + rshp = container_of(rhp, struct rcu_sysidle_head, rh); + ACCESS_ONCE(rshp->inuse) = 0; +} + +/* + * Check to see if the system is fully idle, other than the timekeeping CPU. + * The caller must have disabled interrupts. + */ +bool rcu_sys_is_idle(void) +{ + static struct rcu_sysidle_head rsh; + int rss = ACCESS_ONCE(full_sysidle_state); + + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) + return false; + + /* Handle small-system case by doing a full scan of CPUs. */ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { + int oldrss = rss - 1; + + /* + * One pass to advance to each state up to _FULL. + * Give up if any pass fails to advance the state. + */ + while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { + int cpu; + bool isidle = true; + unsigned long maxj = jiffies - ULONG_MAX / 4; + struct rcu_data *rdp; + + /* Scan all the CPUs looking for nonidle CPUs. */ + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rcu_sysidle_check_cpu(rdp, &isidle, &maxj); + if (!isidle) + break; + } + rcu_sysidle_report(rcu_sysidle_state, + isidle, maxj, false); + oldrss = rss; + rss = ACCESS_ONCE(full_sysidle_state); + } + } + + /* If this is the first observation of an idle period, record it. */ + if (rss == RCU_SYSIDLE_FULL) { + rss = cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); + return rss == RCU_SYSIDLE_FULL; + } + + smp_mb(); /* ensure rss load happens before later caller actions. */ + + /* If already fully idle, tell the caller (in case of races). */ + if (rss == RCU_SYSIDLE_FULL_NOTED) + return true; + + /* + * If we aren't there yet, and a grace period is not in flight, + * initiate a grace period. Either way, tell the caller that + * we are not there yet. We use an xchg() rather than an assignment + * to make up for the memory barriers that would otherwise be + * provided by the memory allocator. + */ + if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && + !rcu_gp_in_progress(rcu_sysidle_state) && + !rsh.inuse && xchg(&rsh.inuse, 1) == 0) + call_rcu(&rsh.rh, rcu_sysidle_cb); + return false; } /* @@ -2483,6 +2756,21 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) { } +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ +} + +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return false; +} + +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ +} + static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) { } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index c7d2fd67799..3381f098070 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -157,6 +157,33 @@ config NO_HZ_FULL_SYSIDLE Say N if you are unsure. +config NO_HZ_FULL_SYSIDLE_SMALL + int "Number of CPUs above which large-system approach is used" + depends on NO_HZ_FULL_SYSIDLE + range 1 NR_CPUS + default 8 + help + The full-system idle detection mechanism takes a lazy approach + on large systems, as is required to attain decent scalability. + However, on smaller systems, scalability is not anywhere near as + large a concern as is energy efficiency. The sysidle subsystem + therefore uses a fast but non-scalable algorithm for small + systems and a lazier but scalable algorithm for large systems. + This Kconfig parameter defines the number of CPUs in the largest + system that will be considered to be "small". + + The default value will be fine in most cases. Battery-powered + systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger + numbers of CPUs, and (3) are suffering from battery-lifetime + problems due to long sysidle latencies might wish to experiment + with larger values for this Kconfig parameter. On the other + hand, they might be even better served by disabling NO_HZ_FULL + entirely, given that NO_HZ_FULL is intended for HPC and + real-time workloads that at present do not tend to be run on + battery-powered systems. + + Take the default if you are unsure. + config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS -- cgit v1.2.3-70-g09d2 From eb75767be0e514f97bf1b5cec763696cfc7f7e2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Fri, 21 Jun 2013 17:10:40 -0700 Subject: nohz_full: Force RCU's grace-period kthreads onto timekeeping CPU Because RCU's quiescent-state-forcing mechanism is used to drive the full-system-idle state machine, and because this mechanism is executed by RCU's grace-period kthreads, this commit forces these kthreads to run on the timekeeping CPU (tick_do_timer_cpu). To do otherwise would mean that the RCU grace-period kthreads would force the system into non-idle state every time they drove the state machine, which would be just a bit on the futile side. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> --- kernel/rcutree.c | 1 + kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 21 ++++++++++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index eca70f4469c..64eaafb6c8f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1303,6 +1303,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); rsp->gp_flags = 0; /* Clear all flags: New grace period. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 6fd3659cf01..5f97eab602c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -560,6 +560,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, static bool is_sysidle_rcu_state(struct rcu_state *rsp); static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj); +static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 45ebba747af..130c97b027f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2531,7 +2531,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, if (!*isidle || rdp->rsp != rcu_sysidle_state || cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) return; - /* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */ + if (rcu_gp_in_progress(rdp->rsp)) + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); /* Pick up current idle and NMI-nesting counter and check. */ cur = atomic_read(&rdtp->dynticks_idle); @@ -2556,6 +2557,20 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return rsp == rcu_sysidle_state; } +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ + int cpu = ACCESS_ONCE(tick_do_timer_cpu); + + if (cpu < 0 || cpu >= nr_cpu_ids) + return; + if (raw_smp_processor_id() != cpu) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + /* * Return a delay in jiffies based on the number of CPUs, rcu_node * leaf fanout, and jiffies tick rate. The idea is to allow larger @@ -2766,6 +2781,10 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return false; } +static void rcu_bind_gp_kthread(void) +{ +} + static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj) { -- cgit v1.2.3-70-g09d2