diff options
Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r-- | kernel/rcutree.c | 311 |
1 files changed, 221 insertions, 90 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4154c9567a6..117218a4372 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -60,36 +60,44 @@ /* Data structures. */ -static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; - -#define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname##_state.node[0] }, \ - .levelcnt = { \ - NUM_RCU_LVL_0, /* root of hierarchy. */ \ - NUM_RCU_LVL_1, \ - NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, \ - NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ - }, \ +static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + +#define RCU_STATE_INITIALIZER(sname, cr) { \ + .level = { &sname##_state.node[0] }, \ + .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ - .orphan_nxttail = &structname##_state.orphan_nxtlist, \ - .orphan_donetail = &structname##_state.orphan_donelist, \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ - .n_force_qs = 0, \ - .n_force_qs_ngp = 0, \ - .name = #structname, \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .orphan_nxttail = &sname##_state.orphan_nxtlist, \ + .orphan_donetail = &sname##_state.orphan_donelist, \ + .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ + .name = #sname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); +struct rcu_state rcu_sched_state = + RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +LIST_HEAD(rcu_struct_flavors); + +/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +module_param(rcu_fanout_leaf, int, 0); +int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; +static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ + NUM_RCU_LVL_0, + NUM_RCU_LVL_1, + NUM_RCU_LVL_2, + NUM_RCU_LVL_3, + NUM_RCU_LVL_4, +}; +int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* * The rcu_scheduler_active variable transitions from zero to one just @@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; -/* State information for rcu_barrier() and friends. */ - -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; - /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -358,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -468,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) trace_rcu_dyntick("Error on exit: not idle task", oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -856,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) */ void rcu_cpu_stall_reset(void) { - rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_preempt_stall_reset(); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } static struct notifier_block rcu_panic_block = { @@ -1753,8 +1755,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) - break; /* So gcc recognizes the dead code. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ @@ -1796,9 +1796,10 @@ unlock_fqs_ret: * whom the rdp belongs. */ static void -__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +__rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -1834,11 +1835,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_process_callbacks(struct softirq_action *unused) { + struct rcu_state *rsp; + trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state, - &__get_cpu_var(rcu_sched_data)); - __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); - rcu_preempt_process_callbacks(); + for_each_rcu_flavor(rsp) + __rcu_process_callbacks(rsp); trace_rcu_utilization("End RCU core"); } @@ -2252,9 +2253,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_pending(int cpu) { - return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_pending(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + return 1; + return 0; } /* @@ -2264,20 +2268,41 @@ static int rcu_pending(int cpu) */ static int rcu_cpu_has_callbacks(int cpu) { + struct rcu_state *rsp; + /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_sched_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_cpu_has_callbacks(cpu); + for_each_rcu_flavor(rsp) + if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) + return 1; + return 0; +} + +/* + * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * the compiler is expected to optimize this away. + */ +static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, + int cpu, unsigned long done) +{ + trace_rcu_barrier(rsp->name, s, cpu, + atomic_read(&rsp->barrier_cpu_count), done); } /* * RCU callback function for _rcu_barrier(). If we are last, wake * up the task executing _rcu_barrier(). */ -static void rcu_barrier_callback(struct rcu_head *notused) +static void rcu_barrier_callback(struct rcu_head *rhp) { - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); + struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); + struct rcu_state *rsp = rdp->rsp; + + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + complete(&rsp->barrier_completion); + } else { + _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + } } /* @@ -2285,35 +2310,63 @@ static void rcu_barrier_callback(struct rcu_head *notused) */ static void rcu_barrier_func(void *type) { - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); + struct rcu_state *rsp = type; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + rsp->call(&rdp->barrier_head, rcu_barrier_callback); } /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(struct rcu_state *rsp, - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) +static void _rcu_barrier(struct rcu_state *rsp) { int cpu; unsigned long flags; struct rcu_data *rdp; - struct rcu_head rh; + struct rcu_data rd; + unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); + unsigned long snap_done; - init_rcu_head_on_stack(&rh); + init_rcu_head_on_stack(&rd.barrier_head); + _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rcu_barrier_mutex); + mutex_lock(&rsp->barrier_mutex); - smp_mb(); /* Prevent any prior operations from leaking in. */ + /* + * Ensure that all prior references, including to ->n_barrier_done, + * are ordered before the _rcu_barrier() machinery. + */ + smp_mb(); /* See above block comment. */ + + /* + * Recheck ->n_barrier_done to see if others did our work for us. + * This means checking ->n_barrier_done for an even-to-odd-to-even + * transition. The "if" expression below therefore rounds the old + * value up to the next even number and adds two before comparing. + */ + snap_done = ACCESS_ONCE(rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "Check", -1, snap_done); + if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + smp_mb(); /* caller's subsequent code after above check. */ + mutex_unlock(&rsp->barrier_mutex); + return; + } + + /* + * Increment ->n_barrier_done to avoid duplicate work. Use + * ACCESS_ONCE() to prevent the compiler from speculating + * the increment to precede the early-exit check. + */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); + smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* * Initialize the count to one rather than to zero in order to @@ -2332,8 +2385,8 @@ static void _rcu_barrier(struct rcu_state *rsp, * 6. Both rcu_barrier_callback() callbacks are invoked, awakening * us -- but before CPU 1's orphaned callbacks are invoked!!! */ - init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 1); + init_completion(&rsp->barrier_completion); + atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); @@ -2349,14 +2402,19 @@ static void _rcu_barrier(struct rcu_state *rsp, preempt_disable(); rdp = per_cpu_ptr(rsp->rda, cpu); if (cpu_is_offline(cpu)) { + _rcu_barrier_trace(rsp, "Offline", cpu, + rsp->n_barrier_done); preempt_enable(); while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { - smp_call_function_single(cpu, rcu_barrier_func, - (void *)call_rcu_func, 1); + _rcu_barrier_trace(rsp, "OnlineQ", cpu, + rsp->n_barrier_done); + smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { + _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + rsp->n_barrier_done); preempt_enable(); } } @@ -2373,24 +2431,32 @@ static void _rcu_barrier(struct rcu_state *rsp, rcu_adopt_orphan_cbs(rsp); rsp->rcu_barrier_in_progress = NULL; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - call_rcu_func(&rh, rcu_barrier_callback); + rd.rsp = rsp; + rsp->call(&rd.barrier_head, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) + complete(&rsp->barrier_completion); + + /* Increment ->n_barrier_done to prevent duplicate work. */ + smp_mb(); /* Keep increment after above mechanism. */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); + smp_mb(); /* Keep increment before caller's subsequent code. */ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ - wait_for_completion(&rcu_barrier_completion); + wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rcu_barrier_mutex); + mutex_unlock(&rsp->barrier_mutex); - destroy_rcu_head_on_stack(&rh); + destroy_rcu_head_on_stack(&rd.barrier_head); } /** @@ -2398,7 +2464,7 @@ static void _rcu_barrier(struct rcu_state *rsp, */ void rcu_barrier_bh(void) { - _rcu_barrier(&rcu_bh_state, call_rcu_bh); + _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -2407,7 +2473,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(&rcu_sched_state, call_rcu_sched); + _rcu_barrier(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); @@ -2500,9 +2566,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) static void __cpuinit rcu_prepare_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_sched_state, 0); - rcu_init_percpu_data(cpu, &rcu_bh_state, 0); - rcu_preempt_init_percpu_data(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_init_percpu_data(cpu, rsp, + strcmp(rsp->name, "rcu_preempt") == 0); } /* @@ -2514,6 +2582,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + struct rcu_state *rsp; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2538,18 +2607,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. */ - rcu_cleanup_dying_cpu(&rcu_bh_state); - rcu_cleanup_dying_cpu(&rcu_sched_state); - rcu_preempt_cleanup_dying_cpu(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); - rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); - rcu_preempt_cleanup_dead_cpu(cpu); + for_each_rcu_flavor(rsp) + rcu_cleanup_dead_cpu(cpu, rsp); break; default: break; @@ -2582,9 +2649,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i > 0; i--) + for (i = rcu_num_lvls - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; + rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -2594,7 +2661,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int i; cprv = NR_CPUS; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; @@ -2621,13 +2688,15 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ - for (i = 1; i < NUM_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) + rsp->levelcnt[i] = num_rcu_lvl[i]; + for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); /* Initialize the elements themselves, starting from the leaves. */ - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { @@ -2657,13 +2726,74 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - rnp = rsp->level[NUM_RCU_LVLS - 1]; + rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } + list_add(&rsp->flavors, &rcu_struct_flavors); +} + +/* + * Compute the rcu_node tree geometry from kernel parameters. This cannot + * replace the definitions in rcutree.h because those are needed to size + * the ->node array in the rcu_state structure. + */ +static void __init rcu_init_geometry(void) +{ + int i; + int j; + int n = nr_cpu_ids; + int rcu_capacity[MAX_RCU_LVLS + 1]; + + /* If the compile-time values are accurate, just leave. */ + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + return; + + /* + * Compute number of nodes that can be handled an rcu_node tree + * with the given number of levels. Setting rcu_capacity[0] makes + * some of the arithmetic easier. + */ + rcu_capacity[0] = 1; + rcu_capacity[1] = rcu_fanout_leaf; + for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + + /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Finally, the tree must be able to accommodate + * the configured number of CPUs. Complain and fall back to the + * compile-time values if these limits are exceeded. + */ + if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8 || + n > rcu_capacity[MAX_RCU_LVLS]) { + WARN_ON(1); + return; + } + + /* Calculate the number of rcu_nodes at each level of the tree. */ + for (i = 1; i <= MAX_RCU_LVLS; i++) + if (n <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) + num_rcu_lvl[j] = + DIV_ROUND_UP(n, rcu_capacity[i - j]); + rcu_num_lvls = i; + for (j = i + 1; j <= MAX_RCU_LVLS; j++) + num_rcu_lvl[j] = 0; + break; + } + + /* Calculate the total number of rcu_node structures. */ + rcu_num_nodes = 0; + for (i = 0; i <= MAX_RCU_LVLS; i++) + rcu_num_nodes += num_rcu_lvl[i]; + rcu_num_nodes -= n; } void __init rcu_init(void) @@ -2671,6 +2801,7 @@ void __init rcu_init(void) int cpu; rcu_bootup_announce(); + rcu_init_geometry(); rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); |