diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/rcu.h | 10 | ||||
-rw-r--r-- | kernel/rcupdate.c | 100 | ||||
-rw-r--r-- | kernel/rcutorture.c | 388 | ||||
-rw-r--r-- | kernel/rcutree.c | 150 | ||||
-rw-r--r-- | kernel/rcutree.h | 17 | ||||
-rw-r--r-- | kernel/rcutree_plugin.h | 424 | ||||
-rw-r--r-- | kernel/time/Kconfig | 50 |
7 files changed, 764 insertions, 375 deletions
diff --git a/kernel/rcu.h b/kernel/rcu.h index 0a90ccc65bf..77131966c4a 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -67,12 +67,15 @@ extern struct debug_obj_descr rcuhead_debug_descr; -static inline void debug_rcu_head_queue(struct rcu_head *head) +static inline int debug_rcu_head_queue(struct rcu_head *head) { - debug_object_activate(head, &rcuhead_debug_descr); + int r1; + + r1 = debug_object_activate(head, &rcuhead_debug_descr); debug_object_active_state(head, &rcuhead_debug_descr, STATE_RCU_HEAD_READY, STATE_RCU_HEAD_QUEUED); + return r1; } static inline void debug_rcu_head_unqueue(struct rcu_head *head) @@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) debug_object_deactivate(head, &rcuhead_debug_descr); } #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -static inline void debug_rcu_head_queue(struct rcu_head *head) +static inline int debug_rcu_head_queue(struct rcu_head *head) { + return 0; } static inline void debug_rcu_head_unqueue(struct rcu_head *head) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 14994d4e1a5..33eb4620aa1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -212,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head) } /* - * fixup_init is called when: - * - an active object is initialized - */ -static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_init(head, &rcuhead_debug_descr); - return 1; - default: - return 0; - } -} - -/* * fixup_activate is called when: * - an active object is activated * - an unknown object is activated (might be a statically initialized object) @@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) debug_object_init(head, &rcuhead_debug_descr); debug_object_activate(head, &rcuhead_debug_descr); return 0; - - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_activate(head, &rcuhead_debug_descr); - return 1; default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_free(head, &rcuhead_debug_descr); return 1; - default: - return 0; } } @@ -369,9 +271,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_init = rcuhead_fixup_init, .fixup_activate = rcuhead_fixup_activate, - .fixup_free = rcuhead_fixup_free, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 3d936f0fbcd..be63101c617 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -52,72 +52,78 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); -static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ -static int nfakewriters = 4; /* # fake writer threads */ -static int stat_interval = 60; /* Interval between stats, in seconds. */ - /* Zero means "only at end of test". */ -static bool verbose; /* Print more debug info. */ -static bool test_no_idle_hz = true; - /* Test RCU support for tickless idle CPUs. */ -static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ -static int stutter = 5; /* Start/stop testing interval (in sec) */ -static int irqreader = 1; /* RCU readers from irq (timers). */ -static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ -static int fqs_holdoff; /* Hold time within burst (us). */ -static int fqs_stutter = 3; /* Wait time between bursts (s). */ -static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ -static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ -static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ -static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ -static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ -static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ -static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ -static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ -static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ -static char *torture_type = "rcu"; /* What RCU implementation to torture. */ - -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int fqs_duration; module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); +static int fqs_holdoff; module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +static int fqs_stutter = 3; module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +static bool gp_exp; +module_param(gp_exp, bool, 0444); +MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); +static bool gp_normal; +module_param(gp_normal, bool, 0444); +MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +static int irqreader = 1; +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int n_barrier_cbs; module_param(n_barrier_cbs, int, 0444); MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int nfakewriters = 4; +module_param(nfakewriters, int, 0444); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); +static int nreaders = -1; +module_param(nreaders, int, 0444); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +static int object_debug; +module_param(object_debug, int, 0444); +MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); +static int onoff_holdoff; module_param(onoff_holdoff, int, 0444); MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); +static int onoff_interval; +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int shuffle_interval = 3; +module_param(shuffle_interval, int, 0444); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +static int shutdown_secs; module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); +static int stall_cpu; module_param(stall_cpu, int, 0444); MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +static int stall_cpu_holdoff = 10; module_param(stall_cpu_holdoff, int, 0444); MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); +static int stat_interval = 60; +module_param(stat_interval, int, 0644); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +static int stutter = 5; +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +static int test_boost = 1; module_param(test_boost, int, 0444); MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static int test_boost_duration = 4; module_param(test_boost_duration, int, 0444); MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); +static int test_boost_interval = 7; +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static bool test_no_idle_hz = true; +module_param(test_no_idle_hz, bool, 0444); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +static bool verbose; +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ @@ -360,6 +366,7 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); + void (*exp_sync)(void); void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); @@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) call_rcu(&p->rtort_rcu, rcu_torture_cb); } -static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu" -}; - -static void rcu_sync_torture_deferred_free(struct rcu_torture *p) -{ - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - - cur_ops->sync(); - list_add(&p->rtort_free, &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } -} - static void rcu_sync_torture_init(void) { INIT_LIST_HEAD(&rcu_torture_removed); } -static struct rcu_torture_ops rcu_sync_ops = { +static struct rcu_torture_ops rcu_ops = { .init = rcu_sync_torture_init, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, + .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu_sync" -}; - -static struct rcu_torture_ops rcu_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_expedited, - .call = NULL, - .cb_barrier = NULL, + .exp_sync = synchronize_rcu_expedited, + .call = call_rcu, + .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .can_boost = rcu_can_boost(), - .name = "rcu_expedited" + .name = "rcu" }; /* @@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, + .init = rcu_sync_torture_init, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, .call = call_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, @@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = { .name = "rcu_bh" }; -static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_bh, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh_sync" -}; - -static struct rcu_torture_ops rcu_bh_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_bh_expedited, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh_expedited" -}; - /* * Definitions for srcu torture testing. */ @@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page) return cnt; } +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + static struct rcu_torture_ops srcu_ops = { .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, @@ -675,45 +602,13 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .name = "srcu" }; -static struct rcu_torture_ops srcu_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_sync" -}; - -static void srcu_torture_synchronize_expedited(void) -{ - synchronize_srcu_expedited(&srcu_ctl); -} - -static struct rcu_torture_ops srcu_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize_expedited, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_expedited" -}; - /* * Definitions for sched torture testing. */ @@ -742,6 +637,8 @@ static struct rcu_torture_ops sched_ops = { .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .call = call_rcu_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -749,35 +646,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops sched_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_sched, - .cb_barrier = NULL, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .name = "sched_sync" -}; - -static struct rcu_torture_ops sched_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_sched_expedited, - .cb_barrier = NULL, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched_expedited" -}; - /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -927,9 +795,10 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { + bool exp; int i; - long oldbatch = rcu_batches_completed(); struct rcu_torture *rp; + struct rcu_torture *rp1; struct rcu_torture *old_rp; static DEFINE_RCU_RANDOM(rand); @@ -954,10 +823,33 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - cur_ops->deferred_free(old_rp); + if (gp_normal == gp_exp) + exp = !!(rcu_random(&rand) & 0x80); + else + exp = gp_exp; + if (!exp) { + cur_ops->deferred_free(old_rp); + } else { + cur_ops->exp_sync(); + list_add(&old_rp->rtort_free, + &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, + &rcu_torture_removed, + rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= + RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } + } } rcutorture_record_progress(++rcu_torture_current_version); - oldbatch = cur_ops->completed(); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); @@ -983,10 +875,18 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) + rcu_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); - else + } else if (gp_normal == gp_exp) { + if (rcu_random(&rand) & 0x80) + cur_ops->sync(); + else + cur_ops->exp_sync(); + } else if (gp_normal) { cur_ops->sync(); + } else { + cur_ops->exp_sync(); + } rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -1534,7 +1434,13 @@ rcu_torture_onoff(void *arg) torture_type, cpu); starttime = jiffies; n_online_attempts++; - if (cpu_up(cpu) == 0) { + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { if (verbose) pr_alert("%s" TORTURE_FLAG "rcu_torture_onoff task: onlined %d\n", @@ -1934,6 +1840,62 @@ rcu_torture_cleanup(void) rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); } +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static void rcu_torture_leak_cb(struct rcu_head *rhp) +{ +} + +static void rcu_torture_err_cb(struct rcu_head *rhp) +{ + /* + * This -might- happen due to race conditions, but is unlikely. + * The scenario that leads to this happening is that the + * first of the pair of duplicate callbacks is queued, + * someone else starts a grace period that includes that + * callback, then the second of the pair must wait for the + * next grace period. Unlikely, but can happen. If it + * does happen, the debug-objects subsystem won't have splatted. + */ + pr_alert("rcutorture: duplicated callback was invoked.\n"); +} +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +/* + * Verify that double-free causes debug-objects to complain, but only + * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test + * cannot be carried out. + */ +static void rcu_test_debug_objects(void) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + struct rcu_head rh1; + struct rcu_head rh2; + + init_rcu_head_on_stack(&rh1); + init_rcu_head_on_stack(&rh2); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + + /* Try to queue the rh2 pair of callbacks for the same grace period. */ + preempt_disable(); /* Prevent preemption from interrupting test. */ + rcu_read_lock(); /* Make it impossible to finish a grace period. */ + call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + local_irq_disable(); /* Make it harder to start a new grace period. */ + call_rcu(&rh2, rcu_torture_leak_cb); + call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + local_irq_enable(); + rcu_read_unlock(); + preempt_enable(); + + /* Wait for them all to get done so we can safely return. */ + rcu_barrier(); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + destroy_rcu_head_on_stack(&rh1); + destroy_rcu_head_on_stack(&rh2); +#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); +#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +} + static int __init rcu_torture_init(void) { @@ -1941,11 +1903,9 @@ rcu_torture_init(void) int cpu; int firsterr = 0; int retval; - static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, - &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, - &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; + static struct rcu_torture_ops *torture_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + }; mutex_lock(&fullstop_mutex); @@ -2163,6 +2123,8 @@ rcu_torture_init(void) firsterr = retval; goto unwind; } + if (object_debug) + rcu_test_debug_objects(); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 338f1d1c1c6..32618b3fe4e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -54,6 +54,7 @@ #include <linux/stop_machine.h> #include <linux/random.h> #include <linux/ftrace_event.h> +#include <linux/suspend.h> #include "rcutree.h" #include <trace/events/rcu.h> @@ -224,6 +225,10 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, + .dynticks_idle = ATOMIC_INIT(1), +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ @@ -242,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644); static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); +static void force_qs_rnp(struct rcu_state *rsp, + int (*f)(struct rcu_data *rsp, bool *isidle, + unsigned long *maxj), + bool *isidle, unsigned long *maxj); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -427,6 +435,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); + rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -444,27 +453,6 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } - -/** - * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace - * after the current irq returns. - * - * This is similar to rcu_user_enter() but in the context of a non-nesting - * irq. After this call, RCU enters into idle mode when the interrupt - * returns. - */ -void rcu_user_enter_after_irq(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - /* Ensure this irq is interrupting a non-idle RCU state. */ - WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); - rdtp->dynticks_nesting = 1; - local_irq_restore(flags); -} #endif /* CONFIG_RCU_USER_QS */ /** @@ -498,6 +486,7 @@ void rcu_irq_exit(void) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else rcu_eqs_enter_common(rdtp, oldval, true); + rcu_sysidle_enter(rdtp, 1); local_irq_restore(flags); } @@ -566,6 +555,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); + rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -581,28 +571,6 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } - -/** - * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace - * idle mode after the current non-nesting irq returns. - * - * This is similar to rcu_user_exit() but in the context of an irq. - * This is called when the irq has interrupted a userspace RCU idle mode - * context. When the current non-nesting interrupt returns after this call, - * the CPU won't restore the RCU idle mode. - */ -void rcu_user_exit_after_irq(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - /* Ensure we are interrupting an RCU idle mode. */ - WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); - rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; - local_irq_restore(flags); -} #endif /* CONFIG_RCU_USER_QS */ /** @@ -639,6 +607,7 @@ void rcu_irq_enter(void) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(rdtp, oldval, true); + rcu_sysidle_exit(rdtp, 1); local_irq_restore(flags); } @@ -762,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void) * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp) +static int dyntick_save_progress_counter(struct rcu_data *rdp, + bool *isidle, unsigned long *maxj) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp->dynticks_snap & 0x1) == 0; } @@ -774,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * idle state since the last call to dyntick_save_progress_counter() * for this same CPU, or by virtue of having been offline. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, + bool *isidle, unsigned long *maxj) { unsigned int curr; unsigned int snap; @@ -1332,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); rsp->gp_flags = 0; /* Clear all flags: New grace period. */ @@ -1396,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp) int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { int fqs_state = fqs_state_in; + bool isidle = false; + unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } + force_qs_rnp(rsp, dyntick_save_progress_counter, + &isidle, &maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + isidle = 0; + force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { @@ -1575,10 +1557,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, /* * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Deter - * the wakeup to interrupt context. + * could cause possible deadlocks with the rq->lock. Defer + * the wakeup to interrupt context. And don't bother waking + * up the running kthread. */ - irq_work_queue(&rsp->wakeup_work); + if (current != rsp->gp_kthread) + irq_work_queue(&rsp->wakeup_work); } /* @@ -2104,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) +static void force_qs_rnp(struct rcu_state *rsp, + int (*f)(struct rcu_data *rsp, bool *isidle, + unsigned long *maxj), + bool *isidle, unsigned long *maxj) { unsigned long bit; int cpu; @@ -2127,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu))) - mask |= bit; + if ((rnp->qsmask & bit) != 0) { + if ((rnp->qsmaskinit & bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { @@ -2304,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, } /* + * RCU callback function to leak a callback. + */ +static void rcu_leak_callback(struct rcu_head *rhp) +{ +} + +/* * Helper function for call_rcu() and friends. The cpu argument will * normally be -1, indicating "currently running CPU". It may specify * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() @@ -2317,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_data *rdp; WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ - debug_rcu_head_queue(head); + if (debug_rcu_head_queue(head)) { + /* Probable double call_rcu(), so leak the callback. */ + ACCESS_ONCE(head->func) = rcu_leak_callback; + WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); + return; + } head->func = func; head->next = NULL; @@ -2802,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp) * transition. The "if" expression below therefore rounds the old * value up to the next even number and adds two before comparing. */ - snap_done = ACCESS_ONCE(rsp->n_barrier_done); + snap_done = rsp->n_barrier_done; _rcu_barrier_trace(rsp, "Check", -1, snap_done); - if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + + /* + * If the value in snap is odd, we needed to wait for the current + * rcu_barrier() to complete, then wait for the next one, in other + * words, we need the value of snap_done to be three larger than + * the value of snap. On the other hand, if the value in snap is + * even, we only had to wait for the next rcu_barrier() to complete, + * in other words, we need the value of snap_done to be only two + * greater than the value of snap. The "(snap + 3) & ~0x1" computes + * this for us (thank you, Linus!). + */ + if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); @@ -2947,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->blimit = blimit; init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -3032,6 +3046,25 @@ static int rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } +static int rcu_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ + rcu_expedited = 1; + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + rcu_expedited = 0; + break; + default: + break; + } + return NOTIFY_OK; +} + /* * Spawn the kthread that handles this RCU flavor's grace periods. */ @@ -3273,6 +3306,7 @@ void __init rcu_init(void) * or the scheduler are operational. */ cpu_notifier(rcu_cpu_notify, 0); + pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cbdeac6cea9..5f97eab602c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,6 +88,14 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + long long dynticks_idle_nesting; + /* irq/process nesting level from idle. */ + atomic_t dynticks_idle; /* Even value for idle, else odd. */ + /* "Idle" excludes userspace execution. */ + unsigned long dynticks_idle_jiffies; + /* End of last non-NMI non-idle period. */ +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; @@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); +static void rcu_bind_gp_kthread(void); +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index dff86f53ee0..130c97b027f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> -#include <linux/tick.h> +#include "time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -2373,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu) smp_send_reschedule(cpu); #endif /* #ifdef CONFIG_NO_HZ_FULL */ } + + +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + +/* + * Define RCU flavor that holds sysidle state. This needs to be the + * most active flavor of RCU. + */ +#ifdef CONFIG_PREEMPT_RCU +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; +#else /* #ifdef CONFIG_PREEMPT_RCU */ +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +static int full_sysidle_state; /* Current system-idle state. */ +#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ +#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ +#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ +#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ +#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ + +/* + * Invoked to note exit from irq or task transition to idle. Note that + * usermode execution does -not- count as idle here! After all, we want + * to detect full-system idle states, not RCU quiescent states and grace + * periods. The caller must have disabled interrupts. + */ +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ + unsigned long j; + + /* Adjust nesting, check for fully idle. */ + if (irq) { + rdtp->dynticks_idle_nesting--; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + if (rdtp->dynticks_idle_nesting != 0) + return; /* Still not fully idle. */ + } else { + if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == + DYNTICK_TASK_NEST_VALUE) { + rdtp->dynticks_idle_nesting = 0; + } else { + rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + return; /* Still not fully idle. */ + } + } + + /* Record start of fully idle period. */ + j = jiffies; + ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); +} + +/* + * Unconditionally force exit from full system-idle state. This is + * invoked when a normal CPU exits idle, but must be called separately + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this + * is that the timekeeping CPU is permitted to take scheduling-clock + * interrupts while the system is in system-idle state, and of course + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock + * interrupt from any other type of interrupt. + */ +void rcu_sysidle_force_exit(void) +{ + int oldstate = ACCESS_ONCE(full_sysidle_state); + int newoldstate; + + /* + * Each pass through the following loop attempts to exit full + * system-idle state. If contention proves to be a problem, + * a trylock-based contention tree could be used here. + */ + while (oldstate > RCU_SYSIDLE_SHORT) { + newoldstate = cmpxchg(&full_sysidle_state, + oldstate, RCU_SYSIDLE_NOT); + if (oldstate == newoldstate && + oldstate == RCU_SYSIDLE_FULL_NOTED) { + rcu_kick_nohz_cpu(tick_do_timer_cpu); + return; /* We cleared it, done! */ + } + oldstate = newoldstate; + } + smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ +} + +/* + * Invoked to note entry to irq or task transition from idle. Note that + * usermode execution does -not- count as idle here! The caller must + * have disabled interrupts. + */ +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ + /* Adjust nesting, check for already non-idle. */ + if (irq) { + rdtp->dynticks_idle_nesting++; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + if (rdtp->dynticks_idle_nesting != 1) + return; /* Already non-idle. */ + } else { + /* + * Allow for irq misnesting. Yes, it really is possible + * to enter an irq handler then never leave it, and maybe + * also vice versa. Handle both possibilities. + */ + if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { + rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + return; /* Already non-idle. */ + } else { + rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; + } + } + + /* Record end of idle period. */ + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); + + /* + * If we are the timekeeping CPU, we are permitted to be non-idle + * during a system-idle state. This must be the case, because + * the timekeeping CPU has to take scheduling-clock interrupts + * during the time that the system is transitioning to full + * system-idle state. This means that the timekeeping CPU must + * invoke rcu_sysidle_force_exit() directly if it does anything + * more than take a scheduling-clock interrupt. + */ + if (smp_processor_id() == tick_do_timer_cpu) + return; + + /* Update system-idle state: We are clearly no longer fully idle! */ + rcu_sysidle_force_exit(); +} + +/* + * Check to see if the current CPU is idle. Note that usermode execution + * does not count as idle. The caller must have disabled interrupts. + */ +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ + int cur; + unsigned long j; + struct rcu_dynticks *rdtp = rdp->dynticks; + + /* + * If some other CPU has already reported non-idle, if this is + * not the flavor of RCU that tracks sysidle state, or if this + * is an offline or the timekeeping CPU, nothing to do. + */ + if (!*isidle || rdp->rsp != rcu_sysidle_state || + cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) + return; + if (rcu_gp_in_progress(rdp->rsp)) + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); + + /* Pick up current idle and NMI-nesting counter and check. */ + cur = atomic_read(&rdtp->dynticks_idle); + if (cur & 0x1) { + *isidle = false; /* We are not idle! */ + return; + } + smp_mb(); /* Read counters before timestamps. */ + + /* Pick up timestamps. */ + j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); + /* If this CPU entered idle more recently, update maxj timestamp. */ + if (ULONG_CMP_LT(*maxj, j)) + *maxj = j; +} + +/* + * Is this the flavor of RCU that is handling full-system idle? + */ +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return rsp == rcu_sysidle_state; +} + +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ + int cpu = ACCESS_ONCE(tick_do_timer_cpu); + + if (cpu < 0 || cpu >= nr_cpu_ids) + return; + if (raw_smp_processor_id() != cpu) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/* + * Return a delay in jiffies based on the number of CPUs, rcu_node + * leaf fanout, and jiffies tick rate. The idea is to allow larger + * systems more time to transition to full-idle state in order to + * avoid the cache thrashing that otherwise occur on the state variable. + * Really small systems (less than a couple of tens of CPUs) should + * instead use a single global atomically incremented counter, and later + * versions of this will automatically reconfigure themselves accordingly. + */ +static unsigned long rcu_sysidle_delay(void) +{ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return 0; + return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); +} + +/* + * Advance the full-system-idle state. This is invoked when all of + * the non-timekeeping CPUs are idle. + */ +static void rcu_sysidle(unsigned long j) +{ + /* Check the current state. */ + switch (ACCESS_ONCE(full_sysidle_state)) { + case RCU_SYSIDLE_NOT: + + /* First time all are idle, so note a short idle period. */ + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; + break; + + case RCU_SYSIDLE_SHORT: + + /* + * Idle for a bit, time to advance to next state? + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); + break; + + case RCU_SYSIDLE_LONG: + + /* + * Do an additional check pass before advancing to full. + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); + break; + + default: + break; + } +} + +/* + * Found a non-idle non-timekeeping CPU, so kick the system-idle state + * back to the beginning. + */ +static void rcu_sysidle_cancel(void) +{ + smp_mb(); + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; +} + +/* + * Update the sysidle state based on the results of a force-quiescent-state + * scan of the CPUs' dyntick-idle state. + */ +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, + unsigned long maxj, bool gpkt) +{ + if (rsp != rcu_sysidle_state) + return; /* Wrong flavor, ignore. */ + if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return; /* Running state machine from timekeeping CPU. */ + if (isidle) + rcu_sysidle(maxj); /* More idle! */ + else + rcu_sysidle_cancel(); /* Idle is over. */ +} + +/* + * Wrapper for rcu_sysidle_report() when called from the grace-period + * kthread's context. + */ +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ + rcu_sysidle_report(rsp, isidle, maxj, true); +} + +/* Callback and function for forcing an RCU grace period. */ +struct rcu_sysidle_head { + struct rcu_head rh; + int inuse; +}; + +static void rcu_sysidle_cb(struct rcu_head *rhp) +{ + struct rcu_sysidle_head *rshp; + + /* + * The following memory barrier is needed to replace the + * memory barriers that would normally be in the memory + * allocator. + */ + smp_mb(); /* grace period precedes setting inuse. */ + + rshp = container_of(rhp, struct rcu_sysidle_head, rh); + ACCESS_ONCE(rshp->inuse) = 0; +} + +/* + * Check to see if the system is fully idle, other than the timekeeping CPU. + * The caller must have disabled interrupts. + */ +bool rcu_sys_is_idle(void) +{ + static struct rcu_sysidle_head rsh; + int rss = ACCESS_ONCE(full_sysidle_state); + + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) + return false; + + /* Handle small-system case by doing a full scan of CPUs. */ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { + int oldrss = rss - 1; + + /* + * One pass to advance to each state up to _FULL. + * Give up if any pass fails to advance the state. + */ + while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { + int cpu; + bool isidle = true; + unsigned long maxj = jiffies - ULONG_MAX / 4; + struct rcu_data *rdp; + + /* Scan all the CPUs looking for nonidle CPUs. */ + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rcu_sysidle_check_cpu(rdp, &isidle, &maxj); + if (!isidle) + break; + } + rcu_sysidle_report(rcu_sysidle_state, + isidle, maxj, false); + oldrss = rss; + rss = ACCESS_ONCE(full_sysidle_state); + } + } + + /* If this is the first observation of an idle period, record it. */ + if (rss == RCU_SYSIDLE_FULL) { + rss = cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); + return rss == RCU_SYSIDLE_FULL; + } + + smp_mb(); /* ensure rss load happens before later caller actions. */ + + /* If already fully idle, tell the caller (in case of races). */ + if (rss == RCU_SYSIDLE_FULL_NOTED) + return true; + + /* + * If we aren't there yet, and a grace period is not in flight, + * initiate a grace period. Either way, tell the caller that + * we are not there yet. We use an xchg() rather than an assignment + * to make up for the memory barriers that would otherwise be + * provided by the memory allocator. + */ + if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && + !rcu_gp_in_progress(rcu_sysidle_state) && + !rsh.inuse && xchg(&rsh.inuse, 1) == 0) + call_rcu(&rsh.rh, rcu_sysidle_cb); + return false; +} + +/* + * Initialize dynticks sysidle state for CPUs coming online. + */ +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ + rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; +} + +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ +} + +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ +} + +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ +} + +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return false; +} + +static void rcu_bind_gp_kthread(void) +{ +} + +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ +} + +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012..3381f098070 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -134,6 +134,56 @@ config NO_HZ_FULL_ALL Note the boot CPU will still be kept outside the range to handle the timekeeping duty. +config NO_HZ_FULL_SYSIDLE + bool "Detect full-system idle state for full dynticks system" + depends on NO_HZ_FULL + default n + help + At least one CPU must keep the scheduling-clock tick running for + timekeeping purposes whenever there is a non-idle CPU, where + "non-idle" also includes dynticks CPUs as long as they are + running non-idle tasks. Because the underlying adaptive-tick + support cannot distinguish between all CPUs being idle and + all CPUs each running a single task in dynticks mode, the + underlying support simply ensures that there is always a CPU + handling the scheduling-clock tick, whether or not all CPUs + are idle. This Kconfig option enables scalable detection of + the all-CPUs-idle state, thus allowing the scheduling-clock + tick to be disabled when all CPUs are idle. Note that scalable + detection of the all-CPUs-idle state means that larger systems + will be slower to declare the all-CPUs-idle state. + + Say Y if you would like to help debug all-CPUs-idle detection. + + Say N if you are unsure. + +config NO_HZ_FULL_SYSIDLE_SMALL + int "Number of CPUs above which large-system approach is used" + depends on NO_HZ_FULL_SYSIDLE + range 1 NR_CPUS + default 8 + help + The full-system idle detection mechanism takes a lazy approach + on large systems, as is required to attain decent scalability. + However, on smaller systems, scalability is not anywhere near as + large a concern as is energy efficiency. The sysidle subsystem + therefore uses a fast but non-scalable algorithm for small + systems and a lazier but scalable algorithm for large systems. + This Kconfig parameter defines the number of CPUs in the largest + system that will be considered to be "small". + + The default value will be fine in most cases. Battery-powered + systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger + numbers of CPUs, and (3) are suffering from battery-lifetime + problems due to long sysidle latencies might wish to experiment + with larger values for this Kconfig parameter. On the other + hand, they might be even better served by disabling NO_HZ_FULL + entirely, given that NO_HZ_FULL is intended for HPC and + real-time workloads that at present do not tend to be run on + battery-powered systems. + + Take the default if you are unsure. + config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |