From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2..f5934954fa9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, -- cgit v1.2.3-70-g09d2 From b9bf3121af348d9255f1c917830fe8c2df52efcb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:47 +0900 Subject: percpu: use DEFINE_PER_CPU_SHARED_ALIGNED() There are a few places where ___cacheline_aligned* is used with DEFINE_PER_CPU(). Use DEFINE_PER_CPU_SHARED_ALIGNED() instead. DEFINE_PER_CPU_SHARED_ALIGNED() applies alignment only on SMPs. 
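As an illustrative sketch of the conversion pattern (the variable name below is hypothetical, not taken from the patch):

    /* before: the alignment annotation is open-coded at the definition */
    static DEFINE_PER_CPU(spinlock_t, example_lock) ____cacheline_aligned_in_smp;

    /* after: the macro carries the annotation itself, so the alignment
     * is requested only on SMP builds */
    static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, example_lock);

For the ____cacheline_aligned_in_smp users this is a purely mechanical change; for the unconditional ____cacheline_aligned users in net/rds it additionally drops the alignment on UP, as discussed next.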
While all other converted places used _in_smp variant or only get compiled for SMP, net/rds used unconditional ____cacheline_aligned. I don't see any reason these data structures should be aligned on UP and thus converted together. Signed-off-by: Tejun Heo Cc: Mike Frysinger Cc: Tony Luck Cc: Andy Grover --- arch/blackfin/mm/sram-alloc.c | 6 +++--- arch/ia64/kernel/smp.c | 3 ++- kernel/sched.c | 4 ++-- net/rds/ib_stats.c | 2 +- net/rds/iw_stats.c | 2 +- net/rds/page.c | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c index 0bc3c4ef0aa..99e4dbb1dfd 100644 --- a/arch/blackfin/mm/sram-alloc.c +++ b/arch/blackfin/mm/sram-alloc.c @@ -42,9 +42,9 @@ #include #include "blackfin_sram.h" -static DEFINE_PER_CPU(spinlock_t, l1sram_lock) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(spinlock_t, l1_data_sram_lock) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(spinlock_t, l1_inst_sram_lock) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1sram_lock); +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_data_sram_lock); +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_inst_sram_lock); static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp; /* the data structure for L1 scratchpad and DATA SRAM */ diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 94cf78ba28f..93ebfea43c6 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -58,7 +58,8 @@ static struct local_tlb_flush_counts { unsigned int count; } __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -static DEFINE_PER_CPU(unsigned short [NR_CPUS], shadow_flush_counts) ____cacheline_aligned; +static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS], + shadow_flush_counts); #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e..34fd81d2178 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -318,12 +318,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 02e3e3d50d4..301ae51ae40 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -37,7 +37,7 @@ #include "rds.h" #include "ib.h" -DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static char *rds_ib_stat_names[] = { "ib_connect_raced", diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c index ccc7e8f0bf0..fafea3cc92d 100644 --- a/net/rds/iw_stats.c +++ b/net/rds/iw_stats.c @@ -37,7 +37,7 @@ #include "rds.h" #include "iw.h" -DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); static char *rds_iw_stat_names[] = { 
"iw_connect_raced", diff --git a/net/rds/page.c b/net/rds/page.c index c460743a89a..de7bb84bcd7 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -39,7 +39,7 @@ struct rds_page_remainder { unsigned long r_offset; }; -DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); /* * returns 0 on success or -errno on failure. -- cgit v1.2.3-70-g09d2 From 245b2e70eabd797932adb263a65da0bab3711753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:48 +0900 Subject: percpu: clean up percpu variable definitions Percpu variable definition is about to be updated such that all percpu symbols including the static ones must be unique. Update percpu variable definitions accordingly. * as,cfq: rename ioc_count uniquely * cpufreq: rename cpu_dbs_info uniquely * xen: move nesting_count out of xen_evtchn_do_upcall() and rename it * mm: move ratelimits out of balance_dirty_pages_ratelimited_nr() and rename it * ipv4,6: rename cookie_scratch uniquely * x86 perf_counter: rename prev_left to pmc_prev_left, irq_entry to pmc_irq_entry and nmi_entry to pmc_nmi_entry * perf_counter: rename disable_count to perf_disable_count * ftrace: rename test_event_disable to ftrace_test_event_disable * kmemleak: rename test_pointer to kmemleak_test_pointer * mce: rename next_interval to mce_next_interval [ Impact: percpu usage cleanups, no duplicate static percpu var names ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ivan Kokshaysky Cc: Jens Axboe Cc: Dave Jones Cc: Jeremy Fitzhardinge Cc: linux-mm Cc: David S. Miller Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Li Zefan Cc: Catalin Marinas Cc: Andi Kleen --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- block/as-iosched.c | 10 +++++----- block/cfq-iosched.c | 10 +++++----- drivers/cpufreq/cpufreq_conservative.c | 12 ++++++------ drivers/cpufreq/cpufreq_ondemand.c | 15 ++++++++------- drivers/xen/events.c | 9 +++++---- kernel/perf_counter.c | 6 +++--- kernel/trace/trace_events.c | 6 +++--- mm/kmemleak-test.c | 6 +++--- mm/page-writeback.c | 5 +++-- net/ipv4/syncookies.c | 5 +++-- net/ipv6/syncookies.c | 5 +++-- 13 files changed, 58 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..cba8cd3e957 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. 
*/ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1914,7 +1914,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4946288d683..5fdf63aaaba 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -901,7 +901,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -1089,7 +1089,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1561,8 +1561,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static void @@ -1709,9 +1709,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6ee1d..ce8ba57c655 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. 
*/ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 833ec18eaa6..0f1cc7d3855 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -1422,7 +1422,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1431,7 +1431,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1557,7 +1557,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -2658,7 +2658,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7fc58af748b..a7ef465c83b 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -65,7 +65,7 @@ struct cpu_dbs_info_s { int cpu; unsigned int enable:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -138,7 +138,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, + struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info, freq->cpu); struct cpufreq_policy *policy; @@ -298,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -388,7 +388,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cputime64_t cur_wall_time, cur_idle_time; unsigned int idle_time, 
wall_time; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -528,7 +528,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -548,7 +548,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 1911d172935..36f292a7bd0 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -73,7 +73,7 @@ struct cpu_dbs_info_s { unsigned int enable:1, sample_type:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -151,7 +151,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_hi, freq_lo; unsigned int index = 0; unsigned int jiffies_total, jiffies_hi, jiffies_lo; - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, + policy->cpu); if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; @@ -196,7 +197,7 @@ static void ondemand_powersave_bias_init(void) { int i; for_each_online_cpu(i) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i); dbs_info->freq_table = cpufreq_frequency_get_table(i); dbs_info->freq_lo = 0; } @@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -391,7 +392,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) unsigned int load, load_freq; int freq_avg; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -548,7 +549,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -570,7 +571,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ab581fa6268..7d2987e9b1b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. 
For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); unsigned count; exit_idle(); @@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__get_cpu_var(xed_nesting_count)++) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; } while(count != 1); out: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea..1fd7a2e7575 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -98,16 +98,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count); void __perf_disable(void) { - __get_cpu_var(disable_count)++; + __get_cpu_var(perf_disable_count)++; } bool __perf_enable(void) { - return !--__get_cpu_var(disable_count); + return !--__get_cpu_var(perf_disable_count); } void perf_disable(void) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..54b1de5074b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1318,7 +1318,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1334,7 +1334,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1352,7 +1352,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f52..177a5169bbd 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. 
This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935..2c075dcf03d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -607,6 +607,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -624,7 +626,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -637,7 +638,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { *p = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 84d90f2799b..a6e0e077ac3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,12 +37,13 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 23d0d6db046..6b6ae913b5d 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining -- cgit v1.2.3-70-g09d2 From 134e63756d5f3d0f7604dfcca847b09d1b14fd66 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jul 2009 09:51:34 +0000 Subject: genetlink: make netns aware This makes generic netlink network namespace aware. No generic netlink families except for the controller family are made namespace aware, they need to be checked one by one and then set the family->netnsok member to true. 
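As a sketch of what that opt-in looks like once a family has been audited (the family name and maxattr value here are hypothetical):

    static struct genl_family example_genl_family = {
            .id       = GENL_ID_GENERATE,
            .name     = "example",
            .version  = 1,
            .maxattr  = 4,
            .netnsok  = true,   /* audited: usable outside init_net */
    };

A family that leaves netnsok unset keeps the old behaviour and is visible only in the initial network namespace.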
A new function genlmsg_multicast_netns() is introduced to allow sending a multicast message in a given namespace, for example when it applies to an object that lives in that namespace, a new function genlmsg_multicast_allns() to send a message to all network namespaces (for objects that do not have an associated netns). The function genlmsg_multicast() is changed to multicast the message in just init_net, which is currently correct for all generic netlink families since they only work in init_net right now. Some will later want to work in all net namespaces because they do not care about the netns at all -- those will have to be converted to use one of the new functions genlmsg_multicast_allns() or genlmsg_multicast_netns() whenever they are made netns aware in some way. After this patch families can easily decide whether or not they should be available in all net namespaces. Many genl families us it for objects not related to networking and should therefore be available in all namespaces, but that will have to be done on a per family basis. Note that this doesn't touch on the checkpoint/restart problem where network namespaces could be used, genl families and multicast groups are numbered globally and I see no easy way of changing that, especially since it must be possible to multicast to all network namespaces for those families that do not care about netns. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- fs/dlm/netlink.c | 2 +- include/net/genetlink.h | 66 +++++++++++++-- include/net/net_namespace.h | 2 + kernel/taskstats.c | 10 +-- net/irda/irnetlink.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- net/netlink/genetlink.c | 186 ++++++++++++++++++++++++++++++++--------- net/tipc/netlink.c | 2 +- net/wireless/nl80211.c | 14 ++-- 9 files changed, 223 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index ccc9d62c462..55ea369f43a 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb) return rv; } - return genlmsg_unicast(skb, listener_nlpid); + return genlmsg_unicast(&init_net, skb, listener_nlpid); } static int user_cmd(struct sk_buff *skb, struct genl_info *info) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 1b0e3ee4ddd..2a1c06874c4 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -3,6 +3,7 @@ #include #include +#include /** * struct genl_multicast_group - generic netlink multicast group @@ -27,6 +28,8 @@ struct genl_multicast_group * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported + * @netnsok: set to true if the family can handle network + * namespaces and should be presented in all of them * @attrbuf: buffer to store parsed attributes * @ops_list: list of all assigned operations * @family_list: family list @@ -39,6 +42,7 @@ struct genl_family char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; + bool netnsok; struct nlattr ** attrbuf; /* private */ struct list_head ops_list; /* private */ struct list_head family_list; /* private */ @@ -62,8 +66,32 @@ struct genl_info struct genlmsghdr * genlhdr; void * userhdr; struct nlattr ** attrs; +#ifdef CONFIG_NET_NS + struct net * _net; +#endif }; +#ifdef CONFIG_NET_NS +static inline struct net *genl_info_net(struct genl_info *info) +{ + return info->_net; +} + +static inline void genl_info_net_set(struct genl_info *info, struct net *net) +{ + info->_net = net; +} +#else +static inline struct net 
*genl_info_net(struct genl_info *info) +{ + return &init_net; +} + +static inline void genl_info_net_set(struct genl_info *info, struct net *net) +{ +} +#endif + /** * struct genl_ops - generic netlink operations * @cmd: command identifier @@ -98,8 +126,6 @@ extern int genl_register_mc_group(struct genl_family *family, extern void genl_unregister_mc_group(struct genl_family *family, struct genl_multicast_group *grp); -extern struct sock *genl_sock; - /** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message @@ -170,7 +196,21 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) } /** - * genlmsg_multicast - multicast a netlink message + * genlmsg_multicast_netns - multicast a netlink message to a specific netns + * @net: the net namespace + * @skb: netlink message as socket buffer + * @pid: own netlink pid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + */ +static inline int genlmsg_multicast_netns(struct net *net, struct sk_buff *skb, + u32 pid, unsigned int group, gfp_t flags) +{ + return nlmsg_multicast(net->genl_sock, skb, pid, group, flags); +} + +/** + * genlmsg_multicast - multicast a netlink message to the default netns * @skb: netlink message as socket buffer * @pid: own netlink pid to avoid sending to yourself * @group: multicast group id @@ -179,17 +219,29 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) static inline int genlmsg_multicast(struct sk_buff *skb, u32 pid, unsigned int group, gfp_t flags) { - return nlmsg_multicast(genl_sock, skb, pid, group, flags); + return genlmsg_multicast_netns(&init_net, skb, pid, group, flags); } +/** + * genlmsg_multicast_allns - multicast a netlink message to all net namespaces + * @skb: netlink message as socket buffer + * @pid: own netlink pid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + * + * This function must hold the RTNL or rcu_read_lock(). 
+ */ +int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, + unsigned int group, gfp_t flags); + /** * genlmsg_unicast - unicast a netlink message * @skb: netlink message as socket buffer * @pid: netlink pid of the destination socket */ -static inline int genlmsg_unicast(struct sk_buff *skb, u32 pid) +static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 pid) { - return nlmsg_unicast(genl_sock, skb, pid); + return nlmsg_unicast(net->genl_sock, skb, pid); } /** @@ -199,7 +251,7 @@ static inline int genlmsg_unicast(struct sk_buff *skb, u32 pid) */ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) { - return genlmsg_unicast(skb, info->snd_pid); + return genlmsg_unicast(genl_info_net(info), skb, info->snd_pid); } /** diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b34a6ee7375..3173d12d946 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -26,6 +26,7 @@ struct net_device; struct sock; struct ctl_table_header; struct net_generic; +struct sock; struct net { atomic_t count; /* To decided when the network @@ -57,6 +58,7 @@ struct net { spinlock_t rules_mod_lock; struct sock *rtnl; /* rtnetlink socket */ + struct sock *genl_sock; struct netns_core core; struct netns_mib mib; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 888adbcca30..ea8384d3caa 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * Send taskstats data in @skb to listener with nl_pid @pid */ -static int send_reply(struct sk_buff *skb, pid_t pid) +static int send_reply(struct sk_buff *skb, struct genl_info *info) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); void *reply = genlmsg_data(genlhdr); @@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid) return rc; } - return genlmsg_unicast(skb, pid); + return genlmsg_reply(skb, info); } /* @@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb, if (!skb_next) break; } - rc = genlmsg_unicast(skb_cur, s->pid); + rc = genlmsg_unicast(&init_net, skb_cur, s->pid); if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; @@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) goto err; } - rc = send_reply(rep_skb, info->snd_pid); + rc = send_reply(rep_skb, info); err: fput_light(file, fput_needed); @@ -487,7 +487,7 @@ free_return_rc: } else goto err; - return send_reply(rep_skb, info->snd_pid); + return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index 8dd7ed7e7c1..476b307bd80 100644 --- a/net/irda/irnetlink.c +++ b/net/irda/irnetlink.c @@ -115,7 +115,7 @@ static int irda_nl_get_mode(struct sk_buff *skb, struct genl_info *info) genlmsg_end(msg, hdr); - return genlmsg_unicast(msg, info->snd_pid); + return genlmsg_reply(msg, info); err_out: nlmsg_free(msg); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7c1333c67ff..2d24d81474c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3231,7 +3231,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) } genlmsg_end(msg, reply); - ret = genlmsg_unicast(msg, info->snd_pid); + ret = genlmsg_reply(msg, info); goto out; nla_put_failure: diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index eed4c6a8afc..575c6434150 100644 --- a/net/netlink/genetlink.c +++ 
b/net/netlink/genetlink.c @@ -18,8 +18,6 @@ #include #include -struct sock *genl_sock = NULL; - static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static inline void genl_lock(void) @@ -175,10 +173,31 @@ int genl_register_mc_group(struct genl_family *family, mc_groups_longs++; } - err = netlink_change_ngroups(genl_sock, - mc_groups_longs * BITS_PER_LONG); - if (err) - goto out; + if (family->netnsok) { + struct net *net; + + rcu_read_lock(); + for_each_net_rcu(net) { + err = netlink_change_ngroups(net->genl_sock, + mc_groups_longs * BITS_PER_LONG); + if (err) { + /* + * No need to roll back, can only fail if + * memory allocation fails and then the + * number of _possible_ groups has been + * increased on some sockets which is ok. + */ + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + } else { + err = netlink_change_ngroups(init_net.genl_sock, + mc_groups_longs * BITS_PER_LONG); + if (err) + goto out; + } grp->id = id; set_bit(id, mc_groups); @@ -195,8 +214,14 @@ EXPORT_SYMBOL(genl_register_mc_group); static void __genl_unregister_mc_group(struct genl_family *family, struct genl_multicast_group *grp) { + struct net *net; BUG_ON(grp->family != family); - netlink_clear_multicast_users(genl_sock, grp->id); + + rcu_read_lock(); + for_each_net_rcu(net) + netlink_clear_multicast_users(net->genl_sock, grp->id); + rcu_read_unlock(); + clear_bit(grp->id, mc_groups); list_del(&grp->list); genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, grp); @@ -467,6 +492,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { struct genl_ops *ops; struct genl_family *family; + struct net *net = sock_net(skb->sk); struct genl_info info; struct genlmsghdr *hdr = nlmsg_data(nlh); int hdrlen, err; @@ -475,6 +501,10 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (family == NULL) return -ENOENT; + /* this family doesn't exist in this netns */ + if (!family->netnsok && !net_eq(net, &init_net)) + return -ENOENT; + hdrlen = GENL_HDRLEN + family->hdrsize; if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; @@ -492,7 +522,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; genl_unlock(); - err = netlink_dump_start(genl_sock, skb, nlh, + err = netlink_dump_start(net->genl_sock, skb, nlh, ops->dumpit, ops->done); genl_lock(); return err; @@ -514,6 +544,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = family->attrbuf; + genl_info_net_set(&info, net); return ops->doit(skb, &info); } @@ -534,6 +565,7 @@ static struct genl_family genl_ctrl = { .name = "nlctrl", .version = 0x2, .maxattr = CTRL_ATTR_MAX, + .netnsok = true, }; static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, @@ -650,6 +682,7 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) int i, n = 0; struct genl_family *rt; + struct net *net = sock_net(skb->sk); int chains_to_skip = cb->args[0]; int fams_to_skip = cb->args[1]; @@ -658,6 +691,8 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) continue; n = 0; list_for_each_entry(rt, genl_family_chain(i), family_list) { + if (!rt->netnsok && !net_eq(net, &init_net)) + continue; if (++n < fams_to_skip) continue; if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid, @@ -729,6 +764,7 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) if (info->attrs[CTRL_ATTR_FAMILY_ID]) { u16 id = 
nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); res = genl_family_find_byid(id); + err = -ENOENT; } if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { @@ -736,49 +772,61 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]); res = genl_family_find_byname(name); + err = -ENOENT; } - if (res == NULL) { - err = -ENOENT; - goto errout; + if (res == NULL) + return err; + + if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) { + /* family doesn't exist here */ + return -ENOENT; } msg = ctrl_build_family_msg(res, info->snd_pid, info->snd_seq, CTRL_CMD_NEWFAMILY); - if (IS_ERR(msg)) { - err = PTR_ERR(msg); - goto errout; - } + if (IS_ERR(msg)) + return PTR_ERR(msg); - err = genlmsg_reply(msg, info); -errout: - return err; + return genlmsg_reply(msg, info); } static int genl_ctrl_event(int event, void *data) { struct sk_buff *msg; + struct genl_family *family; + struct genl_multicast_group *grp; - if (genl_sock == NULL) + /* genl is still initialising */ + if (!init_net.genl_sock) return 0; switch (event) { case CTRL_CMD_NEWFAMILY: case CTRL_CMD_DELFAMILY: - msg = ctrl_build_family_msg(data, 0, 0, event); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - genlmsg_multicast(msg, 0, GENL_ID_CTRL, GFP_KERNEL); + family = data; + msg = ctrl_build_family_msg(family, 0, 0, event); break; case CTRL_CMD_NEWMCAST_GRP: case CTRL_CMD_DELMCAST_GRP: + grp = data; + family = grp->family; msg = ctrl_build_mcgrp_msg(data, 0, 0, event); - if (IS_ERR(msg)) - return PTR_ERR(msg); - - genlmsg_multicast(msg, 0, GENL_ID_CTRL, GFP_KERNEL); break; + default: + return -EINVAL; + } + + if (IS_ERR(msg)) + return PTR_ERR(msg); + + if (!family->netnsok) { + genlmsg_multicast_netns(&init_net, msg, 0, + GENL_ID_CTRL, GFP_KERNEL); + } else { + rcu_read_lock(); + genlmsg_multicast_allns(msg, 0, GENL_ID_CTRL, GFP_ATOMIC); + rcu_read_unlock(); } return 0; @@ -795,6 +843,33 @@ static struct genl_multicast_group notify_grp = { .name = "notify", }; +static int __net_init genl_pernet_init(struct net *net) +{ + /* we'll bump the group number right afterwards */ + net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0, + genl_rcv, &genl_mutex, + THIS_MODULE); + + if (!net->genl_sock && net_eq(net, &init_net)) + panic("GENL: Cannot initialize generic netlink\n"); + + if (!net->genl_sock) + return -ENOMEM; + + return 0; +} + +static void __net_exit genl_pernet_exit(struct net *net) +{ + netlink_kernel_release(net->genl_sock); + net->genl_sock = NULL; +} + +static struct pernet_operations genl_pernet_ops = { + .init = genl_pernet_init, + .exit = genl_pernet_exit, +}; + static int __init genl_init(void) { int i, err; @@ -804,36 +879,67 @@ static int __init genl_init(void) err = genl_register_family(&genl_ctrl); if (err < 0) - goto errout; + goto problem; err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops); if (err < 0) - goto errout_register; + goto problem; netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); - /* we'll bump the group number right afterwards */ - genl_sock = netlink_kernel_create(&init_net, NETLINK_GENERIC, 0, - genl_rcv, &genl_mutex, THIS_MODULE); - if (genl_sock == NULL) - panic("GENL: Cannot initialize generic netlink\n"); + err = register_pernet_subsys(&genl_pernet_ops); + if (err) + goto problem; err = genl_register_mc_group(&genl_ctrl, ¬ify_grp); if (err < 0) - goto errout_register; + goto problem; return 0; -errout_register: - genl_unregister_family(&genl_ctrl); -errout: +problem: panic("GENL: Cannot register controller: %d\n", err); } 
subsys_initcall(genl_init); -EXPORT_SYMBOL(genl_sock); EXPORT_SYMBOL(genl_register_ops); EXPORT_SYMBOL(genl_unregister_ops); EXPORT_SYMBOL(genl_register_family); EXPORT_SYMBOL(genl_unregister_family); + +static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group, + gfp_t flags) +{ + struct sk_buff *tmp; + struct net *net, *prev = NULL; + int err; + + for_each_net_rcu(net) { + if (prev) { + tmp = skb_clone(skb, flags); + if (!tmp) { + err = -ENOMEM; + goto error; + } + err = nlmsg_multicast(prev->genl_sock, tmp, + pid, group, flags); + if (err) + goto error; + } + + prev = net; + } + + return nlmsg_multicast(prev->genl_sock, skb, pid, group, flags); + error: + kfree_skb(skb); + return err; +} + +int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, unsigned int group, + gfp_t flags) +{ + return genlmsg_mcast(skb, pid, group, flags); +} +EXPORT_SYMBOL(genlmsg_multicast_allns); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 3c57005e44d..7bda8e3d139 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -62,7 +62,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) rep_nlh = nlmsg_hdr(rep_buf); memcpy(rep_nlh, req_nlh, hdr_space); rep_nlh->nlmsg_len = rep_buf->len; - genlmsg_unicast(rep_buf, NETLINK_CB(skb).pid); + genlmsg_unicast(&init_net, rep_buf, NETLINK_CB(skb).pid); } return 0; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9deb12f73c4..2a04beba436 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -413,7 +413,7 @@ static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info) cfg80211_unlock_rdev(dev); - return genlmsg_unicast(msg, info->snd_pid); + return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); @@ -739,7 +739,7 @@ static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) dev_put(netdev); cfg80211_unlock_rdev(dev); - return genlmsg_unicast(msg, info->snd_pid); + return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); @@ -1030,7 +1030,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) goto nla_put_failure; genlmsg_end(msg, hdr); - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; nla_put_failure: @@ -1618,7 +1618,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) dev, mac_addr, &sinfo) < 0) goto out_free; - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; out_free: @@ -2087,7 +2087,7 @@ static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info) dev, dst, next_hop, &pinfo) < 0) goto out_free; - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; out_free: @@ -2436,7 +2436,7 @@ static int nl80211_get_mesh_params(struct sk_buff *skb, cur_params.dot11MeshHWMPnetDiameterTraversalTime); nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; nla_put_failure: @@ -2624,7 +2624,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) nla_nest_end(msg, nl_reg_rules); genlmsg_end(msg, hdr); - err = genlmsg_unicast(msg, info->snd_pid); + err = genlmsg_reply(msg, info); goto out; nla_put_failure: -- cgit v1.2.3-70-g09d2 From 86886e55b273f565935491816c7c96b82469d4f8 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:07 -0700 Subject: x86, intel_txt: Intel TXT Sx shutdown support Support for graceful handling of sleep states (S3/S4/S5) after an Intel(R) TXT 
launch. Without this patch, attempting to place the system in one of the ACPI sleep states (S3/S4/S5) will cause the TXT hardware to treat this as an attack and will cause a system reset, with memory locked. Not only may the subsequent memory scrub take some time, but the platform will be unable to enter the requested power state. This patch calls back into the tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag, after which it will place the system into the requested sleep state using ACPI information passed by the kernel. arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..61cc40887c4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1317,6 +1318,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index db307a356f0..8c01dd3724e 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -45,6 +45,7 @@ #include #include "accommon.h" #include "actables.h" +#include #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwsleep") @@ -342,6 +343,8 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) ACPI_FLUSH_CPU_CACHE(); + tboot_sleep(sleep_state, pm1a_control, pm1b_control); + /* Write #2: Write both SLP_TYP + SLP_EN */ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..ff071e022a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -376,7 +377,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error, num_cpus = 0; error = stop_machine_create(); if (error) @@ -391,6 +392,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + num_cpus++; error = _cpu_down(cpu, 1); if (!error) { cpumask_set_cpu(cpu, frozen_cpus); @@ -401,6 +403,9 @@ int disable_nonboot_cpus(void) break; } } + /* ensure all CPUs have gone into wait-for-SIPI */ + error |= tboot_wait_for_aps(num_cpus); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ -- cgit v1.2.3-70-g09d2 From 5a165657bef7c47e5ff4cd138f7758ef6278e87b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 13 Aug 2009 05:20:45 +0000 Subject: net: skb ftracer - Add config option to enable new ftracer (v3) skb allocation / consumption corelator - Add config option This patch adds a Kconfig option to enable the addtition of the skb source tracer. Signed-off-by: Neil Horman Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) Signed-off-by: David S. 
Miller --- kernel/trace/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 019f380fd76..f09d7635c0e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -234,6 +234,16 @@ config BOOT_TRACER You must pass in initcall_debug and ftrace=initcall to the kernel command line to enable this on bootup. +config SKB_SOURCES_TRACER + bool "Trace skb source information" + select GENERIC_TRACER + help + This tracer helps developers/sysadmins correlate skb allocation and + consumption. The idea being that some processes will primarily consume data + that was allocated on certain numa nodes. By being able to visualize which + nodes the data was allocated on, a sysadmin or developer can optimize the + scheduling of those processes to cut back on cross node chatter. + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER -- cgit v1.2.3-70-g09d2 From 9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 13 Aug 2009 05:23:56 +0000 Subject: net: skb ftracer - Add actual ftrace code to kernel (v3) skb allocation / consumption correlator Add ftracer module to kernel to print out a list that correlates a process id, an skb it read, and the numa nodes on wich the process was running when it was read along with the numa node the skbuff was allocated on. Signed-off-by: Neil Horman Makefile | 1 trace.h | 19 ++++++ trace_skb_sources.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+) Signed-off-by: David S. Miller --- kernel/trace/Makefile | 1 + kernel/trace/trace.h | 19 +++++ kernel/trace/trace_skb_sources.c | 154 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+) create mode 100644 kernel/trace/trace_skb_sources.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164dca90..ee5e5b1b928 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o endif +obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b9f4f6e955..8a6281b2330 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, + TRACE_SKB_SOURCE, __TRACE_LAST_TYPE, }; @@ -171,6 +173,21 @@ struct trace_power { struct power_trace state_data; }; +struct skb_record { + pid_t pid; /* pid of the copying process */ + int anid; /* node where skb was allocated */ + int cnid; /* node to which skb was copied in userspace */ + char ifname[IFNAMSIZ]; /* Name of the receiving interface */ + int rx_queue; /* The rx queue the skb was received on */ + int ccpu; /* Cpu the application got this frame from */ + int len; /* length of the data copied */ +}; + +struct trace_skb_event { + struct trace_entry ent; + struct skb_record event_data; +}; + enum kmemtrace_type_id { KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). 
*/ @@ -323,6 +340,8 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ + IF_ASSIGN(var, ent, struct trace_skb_event, \ + TRACE_SKB_SOURCE); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c new file mode 100644 index 00000000000..40eb071afdb --- /dev/null +++ b/kernel/trace/trace_skb_sources.c @@ -0,0 +1,154 @@ +/* + * ring buffer based tracer for analyzing per-socket skb sources + * + * Neil Horman + * Copyright (C) 2009 + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec); + +static struct trace_array *skb_trace; +static int __read_mostly trace_skb_source_enabled; + +static void probe_skb_dequeue(const struct sk_buff *skb, int len) +{ + struct ring_buffer_event *event; + struct trace_skb_event *entry; + struct trace_array *tr = skb_trace; + struct net_device *dev; + + if (!trace_skb_source_enabled) + return; + + if (in_interrupt()) + return; + + event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE, + sizeof(*entry), 0, 0); + if (!event) + return; + entry = ring_buffer_event_data(event); + + entry->event_data.pid = current->pid; + entry->event_data.anid = page_to_nid(virt_to_page(skb->data)); + entry->event_data.cnid = cpu_to_node(smp_processor_id()); + entry->event_data.len = len; + entry->event_data.rx_queue = skb->queue_mapping; + entry->event_data.ccpu = smp_processor_id(); + + dev = dev_get_by_index(sock_net(skb->sk), skb->iif); + if (dev) { + memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ); + dev_put(dev); + } else { + strcpy(entry->event_data.ifname, "Unknown"); + } + + trace_buffer_unlock_commit(tr, event, 0, 0); +} + +static int tracing_skb_source_register(void) +{ + int ret; + + ret = register_trace_skb_copy_datagram_iovec(probe_skb_dequeue); + if (ret) + pr_info("skb source trace: Couldn't activate dequeue tracepoint"); + + return ret; +} + +static void start_skb_source_trace(struct trace_array *tr) +{ + trace_skb_source_enabled = 1; +} + +static void stop_skb_source_trace(struct trace_array *tr) +{ + trace_skb_source_enabled = 0; +} + +static void skb_source_trace_reset(struct trace_array *tr) +{ + trace_skb_source_enabled = 0; + unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue); +} + + +static int skb_source_trace_init(struct trace_array *tr) +{ + int cpu; + skb_trace = tr; + + trace_skb_source_enabled = 1; + tracing_skb_source_register(); + + for_each_cpu(cpu, cpu_possible_mask) + tracing_reset(tr, cpu); + return 0; +} + +static enum print_line_t skb_source_print_line(struct trace_iterator *iter) +{ + int ret = 0; + struct trace_entry *entry = iter->ent; + struct trace_skb_event *event; + struct skb_record *record; + struct trace_seq *s = &iter->seq; + + trace_assign_type(event, entry); + record = &event->event_data; + if (entry->type != TRACE_SKB_SOURCE) + return TRACE_TYPE_UNHANDLED; + + ret = trace_seq_printf(s, " %d %d %d %s %d %d %d\n", + record->pid, + record->anid, + record->cnid, + record->ifname, + record->rx_queue, + record->ccpu, + record->len); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static void skb_source_print_header(struct seq_file *s) +{ + seq_puts(s, "# PID ANID CNID IFC RXQ CCPU LEN\n"); + seq_puts(s, "# | | | | | | |\n"); +} + +static struct tracer skb_source_tracer __read_mostly = +{ + .name = 
"skb_sources", + .init = skb_source_trace_init, + .start = start_skb_source_trace, + .stop = stop_skb_source_trace, + .reset = skb_source_trace_reset, + .print_line = skb_source_print_line, + .print_header = skb_source_print_header, +}; + +static int init_skb_source_trace(void) +{ + return register_tracer(&skb_source_tracer); +} +device_initcall(init_skb_source_trace); -- cgit v1.2.3-70-g09d2 From 3a5209e3b26386fd174820ee6e8e27479b24869a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 17 Aug 2009 16:00:30 -0700 Subject: trace_skb: fix build when CONFIG_NET is not enabled Fix trace_skb_sources build when CONFIG_NET is not enabled: kernel/built-in.o: In function `probe_skb_dequeue': trace_skb_sources.c:(.text+0xd9152): undefined reference to `init_net' trace_skb_sources.c:(.text+0xd9188): undefined reference to `dev_get_by_index' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f09d7635c0e..9a2f19bf051 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -236,6 +236,7 @@ config BOOT_TRACER config SKB_SOURCES_TRACER bool "Trace skb source information" + depends on NET select GENERIC_TRACER help This tracer helps developers/sysadmins correlate skb allocation and -- cgit v1.2.3-70-g09d2 From de481560eb0bd9d940b90311eba85711e4b1150b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Apr 2009 12:32:04 -0400 Subject: kconfig: keep config.gz around even if CONFIG_IKCONFIG_PROC is not set If CONFIG_IKCONFIG is set but CONFIG_IKCONFIG_PROC is not, then gcc will optimize the config.gz out, because nobody uses it. This patch adds "__used" to the config.gz data to keep it around so that code like extract-ikconfig can still find it. [ Impact: allow extract-ikconfig to find config.gz ] Signed-off-by: Steven Rostedt --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2093a691f1c..d0c84e6bf50 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -119,7 +119,7 @@ $(obj)/config_data.gz: .config FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ - cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ + cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call if_changed,ikconfiggz) -- cgit v1.2.3-70-g09d2 From a15098c90df1ac2b1bfe1d33dd1c47063213aa9a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 9 Aug 2009 19:02:51 +0000 Subject: powerpc: Enable GCOV Make it possible to enable GCOV code coverage measurement on powerpc. Lightly tested on 64-bit, seems to work as expected. 
Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/Makefile | 7 +++++++ arch/powerpc/kernel/vdso32/Makefile | 1 + arch/powerpc/kernel/vdso64/Makefile | 2 ++ arch/powerpc/xmon/Makefile | 2 ++ kernel/gcov/Kconfig | 2 +- 5 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 035946f9d5f..720e8c3b8e9 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -115,6 +115,13 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),) obj-y += ppc_save_regs.o endif +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_prom_init.o := n +GCOV_PROFILE_ftrace.o := n +GCOV_PROFILE_machine_kexec_64.o := n +GCOV_PROFILE_machine_kexec_32.o := n +GCOV_PROFILE_kprobes.o := n + extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index c3d57bd01a8..b54b8168813 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -12,6 +12,7 @@ endif targets := $(obj-vdso32) vdso32.so vdso32.so.dbg obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) +GCOV_PROFILE := n EXTRA_CFLAGS := -shared -fno-common -fno-builtin EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index fa7f1b8f3e5..dd0c8e93677 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -7,6 +7,8 @@ obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o targets := $(obj-vdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) +GCOV_PROFILE := n + EXTRA_CFLAGS := -shared -fno-common -fno-builtin EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ $(call ld-option, -Wl$(comma)--hash-style=sysv) diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 85ab97ab840..faa81b6a661 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -2,6 +2,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror +GCOV_PROFILE := n + ifdef CONFIG_PPC64 EXTRA_CFLAGS += -mno-minimal-toc endif diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 22e9dcfaa3d..654efd09f6a 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 + depends on S390 || X86 || (PPC && EXPERIMENTAL) default n ---help--- This option activates profiling for the entire kernel. -- cgit v1.2.3-70-g09d2 From 269c861baa2fe7c114c3bc7831292758d29eb336 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:35 -0700 Subject: generic-ipi: Allow cpus not yet online to call smp_call_function with irqs disabled Because of deadlock possibilities, smp_call_function() is not allowed to be called with interrupts disabled. Add an exception for a cpu that is not yet online: no one else can send an smp call function interrupt to such a cpu, so the deadlock condition is not possible there. Signed-off-by: Suresh Siddha Acked-by: Nick Piggin Signed-off-by: H.
Peter Anvin --- kernel/smp.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ad63d850120..2accdf6712e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -176,6 +176,11 @@ void generic_smp_call_function_interrupt(void) struct call_function_data *data; int cpu = get_cpu(); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(cpu)); + /* * Ensure entry is visible on call_function_queue after we have * entered the IPI. See comment in smp_call_function_many. @@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void) unsigned int data_flags; LIST_HEAD(list); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(smp_processor_id())); + spin_lock(&q->lock); list_replace_init(&q->list, &list); spin_unlock(&q->lock); @@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, */ this_cpu = get_cpu(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpus that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, { csd_lock(data); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpus that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + && !oops_in_progress); generic_exec_single(cpu, data, wait); } @@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask, unsigned long flags; int cpu, next_cpu, this_cpu = smp_processor_id(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpus that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); -- cgit v1.2.3-70-g09d2 From d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:36 -0700 Subject: x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init SDM Vol 3a, in the section titled "MTRR considerations in MP systems", specifies the need for synchronizing the logical cpus while initializing/updating MTRR. Currently the Linux kernel does the synchronization of all cpus only when a single MTRR register is programmed/updated. During an AP online (during boot/cpu-online/resume), where we initialize all the MTRR/PAT registers, we don't follow this synchronization algorithm.
This can lead to scenarios where, during a dynamic cpu online, the logical cpu coming online is initializing MTRR/PAT with cache disabled (cr0.cd=1) while the other logical HT sibling continues to run (also with cache disabled because of cr0.cd=1 on its sibling). Starting from Westmere, VMX transitions with cr0.cd=1 don't work properly (because of some VMX performance optimizations), and the above scenario (with one logical cpu doing VMX activity and another logical cpu coming online) can result in a system crash. Fix the MTRR initialization by doing a rendezvous of all the cpus. During boot and resume, we delay the MTRR/PAT init for APs till all the logical cpus come online; the rendezvous process at the end of the APs' bringup will then initialize the MTRR/PAT for all APs. For a dynamic single cpu online, we synchronize all the logical cpus and do the MTRR/PAT init on the AP that is coming online. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mtrr.h | 7 +++++++ arch/x86/kernel/cpu/mtrr/main.c | 46 +++++++++++++++++++++++++++++++++-------- arch/x86/kernel/smpboot.c | 14 +++++++++++++ arch/x86/power/cpu.c | 2 +- kernel/cpu.c | 14 +++++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a51ada8467d..d5366ec5cb8 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -121,8 +121,12 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); +extern void set_mtrr_aps_delayed_init(void); +extern void mtrr_aps_init(void); +extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -161,6 +165,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) +#define set_mtrr_aps_delayed_init() do {} while (0) +#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_restore() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7af0f88a416..7339be0aa58 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +u32 mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -163,7 +164,10 @@ static void ipi_handler(void *info) if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else { + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs in addition to the synchronisation.
+ */ mtrr_if->set_all(); } @@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); /* Wait for the others */ while (atomic_read(&data.count)) @@ -721,9 +727,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries @@ -738,11 +742,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -753,6 +753,34 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = 1; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = 0; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..d720b7e0cf3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1116,9 +1116,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1140,6 +1153,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b3d20b9cac6..417c9f5b4af 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -242,7 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); - mtrr_ap_init(); + mtrr_bp_restore(); #ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..f5f9485b8c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +432,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +443,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); -- cgit v1.2.3-70-g09d2 From 5e928f77a09a07f9dd595bb8a489965d69a83458 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Aug 2009 23:38:32 +0200 Subject: PM: Introduce core framework for run-time PM of I/O devices (rev. 
17) Introduce a core framework for run-time power management of I/O devices. Add device run-time PM fields to 'struct dev_pm_info' and device run-time PM callbacks to 'struct dev_pm_ops'. Introduce a run-time PM workqueue and define some device run-time PM helper functions at the core level. Document all these things. Special thanks to Alan Stern for his help with the design and multiple detailed reviews of the preceding versions of this patch, and to Magnus Damm for testing feedback. Signed-off-by: Rafael J. Wysocki Acked-by: Magnus Damm --- Documentation/power/runtime_pm.txt | 378 ++++++++++++++ drivers/base/dd.c | 11 + drivers/base/power/Makefile | 1 + drivers/base/power/main.c | 22 +- drivers/base/power/power.h | 31 +- drivers/base/power/runtime.c | 1011 ++++++++++++++++++++++++++++++++++++ include/linux/pm.h | 101 +++- include/linux/pm_runtime.h | 114 ++++ kernel/power/Kconfig | 14 + kernel/power/main.c | 17 + 10 files changed, 1689 insertions(+), 11 deletions(-) create mode 100644 Documentation/power/runtime_pm.txt create mode 100644 drivers/base/power/runtime.c create mode 100644 include/linux/pm_runtime.h (limited to 'kernel') diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt new file mode 100644 index 00000000000..f49a33b704d --- /dev/null +++ b/Documentation/power/runtime_pm.txt @@ -0,0 +1,378 @@ +Run-time Power Management Framework for I/O Devices + +(C) 2009 Rafael J. Wysocki , Novell Inc. + +1. Introduction + +Support for run-time power management (run-time PM) of I/O devices is provided +at the power management core (PM core) level by means of: + +* The power management workqueue pm_wq in which bus types and device drivers can + put their PM-related work items. It is strongly recommended that pm_wq be + used for queuing all work items related to run-time PM, because this allows + them to be synchronized with system-wide power transitions (suspend to RAM, + hibernation and resume from system sleep states). pm_wq is declared in + include/linux/pm_runtime.h and defined in kernel/power/main.c. + +* A number of run-time PM fields in the 'power' member of 'struct device' (which + is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can + be used for synchronizing run-time PM operations with one another. + +* Three device run-time PM callbacks in 'struct dev_pm_ops' (defined in + include/linux/pm.h). + +* A set of helper functions defined in drivers/base/power/runtime.c that can be + used for carrying out run-time PM operations in such a way that the + synchronization between them is taken care of by the PM core. Bus types and + device drivers are encouraged to use these functions. + +The run-time PM callbacks present in 'struct dev_pm_ops', the device run-time PM +fields of 'struct dev_pm_info' and the core helper functions provided for +run-time PM are described below. + +2. Device Run-time PM Callbacks + +There are three device run-time PM callbacks defined in 'struct dev_pm_ops': + +struct dev_pm_ops { + ... + int (*runtime_suspend)(struct device *dev); + int (*runtime_resume)(struct device *dev); + void (*runtime_idle)(struct device *dev); + ... +}; + +The ->runtime_suspend() callback is executed by the PM core for the bus type of +the device being suspended.
The bus type's callback is then _entirely_ +_responsible_ for handling the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_suspend() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_suspend() +callback in a device driver as long as the bus type's ->runtime_suspend() knows +what to do to handle the device). + + * Once the bus type's ->runtime_suspend() callback has completed successfully + for given device, the PM core regards the device as suspended, which need + not mean that the device has been put into a low power state. It is + supposed to mean, however, that the device will not process data and will + not communicate with the CPU(s) and RAM until its bus type's + ->runtime_resume() callback is executed for it. The run-time PM status of + a device after successful execution of its bus type's ->runtime_suspend() + callback is 'suspended'. + + * If the bus type's ->runtime_suspend() callback returns -EBUSY or -EAGAIN, + the device's run-time PM status is supposed to be 'active', which means that + the device _must_ be fully operational afterwards. + + * If the bus type's ->runtime_suspend() callback returns an error code + different from -EBUSY or -EAGAIN, the PM core regards this as a fatal + error and will refuse to run the helper functions described in Section 4 + for the device, until the status of it is directly set either to 'active' + or to 'suspended' (the PM core provides special helper functions for this + purpose). + +In particular, if the driver requires remote wakeup capability for proper +functioning and device_may_wakeup() returns 'false' for the device, then +->runtime_suspend() should return -EBUSY. On the other hand, if +device_may_wakeup() returns 'true' for the device and the device is put +into a low power state during the execution of its bus type's +->runtime_suspend(), it is expected that remote wake-up (i.e. hardware mechanism +allowing the device to request a change of its power state, such as PCI PME) +will be enabled for the device. Generally, remote wake-up should be enabled +for all input devices put into a low power state at run time. + +The ->runtime_resume() callback is executed by the PM core for the bus type of +the device being woken up. The bus type's callback is then _entirely_ +_responsible_ for handling the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_resume() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_resume() +callback in a device driver as long as the bus type's ->runtime_resume() knows +what to do to handle the device). + + * Once the bus type's ->runtime_resume() callback has completed successfully, + the PM core regards the device as fully operational, which means that the + device _must_ be able to complete I/O operations as needed. The run-time + PM status of the device is then 'active'. + + * If the bus type's ->runtime_resume() callback returns an error code, the PM + core regards this as a fatal error and will refuse to run the helper + functions described in Section 4 for the device, until its status is + directly set either to 'active' or to 'suspended' (the PM core provides + special helper functions for this purpose). 
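+
+For illustration only, a bus type that simply delegates these two operations to
+the device driver might implement the callbacks along the following lines (a
+minimal sketch; the mybus_* names are hypothetical and not part of this
+framework):
+
+static int mybus_runtime_suspend(struct device *dev)
+{
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+
+	/* Let the driver decide; -EBUSY/-EAGAIN keep the device 'active'. */
+	if (pm && pm->runtime_suspend)
+		return pm->runtime_suspend(dev);
+
+	/* No driver callback: nothing to do, report success ('suspended'). */
+	return 0;
+}
+
+static int mybus_runtime_resume(struct device *dev)
+{
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+
+	if (pm && pm->runtime_resume)
+		return pm->runtime_resume(dev);
+
+	/* No driver callback: the device is regarded as fully operational. */
+	return 0;
+}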
+ +The ->runtime_idle() callback is executed by the PM core for the bus type of +given device whenever the device appears to be idle, which is indicated to the +PM core by two counters, the device's usage counter and the counter of 'active' +children of the device. + + * If any of these counters is decreased using a helper function provided by + the PM core and it turns out to be equal to zero, the other counter is + checked. If that counter also is equal to zero, the PM core executes the + device bus type's ->runtime_idle() callback (with the device as an + argument). + +The action performed by a bus type's ->runtime_idle() callback is totally +dependent on the bus type in question, but the expected and recommended action +is to check if the device can be suspended (i.e. if all of the conditions +necessary for suspending the device are satisfied) and to queue up a suspend +request for the device in that case. + +The helper functions provided by the PM core, described in Section 4, guarantee +that the following constraints are met with respect to the bus type's run-time +PM callbacks: + +(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute + ->runtime_suspend() in parallel with ->runtime_resume() or with another + instance of ->runtime_suspend() for the same device) with the exception that + ->runtime_suspend() or ->runtime_resume() can be executed in parallel with + ->runtime_idle() (although ->runtime_idle() will not be started while any + of the other callbacks is being executed for the same device). + +(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' + devices (i.e. the PM core will only execute ->runtime_idle() or + ->runtime_suspend() for the devices the run-time PM status of which is + 'active'). + +(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device + the usage counter of which is equal to zero _and_ either the counter of + 'active' children of which is equal to zero, or the 'power.ignore_children' + flag of which is set. + +(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the + PM core will only execute ->runtime_resume() for the devices the run-time + PM status of which is 'suspended'). + +Additionally, the helper functions provided by the PM core obey the following +rules: + + * If ->runtime_suspend() is about to be executed or there's a pending request + to execute it, ->runtime_idle() will not be executed for the same device. + + * A request to execute or to schedule the execution of ->runtime_suspend() + will cancel any pending requests to execute ->runtime_idle() for the same + device. + + * If ->runtime_resume() is about to be executed or there's a pending request + to execute it, the other callbacks will not be executed for the same device. + + * A request to execute ->runtime_resume() will cancel any pending or + scheduled requests to execute the other callbacks for the same device. + +3. Run-time PM Device Fields + +The following device run-time PM fields are present in 'struct dev_pm_info', as +defined in include/linux/pm.h: + + struct timer_list suspend_timer; + - timer used for scheduling (delayed) suspend request + + unsigned long timer_expires; + - timer expiration time, in jiffies (if this is different from zero, the + timer is running and will expire at that time, otherwise the timer is not + running) + + struct work_struct work; + - work structure used for queuing up requests (i.e. 
work items in pm_wq) + + wait_queue_head_t wait_queue; + - wait queue used if any of the helper functions needs to wait for another + one to complete + + spinlock_t lock; + - lock used for synchronisation + + atomic_t usage_count; + - the usage counter of the device + + atomic_t child_count; + - the count of 'active' children of the device + + unsigned int ignore_children; + - if set, the value of child_count is ignored (but still updated) + + unsigned int disable_depth; + - used for disabling the helper functions (they work normally if this is + equal to zero); the initial value of it is 1 (i.e. run-time PM is + initially disabled for all devices) + + unsigned int runtime_error; + - if set, there was a fatal error (one of the callbacks returned an error + code as described in Section 2), so the helper functions will not work + until this flag is cleared; this is the error code returned by the failing + callback + + unsigned int idle_notification; + - if set, ->runtime_idle() is being executed + + unsigned int request_pending; + - if set, there's a pending request (i.e. a work item queued up into pm_wq) + + enum rpm_request request; + - type of request that's pending (valid if request_pending is set) + + unsigned int deferred_resume; + - set if ->runtime_resume() is about to be run while ->runtime_suspend() is + being executed for that device and it is not practical to wait for the + suspend to complete; means "start a resume as soon as you've suspended" + + enum rpm_status runtime_status; + - the run-time PM status of the device; this field's initial value is + RPM_SUSPENDED, which means that each device is initially regarded by the + PM core as 'suspended', regardless of its real hardware status + +All of the above fields are members of the 'power' member of 'struct device'. + +4.
Run-time PM Device Helper Functions + +The following run-time PM helper functions are defined in +drivers/base/power/runtime.c and include/linux/pm_runtime.h: + + void pm_runtime_init(struct device *dev); + - initialize the device run-time PM fields in 'struct dev_pm_info' + + void pm_runtime_remove(struct device *dev); + - make sure that the run-time PM of the device will be disabled after + removing the device from device hierarchy + + int pm_runtime_idle(struct device *dev); + - execute ->runtime_idle() for the device's bus type; returns 0 on success + or error code on failure, where -EINPROGRESS means that ->runtime_idle() + is already being executed + + int pm_runtime_suspend(struct device *dev); + - execute ->runtime_suspend() for the device's bus type; returns 0 on + success, 1 if the device's run-time PM status was already 'suspended', or + error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt + to suspend the device again in future + + int pm_runtime_resume(struct device *dev); + - execute ->runtime_resume() for the device's bus type; returns 0 on + success, 1 if the device's run-time PM status was already 'active' or + error code on failure, where -EAGAIN means it may be safe to attempt to + resume the device again in future, but 'power.runtime_error' should be + checked additionally + + int pm_request_idle(struct device *dev); + - submit a request to execute ->runtime_idle() for the device's bus type + (the request is represented by a work item in pm_wq); returns 0 on success + or error code if the request has not been queued up + + int pm_schedule_suspend(struct device *dev, unsigned int delay); + - schedule the execution of ->runtime_suspend() for the device's bus type + in future, where 'delay' is the time to wait before queuing up a suspend + work item in pm_wq, in milliseconds (if 'delay' is zero, the work item is + queued up immediately); returns 0 on success, 1 if the device's PM + run-time status was already 'suspended', or error code if the request + hasn't been scheduled (or queued up if 'delay' is 0); if the execution of + ->runtime_suspend() is already scheduled and not yet expired, the new + value of 'delay' will be used as the time to wait + + int pm_request_resume(struct device *dev); + - submit a request to execute ->runtime_resume() for the device's bus type + (the request is represented by a work item in pm_wq); returns 0 on + success, 1 if the device's run-time PM status was already 'active', or + error code if the request hasn't been queued up + + void pm_runtime_get_noresume(struct device *dev); + - increment the device's usage counter + + int pm_runtime_get(struct device *dev); + - increment the device's usage counter, run pm_request_resume(dev) and + return its result + + int pm_runtime_get_sync(struct device *dev); + - increment the device's usage counter, run pm_runtime_resume(dev) and + return its result + + void pm_runtime_put_noidle(struct device *dev); + - decrement the device's usage counter + + int pm_runtime_put(struct device *dev); + - decrement the device's usage counter, run pm_request_idle(dev) and return + its result + + int pm_runtime_put_sync(struct device *dev); + - decrement the device's usage counter, run pm_runtime_idle(dev) and return + its result + + void pm_runtime_enable(struct device *dev); + - enable the run-time PM helper functions to run the device bus type's + run-time PM callbacks described in Section 2 + + int pm_runtime_disable(struct device *dev); + - prevent the run-time PM helper functions from 
running the device bus + type's run-time PM callbacks, make sure that all of the pending run-time + PM operations on the device are either completed or canceled; returns + 1 if there was a resume request pending and it was necessary to execute + ->runtime_resume() for the device's bus type to satisfy that request, + otherwise 0 is returned + + void pm_suspend_ignore_children(struct device *dev, bool enable); + - set/unset the power.ignore_children flag of the device + + int pm_runtime_set_active(struct device *dev); + - clear the device's 'power.runtime_error' flag, set the device's run-time + PM status to 'active' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero); it will fail and return error code if the device has a parent + which is not active and the 'power.ignore_children' flag of which is unset + + void pm_runtime_set_suspended(struct device *dev); + - clear the device's 'power.runtime_error' flag, set the device's run-time + PM status to 'suspended' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero) + +It is safe to execute the following helper functions from interrupt context: + +pm_request_idle() +pm_schedule_suspend() +pm_request_resume() +pm_runtime_get_noresume() +pm_runtime_get() +pm_runtime_put_noidle() +pm_runtime_put() +pm_suspend_ignore_children() +pm_runtime_set_active() +pm_runtime_set_suspended() +pm_runtime_enable() + +5. Run-time PM Initialization, Device Probing and Removal + +Initially, run-time PM is disabled for all devices, which means that the +majority of the run-time PM helper functions described in Section 4 will return +-EAGAIN until pm_runtime_enable() is called for the device. + +In addition to that, the initial run-time PM status of all devices is +'suspended', but it need not reflect the actual physical state of the device. +Thus, if the device is initially active (i.e. it is able to process I/O), its +run-time PM status must be changed to 'active', with the help of +pm_runtime_set_active(), before pm_runtime_enable() is called for the device. + +However, if the device has a parent and the parent's run-time PM is enabled, +calling pm_runtime_set_active() for the device will affect the parent, unless +the parent's 'power.ignore_children' flag is set. Namely, in that case the +parent won't be able to suspend at run time, using the PM core's helper +functions, as long as the child's status is 'active', even if the child's +run-time PM is still disabled (i.e. pm_runtime_enable() hasn't been called for +the child yet or pm_runtime_disable() has been called for it). For this reason, +once pm_runtime_set_active() has been called for the device, pm_runtime_enable() +should be called for it too as soon as reasonably possible or its run-time PM +status should be changed back to 'suspended' with the help of +pm_runtime_set_suspended(). + +If the default initial run-time PM status of the device (i.e. 'suspended') +reflects the actual state of the device, its bus type's or its driver's +->probe() callback will likely need to wake it up using one of the PM core's +helper functions described in Section 4. In that case, pm_runtime_resume() +should be used. Of course, for this purpose the device's run-time PM has to be +enabled earlier by calling pm_runtime_enable().
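+
+To illustrate, a ->probe() routine for a device that is powered up at probe
+time might follow this pattern (a minimal sketch; mydrv_probe() and
+mydrv_hw_init() are hypothetical names used only for this example):
+
+static int mydrv_probe(struct device *dev)
+{
+	int error;
+
+	error = mydrv_hw_init(dev);	/* bring the hardware up */
+	if (error)
+		return error;
+
+	/*
+	 * The device is now physically active, so make its run-time PM
+	 * status agree with that before enabling run-time PM for it.
+	 */
+	pm_runtime_set_active(dev);
+	pm_runtime_enable(dev);
+
+	return 0;
+}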
+ +If the device bus type's or driver's ->probe() or ->remove() callback runs +pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts, +they will fail returning -EAGAIN, because the device's usage counter is +incremented by the core before executing ->probe() and ->remove(). Still, it +may be desirable to suspend the device as soon as ->probe() or ->remove() has +finished, so the PM core uses pm_runtime_idle_sync() to invoke the device bus +type's ->runtime_idle() callback at that time. diff --git a/drivers/base/dd.c b/drivers/base/dd.c index f0106875f01..7b34b3a48f6 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "base.h" #include "power/power.h" @@ -202,7 +203,10 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); + pm_runtime_get_noresume(dev); + pm_runtime_barrier(dev); ret = really_probe(dev, drv); + pm_runtime_put_sync(dev); return ret; } @@ -245,7 +249,9 @@ int device_attach(struct device *dev) ret = 0; } } else { + pm_runtime_get_noresume(dev); ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach); + pm_runtime_put_sync(dev); } up(&dev->sem); return ret; @@ -306,6 +312,9 @@ static void __device_release_driver(struct device *dev) drv = dev->driver; if (drv) { + pm_runtime_get_noresume(dev); + pm_runtime_barrier(dev); + driver_sysfs_remove(dev); if (dev->bus) @@ -324,6 +333,8 @@ static void __device_release_driver(struct device *dev) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBOUND_DRIVER, dev); + + pm_runtime_put_sync(dev); } } diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 911208b8925..3ce3519e8f3 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_PM) += sysfs.o obj-$(CONFIG_PM_SLEEP) += main.o +obj-$(CONFIG_PM_RUNTIME) += runtime.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 1b1a786b7de..86990011277 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,16 @@ static DEFINE_MUTEX(dpm_list_mtx); */ static bool transition_started; +/** + * device_pm_init - Initialize the PM-related part of a device object + * @dev: Device object being initialized. + */ +void device_pm_init(struct device *dev) +{ + dev->power.status = DPM_ON; + pm_runtime_init(dev); +} + /** * device_pm_lock - lock the list of active devices used by the PM core */ @@ -105,6 +116,7 @@ void device_pm_remove(struct device *dev) mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); mutex_unlock(&dpm_list_mtx); + pm_runtime_remove(dev); } /** @@ -512,6 +524,7 @@ static void dpm_complete(pm_message_t state) mutex_unlock(&dpm_list_mtx); device_complete(dev, state); + pm_runtime_put_noidle(dev); mutex_lock(&dpm_list_mtx); } @@ -757,7 +770,14 @@ static int dpm_prepare(pm_message_t state) dev->power.status = DPM_PREPARING; mutex_unlock(&dpm_list_mtx); - error = device_prepare(dev, state); + pm_runtime_get_noresume(dev); + if (pm_runtime_barrier(dev) && device_may_wakeup(dev)) { + /* Wake-up requested during system sleep transition. 
*/ + pm_runtime_put_noidle(dev); + error = -EBUSY; + } else { + error = device_prepare(dev, state); + } mutex_lock(&dpm_list_mtx); if (error) { diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index c7cb4fc3735..b8fa1aa5225 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -1,7 +1,14 @@ -static inline void device_pm_init(struct device *dev) -{ - dev->power.status = DPM_ON; -} +#ifdef CONFIG_PM_RUNTIME + +extern void pm_runtime_init(struct device *dev); +extern void pm_runtime_remove(struct device *dev); + +#else /* !CONFIG_PM_RUNTIME */ + +static inline void pm_runtime_init(struct device *dev) {} +static inline void pm_runtime_remove(struct device *dev) {} + +#endif /* !CONFIG_PM_RUNTIME */ #ifdef CONFIG_PM_SLEEP @@ -16,23 +23,33 @@ static inline struct device *to_device(struct list_head *entry) return container_of(entry, struct device, power.entry); } +extern void device_pm_init(struct device *dev); extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); -#else /* CONFIG_PM_SLEEP */ +#else /* !CONFIG_PM_SLEEP */ + +static inline void device_pm_init(struct device *dev) +{ + pm_runtime_init(dev); +} + +static inline void device_pm_remove(struct device *dev) +{ + pm_runtime_remove(dev); +} static inline void device_pm_add(struct device *dev) {} -static inline void device_pm_remove(struct device *dev) {} static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} -#endif +#endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c new file mode 100644 index 00000000000..38556f6cc22 --- /dev/null +++ b/drivers/base/power/runtime.c @@ -0,0 +1,1011 @@ +/* + * drivers/base/power/runtime.c - Helper functions for device run-time PM + * + * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include + +static int __pm_runtime_resume(struct device *dev, bool from_wq); +static int __pm_request_idle(struct device *dev); +static int __pm_request_resume(struct device *dev); + +/** + * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. + * @dev: Device to handle. + */ +static void pm_runtime_deactivate_timer(struct device *dev) +{ + if (dev->power.timer_expires > 0) { + del_timer(&dev->power.suspend_timer); + dev->power.timer_expires = 0; + } +} + +/** + * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. + * @dev: Device to handle. + */ +static void pm_runtime_cancel_pending(struct device *dev) +{ + pm_runtime_deactivate_timer(dev); + /* + * In case there's a request pending, make sure its work function will + * return without doing anything. + */ + dev->power.request = RPM_REQ_NONE; +} + +/** + * __pm_runtime_idle - Notify device bus type if the device can be suspended. + * @dev: Device to notify the bus type about. + * + * This function must be called under dev->power.lock with interrupts disabled. 
+ */ +static int __pm_runtime_idle(struct device *dev) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + int retval = 0; + + dev_dbg(dev, "__pm_runtime_idle()!\n"); + + if (dev->power.runtime_error) + retval = -EINVAL; + else if (dev->power.idle_notification) + retval = -EINPROGRESS; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0 + || dev->power.runtime_status != RPM_ACTIVE) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + if (dev->power.request_pending) { + /* + * If an idle notification request is pending, cancel it. Any + * other pending request takes precedence over us. + */ + if (dev->power.request == RPM_REQ_IDLE) { + dev->power.request = RPM_REQ_NONE; + } else if (dev->power.request != RPM_REQ_NONE) { + retval = -EAGAIN; + goto out; + } + } + + dev->power.idle_notification = true; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_idle) { + spin_unlock_irq(&dev->power.lock); + + dev->bus->pm->runtime_idle(dev); + + spin_lock_irq(&dev->power.lock); + } + + dev->power.idle_notification = false; + wake_up_all(&dev->power.wait_queue); + + out: + dev_dbg(dev, "__pm_runtime_idle() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_idle - Notify device bus type if the device can be suspended. + * @dev: Device to notify the bus type about. + */ +int pm_runtime_idle(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_idle(dev); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_idle); + +/** + * __pm_runtime_suspend - Carry out run-time suspend of given device. + * @dev: Device to suspend. + * @from_wq: If set, the function has been called via pm_wq. + * + * Check if the device can be suspended and run the ->runtime_suspend() callback + * provided by its bus type. If another suspend has been started earlier, wait + * for it to finish. If an idle notification or suspend request is pending or + * scheduled, cancel it. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +int __pm_runtime_suspend(struct device *dev, bool from_wq) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + struct device *parent = NULL; + bool notify = false; + int retval = 0; + + dev_dbg(dev, "__pm_runtime_suspend()%s!\n", + from_wq ? " from workqueue" : ""); + + repeat: + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + /* Pending resume requests take precedence over us. */ + if (dev->power.request_pending + && dev->power.request == RPM_REQ_RESUME) { + retval = -EAGAIN; + goto out; + } + + /* Other scheduled or pending requests need to be canceled. */ + pm_runtime_cancel_pending(dev); + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (dev->power.runtime_status == RPM_RESUMING + || dev->power.disable_depth > 0 + || atomic_read(&dev->power.usage_count) > 0) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + if (dev->power.runtime_status == RPM_SUSPENDING) { + DEFINE_WAIT(wait); + + if (from_wq) { + retval = -EINPROGRESS; + goto out; + } + + /* Wait for the other suspend running in parallel with us. 
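+	 * Sleep on power.wait_queue until runtime_status changes and
+	 * then retry the whole check from the 'repeat' label.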
*/ + for (;;) { + prepare_to_wait(&dev->power.wait_queue, &wait, + TASK_UNINTERRUPTIBLE); + if (dev->power.runtime_status != RPM_SUSPENDING) + break; + + spin_unlock_irq(&dev->power.lock); + + schedule(); + + spin_lock_irq(&dev->power.lock); + } + finish_wait(&dev->power.wait_queue, &wait); + goto repeat; + } + + dev->power.runtime_status = RPM_SUSPENDING; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) { + spin_unlock_irq(&dev->power.lock); + + retval = dev->bus->pm->runtime_suspend(dev); + + spin_lock_irq(&dev->power.lock); + dev->power.runtime_error = retval; + } else { + retval = -ENOSYS; + } + + if (retval) { + dev->power.runtime_status = RPM_ACTIVE; + pm_runtime_cancel_pending(dev); + dev->power.deferred_resume = false; + + if (retval == -EAGAIN || retval == -EBUSY) { + notify = true; + dev->power.runtime_error = 0; + } + } else { + dev->power.runtime_status = RPM_SUSPENDED; + + if (dev->parent) { + parent = dev->parent; + atomic_add_unless(&parent->power.child_count, -1, 0); + } + } + wake_up_all(&dev->power.wait_queue); + + if (dev->power.deferred_resume) { + dev->power.deferred_resume = false; + __pm_runtime_resume(dev, false); + retval = -EAGAIN; + goto out; + } + + if (notify) + __pm_runtime_idle(dev); + + if (parent && !parent->power.ignore_children) { + spin_unlock_irq(&dev->power.lock); + + pm_request_idle(parent); + + spin_lock_irq(&dev->power.lock); + } + + out: + dev_dbg(dev, "__pm_runtime_suspend() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_suspend - Carry out run-time suspend of given device. + * @dev: Device to suspend. + */ +int pm_runtime_suspend(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_suspend(dev, false); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_suspend); + +/** + * __pm_runtime_resume - Carry out run-time resume of given device. + * @dev: Device to resume. + * @from_wq: If set, the function has been called via pm_wq. + * + * Check if the device can be woken up and run the ->runtime_resume() callback + * provided by its bus type. If another resume has been started earlier, wait + * for it to finish. If there's a suspend running in parallel with this + * function, wait for it to finish and resume the device. Cancel any scheduled + * or pending requests. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +int __pm_runtime_resume(struct device *dev, bool from_wq) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + struct device *parent = NULL; + int retval = 0; + + dev_dbg(dev, "__pm_runtime_resume()%s!\n", + from_wq ? " from workqueue" : ""); + + repeat: + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + pm_runtime_cancel_pending(dev); + + if (dev->power.runtime_status == RPM_ACTIVE) + retval = 1; + else if (dev->power.disable_depth > 0) + retval = -EAGAIN; + if (retval) + goto out; + + if (dev->power.runtime_status == RPM_RESUMING + || dev->power.runtime_status == RPM_SUSPENDING) { + DEFINE_WAIT(wait); + + if (from_wq) { + if (dev->power.runtime_status == RPM_SUSPENDING) + dev->power.deferred_resume = true; + retval = -EINPROGRESS; + goto out; + } + + /* Wait for the operation carried out in parallel with us. 
*/ + for (;;) { + prepare_to_wait(&dev->power.wait_queue, &wait, + TASK_UNINTERRUPTIBLE); + if (dev->power.runtime_status != RPM_RESUMING + && dev->power.runtime_status != RPM_SUSPENDING) + break; + + spin_unlock_irq(&dev->power.lock); + + schedule(); + + spin_lock_irq(&dev->power.lock); + } + finish_wait(&dev->power.wait_queue, &wait); + goto repeat; + } + + if (!parent && dev->parent) { + /* + * Increment the parent's resume counter and resume it if + * necessary. + */ + parent = dev->parent; + spin_unlock_irq(&dev->power.lock); + + pm_runtime_get_noresume(parent); + + spin_lock_irq(&parent->power.lock); + /* + * We can resume if the parent's run-time PM is disabled or it + * is set to ignore children. + */ + if (!parent->power.disable_depth + && !parent->power.ignore_children) { + __pm_runtime_resume(parent, false); + if (parent->power.runtime_status != RPM_ACTIVE) + retval = -EBUSY; + } + spin_unlock_irq(&parent->power.lock); + + spin_lock_irq(&dev->power.lock); + if (retval) + goto out; + goto repeat; + } + + dev->power.runtime_status = RPM_RESUMING; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) { + spin_unlock_irq(&dev->power.lock); + + retval = dev->bus->pm->runtime_resume(dev); + + spin_lock_irq(&dev->power.lock); + dev->power.runtime_error = retval; + } else { + retval = -ENOSYS; + } + + if (retval) { + dev->power.runtime_status = RPM_SUSPENDED; + pm_runtime_cancel_pending(dev); + } else { + dev->power.runtime_status = RPM_ACTIVE; + if (parent) + atomic_inc(&parent->power.child_count); + } + wake_up_all(&dev->power.wait_queue); + + if (!retval) + __pm_request_idle(dev); + + out: + if (parent) { + spin_unlock_irq(&dev->power.lock); + + pm_runtime_put(parent); + + spin_lock_irq(&dev->power.lock); + } + + dev_dbg(dev, "__pm_runtime_resume() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_resume - Carry out run-time resume of given device. + * @dev: Device to resume. + */ +int pm_runtime_resume(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_resume(dev, false); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_resume); + +/** + * pm_runtime_work - Universal run-time PM work function. + * @work: Work structure used for scheduling the execution of this function. + * + * Use @work to get the device object the work is to be done for, determine what + * is to be done and execute the appropriate run-time PM function. + */ +static void pm_runtime_work(struct work_struct *work) +{ + struct device *dev = container_of(work, struct device, power.work); + enum rpm_request req; + + spin_lock_irq(&dev->power.lock); + + if (!dev->power.request_pending) + goto out; + + req = dev->power.request; + dev->power.request = RPM_REQ_NONE; + dev->power.request_pending = false; + + switch (req) { + case RPM_REQ_NONE: + break; + case RPM_REQ_IDLE: + __pm_runtime_idle(dev); + break; + case RPM_REQ_SUSPEND: + __pm_runtime_suspend(dev, true); + break; + case RPM_REQ_RESUME: + __pm_runtime_resume(dev, true); + break; + } + + out: + spin_unlock_irq(&dev->power.lock); +} + +/** + * __pm_request_idle - Submit an idle notification request for given device. + * @dev: Device to handle. + * + * Check if the device's run-time PM status is correct for suspending the device + * and queue up a request to run __pm_runtime_idle() for it. + * + * This function must be called under dev->power.lock with interrupts disabled.
+ */ +static int __pm_request_idle(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + retval = -EINVAL; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0 + || dev->power.runtime_status == RPM_SUSPENDED + || dev->power.runtime_status == RPM_SUSPENDING) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + return retval; + + if (dev->power.request_pending) { + /* Any requests other than RPM_REQ_IDLE take precedence. */ + if (dev->power.request == RPM_REQ_NONE) + dev->power.request = RPM_REQ_IDLE; + else if (dev->power.request != RPM_REQ_IDLE) + retval = -EAGAIN; + return retval; + } + + dev->power.request = RPM_REQ_IDLE; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return retval; +} + +/** + * pm_request_idle - Submit an idle notification request for given device. + * @dev: Device to handle. + */ +int pm_request_idle(struct device *dev) +{ + unsigned long flags; + int retval; + + spin_lock_irqsave(&dev->power.lock, flags); + retval = __pm_request_idle(dev); + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_request_idle); + +/** + * __pm_request_suspend - Submit a suspend request for given device. + * @dev: Device to suspend. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +static int __pm_request_suspend(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + return -EINVAL; + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0) + retval = -EAGAIN; + else if (dev->power.runtime_status == RPM_SUSPENDING) + retval = -EINPROGRESS; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval < 0) + return retval; + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* + * Pending resume requests take precedence over us, but we can + * overtake any other pending request. + */ + if (dev->power.request == RPM_REQ_RESUME) + retval = -EAGAIN; + else if (dev->power.request != RPM_REQ_SUSPEND) + dev->power.request = retval ? + RPM_REQ_NONE : RPM_REQ_SUSPEND; + return retval; + } else if (retval) { + return retval; + } + + dev->power.request = RPM_REQ_SUSPEND; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return 0; +} + +/** + * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). + * @data: Device pointer passed by pm_schedule_suspend(). + * + * Check if the time is right and execute __pm_request_suspend() in that case. + */ +static void pm_suspend_timer_fn(unsigned long data) +{ + struct device *dev = (struct device *)data; + unsigned long flags; + unsigned long expires; + + spin_lock_irqsave(&dev->power.lock, flags); + + expires = dev->power.timer_expires; + /* If 'expires' is after 'jiffies', the timer fired too early, so do nothing. */ + if (expires > 0 && !time_after(expires, jiffies)) { + dev->power.timer_expires = 0; + __pm_request_suspend(dev); + } + + spin_unlock_irqrestore(&dev->power.lock, flags); +} + +/** + * pm_schedule_suspend - Set up a timer to submit a suspend request in future. + * @dev: Device to suspend. + * @delay: Time to wait before submitting a suspend request, in milliseconds.
+ */ +int pm_schedule_suspend(struct device *dev, unsigned int delay) +{ + unsigned long flags; + int retval = 0; + + spin_lock_irqsave(&dev->power.lock, flags); + + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + if (!delay) { + retval = __pm_request_suspend(dev); + goto out; + } + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* + * Pending resume requests take precedence over us, but any + * other pending requests have to be canceled. + */ + if (dev->power.request == RPM_REQ_RESUME) { + retval = -EAGAIN; + goto out; + } + dev->power.request = RPM_REQ_NONE; + } + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (dev->power.runtime_status == RPM_SUSPENDING) + retval = -EINPROGRESS; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + dev->power.timer_expires = jiffies + msecs_to_jiffies(delay); + mod_timer(&dev->power.suspend_timer, dev->power.timer_expires); + + out: + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_schedule_suspend); + +/** + * __pm_request_resume - Submit a resume request for given device. + * @dev: Device to resume. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +static int __pm_request_resume(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + return -EINVAL; + + if (dev->power.runtime_status == RPM_ACTIVE) + retval = 1; + else if (dev->power.runtime_status == RPM_RESUMING) + retval = -EINPROGRESS; + else if (dev->power.disable_depth > 0) + retval = -EAGAIN; + if (retval < 0) + return retval; + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* If non-resume request is pending, we can overtake it. */ + dev->power.request = retval ? RPM_REQ_NONE : RPM_REQ_RESUME; + return retval; + } else if (retval) { + return retval; + } + + dev->power.request = RPM_REQ_RESUME; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return retval; +} + +/** + * pm_request_resume - Submit a resume request for given device. + * @dev: Device to resume. + */ +int pm_request_resume(struct device *dev) +{ + unsigned long flags; + int retval; + + spin_lock_irqsave(&dev->power.lock, flags); + retval = __pm_request_resume(dev); + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_request_resume); + +/** + * __pm_runtime_get - Reference count a device and wake it up, if necessary. + * @dev: Device to handle. + * @sync: If set and the device is suspended, resume it synchronously. + * + * Increment the usage count of the device and if it was zero previously, + * resume it or submit a resume request for it, depending on the value of @sync. + */ +int __pm_runtime_get(struct device *dev, bool sync) +{ + int retval = 1; + + if (atomic_add_return(1, &dev->power.usage_count) == 1) + retval = sync ? pm_runtime_resume(dev) : pm_request_resume(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(__pm_runtime_get); + +/** + * __pm_runtime_put - Decrement the device's usage counter and notify its bus. + * @dev: Device to handle. + * @sync: If the device's bus type is to be notified, do that synchronously.
+ * + * Decrement the usage count of the device and if it reaches zero, carry out a + * synchronous idle notification or submit an idle notification request for it, + * depending on the value of @sync. + */ +int __pm_runtime_put(struct device *dev, bool sync) +{ + int retval = 0; + + if (atomic_dec_and_test(&dev->power.usage_count)) + retval = sync ? pm_runtime_idle(dev) : pm_request_idle(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(__pm_runtime_put); + +/** + * __pm_runtime_set_status - Set run-time PM status of a device. + * @dev: Device to handle. + * @status: New run-time PM status of the device. + * + * If run-time PM of the device is disabled or its power.runtime_error field is + * different from zero, the status may be changed either to RPM_ACTIVE, or to + * RPM_SUSPENDED, as long as that reflects the actual state of the device. + * However, if the device has a parent and the parent is not active, and the + * parent's power.ignore_children flag is unset, the device's status cannot be + * set to RPM_ACTIVE, so -EBUSY is returned in that case. + * + * If successful, __pm_runtime_set_status() clears the power.runtime_error field + * and the device parent's counter of unsuspended children is modified to + * reflect the new status. If the new status is RPM_SUSPENDED, an idle + * notification request for the parent is submitted. + */ +int __pm_runtime_set_status(struct device *dev, unsigned int status) +{ + struct device *parent = dev->parent; + unsigned long flags; + bool notify_parent = false; + int error = 0; + + if (status != RPM_ACTIVE && status != RPM_SUSPENDED) + return -EINVAL; + + spin_lock_irqsave(&dev->power.lock, flags); + + if (!dev->power.runtime_error && !dev->power.disable_depth) { + error = -EAGAIN; + goto out; + } + + if (dev->power.runtime_status == status) + goto out_set; + + if (status == RPM_SUSPENDED) { + /* It always is possible to set the status to 'suspended'. */ + if (parent) { + atomic_add_unless(&parent->power.child_count, -1, 0); + notify_parent = !parent->power.ignore_children; + } + goto out_set; + } + + if (parent) { + spin_lock_irq(&parent->power.lock); + + /* + * It is invalid to put an active child under a parent that is + * not active, has run-time PM enabled and the + * 'power.ignore_children' flag unset. + */ + if (!parent->power.disable_depth + && !parent->power.ignore_children + && parent->power.runtime_status != RPM_ACTIVE) { + error = -EBUSY; + } else { + if (dev->power.runtime_status == RPM_SUSPENDED) + atomic_inc(&parent->power.child_count); + } + + spin_unlock_irq(&parent->power.lock); + + if (error) + goto out; + } + + out_set: + dev->power.runtime_status = status; + dev->power.runtime_error = 0; + out: + spin_unlock_irqrestore(&dev->power.lock, flags); + + if (notify_parent) + pm_request_idle(parent); + + return error; +} +EXPORT_SYMBOL_GPL(__pm_runtime_set_status); + +/** + * __pm_runtime_barrier - Cancel pending requests and wait for completions. + * @dev: Device to handle. + * + * Flush all pending requests for the device from pm_wq and wait for all + * run-time PM operations involving the device in progress to complete. + * + * Should be called under dev->power.lock with interrupts disabled. 
+
+/**
+ * __pm_runtime_barrier - Cancel pending requests and wait for completions.
+ * @dev: Device to handle.
+ *
+ * Flush all pending requests for the device from pm_wq and wait for all
+ * run-time PM operations involving the device in progress to complete.
+ *
+ * Should be called under dev->power.lock with interrupts disabled.
+ */
+static void __pm_runtime_barrier(struct device *dev)
+{
+	pm_runtime_deactivate_timer(dev);
+
+	if (dev->power.request_pending) {
+		dev->power.request = RPM_REQ_NONE;
+		spin_unlock_irq(&dev->power.lock);
+
+		cancel_work_sync(&dev->power.work);
+
+		spin_lock_irq(&dev->power.lock);
+		dev->power.request_pending = false;
+	}
+
+	if (dev->power.runtime_status == RPM_SUSPENDING
+	    || dev->power.runtime_status == RPM_RESUMING
+	    || dev->power.idle_notification) {
+		DEFINE_WAIT(wait);
+
+		/* Suspend, wake-up or idle notification in progress. */
+		for (;;) {
+			prepare_to_wait(&dev->power.wait_queue, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (dev->power.runtime_status != RPM_SUSPENDING
+			    && dev->power.runtime_status != RPM_RESUMING
+			    && !dev->power.idle_notification)
+				break;
+			spin_unlock_irq(&dev->power.lock);
+
+			schedule();
+
+			spin_lock_irq(&dev->power.lock);
+		}
+		finish_wait(&dev->power.wait_queue, &wait);
+	}
+}
+
+/**
+ * pm_runtime_barrier - Flush pending requests and wait for completions.
+ * @dev: Device to handle.
+ *
+ * Prevent the device from being suspended by incrementing its usage counter
+ * and, if there's a pending resume request for the device, wake the device
+ * up. Next, make sure that all pending requests for the device have been
+ * flushed from pm_wq and wait for all run-time PM operations involving the
+ * device in progress to complete.
+ *
+ * Return value:
+ * 1, if there was a resume request pending and the device had to be woken up,
+ * 0, otherwise
+ */
+int pm_runtime_barrier(struct device *dev)
+{
+	int retval = 0;
+
+	pm_runtime_get_noresume(dev);
+	spin_lock_irq(&dev->power.lock);
+
+	if (dev->power.request_pending
+	    && dev->power.request == RPM_REQ_RESUME) {
+		__pm_runtime_resume(dev, false);
+		retval = 1;
+	}
+
+	__pm_runtime_barrier(dev);
+
+	spin_unlock_irq(&dev->power.lock);
+	pm_runtime_put_noidle(dev);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_runtime_barrier);
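The barrier and the disable machinery below are what unbind paths are expected to rely on. A rough sketch of a removal routine, assuming a hypothetical foo_hw_shutdown() helper and using the pm_runtime_disable() and pm_runtime_set_suspended() wrappers added in pm_runtime.h:

static int foo_remove(struct device *dev)
{
	/*
	 * pm_runtime_disable() is __pm_runtime_disable(dev, true): a pending
	 * resume request is honored first, then remaining requests are
	 * canceled and callbacks in flight are waited for.
	 */
	pm_runtime_disable(dev);

	foo_hw_shutdown(dev);	/* assumed to power the device down */

	/* Leave the status the way pm_runtime_init() originally set it up. */
	pm_runtime_set_suspended(dev);
	return 0;
}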
+
+/**
+ * __pm_runtime_disable - Disable run-time PM of a device.
+ * @dev: Device to handle.
+ * @check_resume: If set, check if there's a resume request for the device.
+ *
+ * Increment power.disable_depth for the device and, if it was zero previously,
+ * cancel all pending run-time PM requests for the device and wait for all
+ * operations in progress to complete. The device can be either active or
+ * suspended after its run-time PM has been disabled.
+ *
+ * If @check_resume is set and there's a resume request pending when
+ * __pm_runtime_disable() is called and power.disable_depth is zero, the
+ * function will wake up the device before disabling its run-time PM.
+ */
+void __pm_runtime_disable(struct device *dev, bool check_resume)
+{
+	spin_lock_irq(&dev->power.lock);
+
+	if (dev->power.disable_depth > 0) {
+		dev->power.disable_depth++;
+		goto out;
+	}
+
+	/*
+	 * Wake up the device if there's a resume request pending, because that
+	 * means there probably is some I/O to process and disabling run-time
+	 * PM shouldn't prevent the device from processing the I/O.
+	 */
+	if (check_resume && dev->power.request_pending
+	    && dev->power.request == RPM_REQ_RESUME) {
+		/*
+		 * Prevent suspends and idle notifications from being carried
+		 * out after we have woken up the device.
+		 */
+		pm_runtime_get_noresume(dev);
+
+		__pm_runtime_resume(dev, false);
+
+		pm_runtime_put_noidle(dev);
+	}
+
+	if (!dev->power.disable_depth++)
+		__pm_runtime_barrier(dev);
+
+ out:
+	spin_unlock_irq(&dev->power.lock);
+}
+EXPORT_SYMBOL_GPL(__pm_runtime_disable);
+
+/**
+ * pm_runtime_enable - Enable run-time PM of a device.
+ * @dev: Device to handle.
+ */
+void pm_runtime_enable(struct device *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+
+	if (dev->power.disable_depth > 0)
+		dev->power.disable_depth--;
+	else
+		dev_warn(dev, "Unbalanced %s!\n", __func__);
+
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+}
+EXPORT_SYMBOL_GPL(pm_runtime_enable);
+
+/**
+ * pm_runtime_init - Initialize run-time PM fields in given device object.
+ * @dev: Device object to initialize.
+ */
+void pm_runtime_init(struct device *dev)
+{
+	spin_lock_init(&dev->power.lock);
+
+	dev->power.runtime_status = RPM_SUSPENDED;
+	dev->power.idle_notification = false;
+
+	dev->power.disable_depth = 1;
+	atomic_set(&dev->power.usage_count, 0);
+
+	dev->power.runtime_error = 0;
+
+	atomic_set(&dev->power.child_count, 0);
+	pm_suspend_ignore_children(dev, false);
+
+	dev->power.request_pending = false;
+	dev->power.request = RPM_REQ_NONE;
+	dev->power.deferred_resume = false;
+	INIT_WORK(&dev->power.work, pm_runtime_work);
+
+	dev->power.timer_expires = 0;
+	setup_timer(&dev->power.suspend_timer, pm_suspend_timer_fn,
+			(unsigned long)dev);
+
+	init_waitqueue_head(&dev->power.wait_queue);
+}
+
+/**
+ * pm_runtime_remove - Prepare for removing a device from device hierarchy.
+ * @dev: Device object being removed from device hierarchy.
+ */
+void pm_runtime_remove(struct device *dev)
+{
+	__pm_runtime_disable(dev, false);
+
+	/* Change the status back to 'suspended' to match the initial status. */
+	if (dev->power.runtime_status == RPM_ACTIVE)
+		pm_runtime_set_suspended(dev);
+}
diff --git a/include/linux/pm.h b/include/linux/pm.h
index b3f74764a58..2b6e20df0e5 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -22,6 +22,10 @@
 #define _LINUX_PM_H
 
 #include
+#include
+#include
+#include
+#include
 
 /*
  * Callbacks for platform drivers to implement.
@@ -165,6 +169,28 @@ typedef struct pm_message {
  * It is allowed to unregister devices while the above callbacks are being
  * executed. However, it is not allowed to unregister a device from within any
  * of its own callbacks.
+ *
+ * There are also the following callbacks related to run-time power management
+ * of devices:
+ *
+ * @runtime_suspend: Prepare the device for a condition in which it won't be
+ *	able to communicate with the CPU(s) and RAM due to power management.
+ *	This need not mean that the device should be put into a low power state.
+ *	For example, if the device is behind a link which is about to be turned
+ *	off, the device may remain at full power. If the device does go to low
+ *	power and if device_may_wakeup(dev) is true, remote wake-up (i.e., a
+ *	hardware mechanism allowing the device to request a change of its power
+ *	state, such as PCI PME) should be enabled for it.
+ *
+ * @runtime_resume: Put the device into the fully active state in response to a
+ *	wake-up event generated by hardware or at the request of software. If
+ *	necessary, put the device into the full power state and restore its
+ *	registers, so that it is fully operational.
+ *
+ * @runtime_idle: Device appears to be inactive and it might be put into a low
+ *	power state if all of the necessary conditions are satisfied. Check
+ *	these conditions and handle the device as appropriate, possibly queueing
+ *	a suspend request for it. The return value is ignored by the PM core.
  */
 
 struct dev_pm_ops {
@@ -182,6 +208,9 @@ struct dev_pm_ops {
 	int (*thaw_noirq)(struct device *dev);
 	int (*poweroff_noirq)(struct device *dev);
 	int (*restore_noirq)(struct device *dev);
+	int (*runtime_suspend)(struct device *dev);
+	int (*runtime_resume)(struct device *dev);
+	int (*runtime_idle)(struct device *dev);
 };
 
 /**
@@ -315,14 +344,80 @@ enum dpm_state {
 	DPM_OFF_IRQ,
 };
 
+/**
+ * Device run-time power management status.
+ *
+ * These status labels are used internally by the PM core to indicate the
+ * current status of a device with respect to the PM core operations. They do
+ * not reflect the actual power state of the device or its status as seen by
+ * the driver.
+ *
+ * RPM_ACTIVE		Device is fully operational. Indicates that the device
+ *			bus type's ->runtime_resume() callback has completed
+ *			successfully.
+ *
+ * RPM_SUSPENDED	Device bus type's ->runtime_suspend() callback has
+ *			completed successfully. The device is regarded as
+ *			suspended.
+ *
+ * RPM_RESUMING		Device bus type's ->runtime_resume() callback is being
+ *			executed.
+ *
+ * RPM_SUSPENDING	Device bus type's ->runtime_suspend() callback is being
+ *			executed.
+ */
+
+enum rpm_status {
+	RPM_ACTIVE = 0,
+	RPM_RESUMING,
+	RPM_SUSPENDED,
+	RPM_SUSPENDING,
+};
+
+/**
+ * Device run-time power management request types.
+ *
+ * RPM_REQ_NONE		Do nothing.
+ *
+ * RPM_REQ_IDLE		Run the device bus type's ->runtime_idle() callback
+ *
+ * RPM_REQ_SUSPEND	Run the device bus type's ->runtime_suspend() callback
+ *
+ * RPM_REQ_RESUME	Run the device bus type's ->runtime_resume() callback
+ */
+
+enum rpm_request {
+	RPM_REQ_NONE = 0,
+	RPM_REQ_IDLE,
+	RPM_REQ_SUSPEND,
+	RPM_REQ_RESUME,
+};
+
 struct dev_pm_info {
 	pm_message_t		power_state;
-	unsigned		can_wakeup:1;
-	unsigned		should_wakeup:1;
+	unsigned int		can_wakeup:1;
+	unsigned int		should_wakeup:1;
 	enum dpm_state		status;		/* Owned by the PM core */
-#ifdef	CONFIG_PM_SLEEP
+#ifdef CONFIG_PM_SLEEP
 	struct list_head	entry;
 #endif
+#ifdef CONFIG_PM_RUNTIME
+	struct timer_list	suspend_timer;
+	unsigned long		timer_expires;
+	struct work_struct	work;
+	wait_queue_head_t	wait_queue;
+	spinlock_t		lock;
+	atomic_t		usage_count;
+	atomic_t		child_count;
+	unsigned int		disable_depth:3;
+	unsigned int		ignore_children:1;
+	unsigned int		idle_notification:1;
+	unsigned int		request_pending:1;
+	unsigned int		deferred_resume:1;
+	enum rpm_request	request;
+	enum rpm_status		runtime_status;
+	int			runtime_error;
+#endif
 };
 
 /*
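To make the callback split concrete, here is a hedged sketch of a bus type filling in the three new dev_pm_ops hooks; the foo_bus_* helpers are assumptions, and only the callback wiring and pm_schedule_suspend() come from this patch:

static int foo_bus_runtime_suspend(struct device *dev)
{
	/* Quiesce the device; a negative return value refuses the suspend. */
	return foo_bus_power_down(dev);
}

static int foo_bus_runtime_resume(struct device *dev)
{
	/* Restore full power and re-program the device's registers. */
	return foo_bus_power_up(dev);
}

static int foo_bus_runtime_idle(struct device *dev)
{
	/*
	 * The device looks inactive, so queue a suspend request right away.
	 * The PM core ignores the return value of ->runtime_idle().
	 */
	return pm_schedule_suspend(dev, 0);
}

static const struct dev_pm_ops foo_bus_pm_ops = {
	.runtime_suspend = foo_bus_runtime_suspend,
	.runtime_resume = foo_bus_runtime_resume,
	.runtime_idle = foo_bus_runtime_idle,
};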
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
new file mode 100644
index 00000000000..44087044910
--- /dev/null
+++ b/include/linux/pm_runtime.h
@@ -0,0 +1,114 @@
+/*
+ * pm_runtime.h - Device run-time power management helper functions.
+ *
+ * Copyright (C) 2009 Rafael J. Wysocki
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef _LINUX_PM_RUNTIME_H
+#define _LINUX_PM_RUNTIME_H
+
+#include
+#include
+
+#ifdef CONFIG_PM_RUNTIME
+
+extern struct workqueue_struct *pm_wq;
+
+extern int pm_runtime_idle(struct device *dev);
+extern int pm_runtime_suspend(struct device *dev);
+extern int pm_runtime_resume(struct device *dev);
+extern int pm_request_idle(struct device *dev);
+extern int pm_schedule_suspend(struct device *dev, unsigned int delay);
+extern int pm_request_resume(struct device *dev);
+extern int __pm_runtime_get(struct device *dev, bool sync);
+extern int __pm_runtime_put(struct device *dev, bool sync);
+extern int __pm_runtime_set_status(struct device *dev, unsigned int status);
+extern int pm_runtime_barrier(struct device *dev);
+extern void pm_runtime_enable(struct device *dev);
+extern void __pm_runtime_disable(struct device *dev, bool check_resume);
+
+static inline bool pm_children_suspended(struct device *dev)
+{
+	return dev->power.ignore_children
+		|| !atomic_read(&dev->power.child_count);
+}
+
+static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
+{
+	dev->power.ignore_children = enable;
+}
+
+static inline void pm_runtime_get_noresume(struct device *dev)
+{
+	atomic_inc(&dev->power.usage_count);
+}
+
+static inline void pm_runtime_put_noidle(struct device *dev)
+{
+	atomic_add_unless(&dev->power.usage_count, -1, 0);
+}
+
+#else /* !CONFIG_PM_RUNTIME */
+
+static inline int pm_runtime_idle(struct device *dev) { return -ENOSYS; }
+static inline int pm_runtime_suspend(struct device *dev) { return -ENOSYS; }
+static inline int pm_runtime_resume(struct device *dev) { return 0; }
+static inline int pm_request_idle(struct device *dev) { return -ENOSYS; }
+static inline int pm_schedule_suspend(struct device *dev, unsigned int delay)
+{
+	return -ENOSYS;
+}
+static inline int pm_request_resume(struct device *dev) { return 0; }
+static inline int __pm_runtime_get(struct device *dev, bool sync) { return 1; }
+static inline int __pm_runtime_put(struct device *dev, bool sync) { return 0; }
+static inline int __pm_runtime_set_status(struct device *dev,
+					    unsigned int status) { return 0; }
+static inline int pm_runtime_barrier(struct device *dev) { return 0; }
+static inline void pm_runtime_enable(struct device *dev) {}
+static inline void __pm_runtime_disable(struct device *dev, bool c) {}
+
+static inline bool pm_children_suspended(struct device *dev) { return false; }
+static inline void pm_suspend_ignore_children(struct device *dev, bool en) {}
+static inline void pm_runtime_get_noresume(struct device *dev) {}
+static inline void pm_runtime_put_noidle(struct device *dev) {}
+
+#endif /* !CONFIG_PM_RUNTIME */
+
+static inline int pm_runtime_get(struct device *dev)
+{
+	return __pm_runtime_get(dev, false);
+}
+
+static inline int pm_runtime_get_sync(struct device *dev)
+{
+	return __pm_runtime_get(dev, true);
+}
+
+static inline int pm_runtime_put(struct device *dev)
+{
+	return __pm_runtime_put(dev, false);
+}
+
+static inline int pm_runtime_put_sync(struct device *dev)
+{
+	return __pm_runtime_put(dev, true);
+}
+
+static inline int pm_runtime_set_active(struct device *dev)
+{
+	return __pm_runtime_set_status(dev, RPM_ACTIVE);
+}
+
+static inline void pm_runtime_set_suspended(struct device *dev)
+{
+	__pm_runtime_set_status(dev, RPM_SUSPENDED);
+}
+
+static inline void pm_runtime_disable(struct device *dev)
+{
+	__pm_runtime_disable(dev, true);
+}
+
+#endif
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37..91e09d3b2eb 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
 	  random kernel OOPSes or reboots that don't seem to be related to
 	  anything, try disabling/enabling this option (or disabling/enabling
 	  APM in your BIOS).
+
+config PM_RUNTIME
+	bool "Run-time PM core functionality"
+	depends on PM
+	---help---
+	  Enable functionality allowing I/O devices to be put into energy-saving
+	  (low power) states at run time (or autosuspended) after a specified
+	  period of inactivity and woken up in response to a hardware-generated
+	  wake-up event or a driver's request.
+
+	  Hardware support is generally required for this functionality to work
+	  and the bus type drivers of the buses the devices are on are
+	  responsible for the actual handling of the autosuspend requests and
+	  wake-up events.
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930c..347d2cc88cd 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 
 #include "power.h"
 
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
 	.attrs = g,
 };
 
+#ifdef CONFIG_PM_RUNTIME
+struct workqueue_struct *pm_wq;
+
+static int __init pm_start_workqueue(void)
+{
+	pm_wq = create_freezeable_workqueue("pm");
+
+	return pm_wq ? 0 : -ENOMEM;
+}
+#else
+static inline int pm_start_workqueue(void) { return 0; }
+#endif
+
 static int __init pm_init(void)
 {
+	int error = pm_start_workqueue();
+	if (error)
+		return error;
 	power_kobj = kobject_create_and_add("power", NULL);
 	if (!power_kobj)
 		return -ENOMEM;
-- cgit v1.2.3-70-g09d2

From 31ffe249e5426d2648d68568fa00a7b66666a5db Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Wed, 26 Aug 2009 16:32:37 -0700
Subject: net: Temporarily backout SKB sources tracer.

Steven Rostedt has suggested that Neil work with the tracing folks,
trying to use TRACE_EVENT as the mechanism for implementation. And if
that doesn't work out, we can investigate other solutions, such as the
one that was tried here.

This reverts the following 2 commits:

5a165657bef7c47e5ff4cd138f7758ef6278e87b
("net: skb ftracer - Add config option to enable new ftracer (v3)")

9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb
("net: skb ftracer - Add actual ftrace code to kernel (v3)")

Signed-off-by: David S. Miller
---
 kernel/trace/Kconfig | 11 ---
 kernel/trace/Makefile | 1 -
 kernel/trace/trace.h | 19 -----
 kernel/trace/trace_skb_sources.c | 154 ---------------------------------------
 4 files changed, 185 deletions(-)
 delete mode 100644 kernel/trace/trace_skb_sources.c
(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9a2f19bf051..019f380fd76 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -234,17 +234,6 @@ config BOOT_TRACER
 	  You must pass in initcall_debug and ftrace=initcall to the kernel
 	  command line to enable this on bootup.
 
-config SKB_SOURCES_TRACER
-	bool "Trace skb source information"
-	depends on NET
-	select GENERIC_TRACER
-	help
-	  This tracer helps developers/sysadmins correlate skb allocation and
-	  consumption. The idea being that some processes will primarily consume
-	  data that was allocated on certain numa nodes. By being able to
-	  visualize which nodes the data was allocated on, a sysadmin or
-	  developer can optimize the scheduling of those processes to cut back
-	  on cross node chatter.
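For reference, the TRACE_EVENT() direction suggested in the changelog above might look roughly like the sketch below. The event hooks the same skb_copy_datagram_iovec tracepoint the reverted tracer used, but the field layout is an assumption rather than a merged definition, and the usual TRACE_SYSTEM/define_trace.h header boilerplate is omitted:

#include <linux/skbuff.h>
#include <linux/tracepoint.h>

TRACE_EVENT(skb_copy_datagram_iovec,

	TP_PROTO(const struct sk_buff *skb, int len),

	TP_ARGS(skb, len),

	TP_STRUCT__entry(
		__field(void *, skbaddr)
		__field(int, len)
	),

	TP_fast_assign(
		/* Record which skb was consumed and how much data was copied. */
		__entry->skbaddr = (void *)skb;
		__entry->len = len;
	),

	TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len)
);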
- config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ee5e5b1b928..844164dca90 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -49,7 +49,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o endif -obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8a6281b2330..8b9f4f6e955 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -41,7 +40,6 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, - TRACE_SKB_SOURCE, __TRACE_LAST_TYPE, }; @@ -173,21 +171,6 @@ struct trace_power { struct power_trace state_data; }; -struct skb_record { - pid_t pid; /* pid of the copying process */ - int anid; /* node where skb was allocated */ - int cnid; /* node to which skb was copied in userspace */ - char ifname[IFNAMSIZ]; /* Name of the receiving interface */ - int rx_queue; /* The rx queue the skb was received on */ - int ccpu; /* Cpu the application got this frame from */ - int len; /* length of the data copied */ -}; - -struct trace_skb_event { - struct trace_entry ent; - struct skb_record event_data; -}; - enum kmemtrace_type_id { KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ @@ -340,8 +323,6 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ - IF_ASSIGN(var, ent, struct trace_skb_event, \ - TRACE_SKB_SOURCE); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c deleted file mode 100644 index 40eb071afdb..00000000000 --- a/kernel/trace/trace_skb_sources.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * ring buffer based tracer for analyzing per-socket skb sources - * - * Neil Horman - * Copyright (C) 2009 - * - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec); - -static struct trace_array *skb_trace; -static int __read_mostly trace_skb_source_enabled; - -static void probe_skb_dequeue(const struct sk_buff *skb, int len) -{ - struct ring_buffer_event *event; - struct trace_skb_event *entry; - struct trace_array *tr = skb_trace; - struct net_device *dev; - - if (!trace_skb_source_enabled) - return; - - if (in_interrupt()) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - - entry->event_data.pid = current->pid; - entry->event_data.anid = page_to_nid(virt_to_page(skb->data)); - entry->event_data.cnid = cpu_to_node(smp_processor_id()); - entry->event_data.len = len; - entry->event_data.rx_queue = skb->queue_mapping; - entry->event_data.ccpu = smp_processor_id(); - - dev = dev_get_by_index(sock_net(skb->sk), skb->iif); - if (dev) { - memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ); - dev_put(dev); - } else { - strcpy(entry->event_data.ifname, "Unknown"); - } - - trace_buffer_unlock_commit(tr, event, 0, 0); -} - -static int tracing_skb_source_register(void) -{ - int ret; - - ret = 
register_trace_skb_copy_datagram_iovec(probe_skb_dequeue); - if (ret) - pr_info("skb source trace: Couldn't activate dequeue tracepoint"); - - return ret; -} - -static void start_skb_source_trace(struct trace_array *tr) -{ - trace_skb_source_enabled = 1; -} - -static void stop_skb_source_trace(struct trace_array *tr) -{ - trace_skb_source_enabled = 0; -} - -static void skb_source_trace_reset(struct trace_array *tr) -{ - trace_skb_source_enabled = 0; - unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue); -} - - -static int skb_source_trace_init(struct trace_array *tr) -{ - int cpu; - skb_trace = tr; - - trace_skb_source_enabled = 1; - tracing_skb_source_register(); - - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); - return 0; -} - -static enum print_line_t skb_source_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_skb_event *event; - struct skb_record *record; - struct trace_seq *s = &iter->seq; - - trace_assign_type(event, entry); - record = &event->event_data; - if (entry->type != TRACE_SKB_SOURCE) - return TRACE_TYPE_UNHANDLED; - - ret = trace_seq_printf(s, " %d %d %d %s %d %d %d\n", - record->pid, - record->anid, - record->cnid, - record->ifname, - record->rx_queue, - record->ccpu, - record->len); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static void skb_source_print_header(struct seq_file *s) -{ - seq_puts(s, "# PID ANID CNID IFC RXQ CCPU LEN\n"); - seq_puts(s, "# | | | | | | |\n"); -} - -static struct tracer skb_source_tracer __read_mostly = -{ - .name = "skb_sources", - .init = skb_source_trace_init, - .start = start_skb_source_trace, - .stop = stop_skb_source_trace, - .reset = skb_source_trace_reset, - .print_line = skb_source_print_line, - .print_header = skb_source_print_header, -}; - -static int init_skb_source_trace(void) -{ - return register_tracer(&skb_source_tracer); -} -device_initcall(init_skb_source_trace); -- cgit v1.2.3-70-g09d2 From 2bc481cf433879f0e6cdd4d899fc21ee05dcea23 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 28 Aug 2009 23:41:29 -0700 Subject: pktgen: spin using hrtimer This changes how the pktgen thread spins/waits between packets if delay is configured. It uses a high res timer to wait for time to arrive. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- kernel/hrtimer.c | 2 ++ net/core/pktgen.c | 49 ++++++++++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 49da79ab848..05071bf6a37 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -485,6 +485,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, debug_object_init_on_stack(timer, &hrtimer_debug_descr); __hrtimer_init(timer, clock_id, mode); } +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { @@ -1477,6 +1478,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; } +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bede00b5403..3045dd13307 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -131,6 +131,7 @@ #include #include #include +#include #include #include #include @@ -2086,33 +2087,40 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) pkt_dev->nflows = 0; } -static inline s64 delta_ns(ktime_t a, ktime_t b) -{ - return ktime_to_ns(ktime_sub(a, b)); -} static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) { - ktime_t start, now; - s64 dt; + ktime_t start; + s32 remaining; + struct hrtimer_sleeper t; - start = now = ktime_now(); + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_set_expires(&t.timer, spin_until); + + remaining = ktime_to_us(hrtimer_expires_remaining(&t.timer)); + if (remaining <= 0) + return; - while ((dt = delta_ns(spin_until, now)) > 0) { - /* TODO: optimize sleeping behavior */ - if (dt > TICK_NSEC) - schedule_timeout_interruptible(1); - else if (dt > 100*NSEC_PER_USEC) { - if (!pkt_dev->running) - return; - if (need_resched()) + start = ktime_now(); + if (remaining < 100) + udelay(remaining); /* really small just spin */ + else { + /* see do_nanosleep */ + hrtimer_init_sleeper(&t, current); + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) schedule(); - } - now = ktime_now(); + hrtimer_cancel(&t.timer); + } while (t.task && pkt_dev->running && !signal_pending(current)); + __set_current_state(TASK_RUNNING); } - - pkt_dev->idle_acc += ktime_to_ns(ktime_sub(now, start)); + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), start)); } static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) @@ -3360,8 +3368,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) int ret; if (pkt_dev->delay) { - if (ktime_lt(ktime_now(), pkt_dev->next_tx)) - spin(pkt_dev, pkt_dev->next_tx); + spin(pkt_dev, pkt_dev->next_tx); /* This is max DELAY, this has special meaning of * "never transmit" -- cgit v1.2.3-70-g09d2 From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Tue, 1 Sep 2009 18:25:07 -0700 Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86 Move tboot.h from asm to linux to fix the build errors of intel_txt patch on non-X86 platforms. Remove the tboot code from generic code init/main.c and kernel/cpu.c. Signed-off-by: Shane Wang Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 4 + arch/x86/include/asm/tboot.h | 197 ------------------------------------------ arch/x86/kernel/reboot.c | 3 +- arch/x86/kernel/setup.c | 3 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tboot.c | 58 ++++++++++--- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/pci/dmar.c | 2 +- drivers/pci/intel-iommu.c | 2 +- include/linux/tboot.h | 162 ++++++++++++++++++++++++++++++++++ init/main.c | 3 - kernel/cpu.c | 6 +- security/Kconfig | 2 +- 13 files changed, 221 insertions(+), 225 deletions(-) delete mode 100644 arch/x86/include/asm/tboot.h create mode 100644 include/linux/tboot.h (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8..b66f2102c35 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -178,6 +178,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h deleted file mode 100644 index b13929d4e5f..00000000000 --- a/arch/x86/include/asm/tboot.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * tboot.h: shared data structure with tboot and kernel and functions - * used by kernel for runtime support of Intel(R) Trusted - * Execution Technology - * - * Copyright (c) 2006-2009, Intel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
- * - */ - -#ifndef _ASM_TBOOT_H -#define _ASM_TBOOT_H - -#include - -/* these must have the values from 0-5 in this order */ -enum { - TB_SHUTDOWN_REBOOT = 0, - TB_SHUTDOWN_S5, - TB_SHUTDOWN_S4, - TB_SHUTDOWN_S3, - TB_SHUTDOWN_HALT, - TB_SHUTDOWN_WFS -}; - -#ifdef CONFIG_INTEL_TXT - -/* used to communicate between tboot and the launched kernel */ - -#define TB_KEY_SIZE 64 /* 512 bits */ - -#define MAX_TB_MAC_REGIONS 32 - -struct tboot_mac_region { - u64 start; /* must be 64 byte -aligned */ - u32 size; /* must be 64 byte -granular */ -} __packed; - -/* GAS - Generic Address Structure (ACPI 2.0+) */ -struct tboot_acpi_generic_address { - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_width; - u64 address; -} __packed; - -/* - * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec - * (http://www.acpi.info/) - */ -struct tboot_acpi_sleep_info { - struct tboot_acpi_generic_address pm1a_cnt_blk; - struct tboot_acpi_generic_address pm1b_cnt_blk; - struct tboot_acpi_generic_address pm1a_evt_blk; - struct tboot_acpi_generic_address pm1b_evt_blk; - u16 pm1a_cnt_val; - u16 pm1b_cnt_val; - u64 wakeup_vector; - u32 vector_width; - u64 kernel_s3_resume_vector; -} __packed; - -/* - * shared memory page used for communication between tboot and kernel - */ -struct tboot { - /* - * version 3+ fields: - */ - - /* TBOOT_UUID */ - u8 uuid[16]; - - /* version number: 5 is current */ - u32 version; - - /* physical addr of tb_log_t log */ - u32 log_addr; - - /* - * physical addr of entry point for tboot shutdown and - * type of shutdown (TB_SHUTDOWN_*) being requested - */ - u32 shutdown_entry; - u32 shutdown_type; - - /* kernel-specified ACPI info for Sx shutdown */ - struct tboot_acpi_sleep_info acpi_sinfo; - - /* tboot location in memory (physical) */ - u32 tboot_base; - u32 tboot_size; - - /* memory regions (phys addrs) for tboot to MAC on S3 */ - u8 num_mac_regions; - struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; - - - /* - * version 4+ fields: - */ - - /* symmetric key for use by kernel; will be encrypted on S3 */ - u8 s3_key[TB_KEY_SIZE]; - - - /* - * version 5+ fields: - */ - - /* used to 4byte-align num_in_wfs */ - u8 reserved_align[3]; - - /* number of processors in wait-for-SIPI */ - u32 num_in_wfs; -} __packed; - -/* - * UUID for tboot data struct to facilitate matching - * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is - * represented as {} in the char array used here - */ -#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ - 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} - -extern struct tboot *tboot; - -static inline int tboot_enabled(void) -{ - return tboot != NULL; -} - -extern void tboot_probe(void); -extern void tboot_create_trampoline(void); -extern void tboot_shutdown(u32 shutdown_type); -extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); -extern int tboot_wait_for_aps(int num_aps); -extern struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl); -extern int tboot_force_iommu(void); - -#else /* CONFIG_INTEL_TXT */ - -static inline int tboot_enabled(void) -{ - return 0; -} - -static inline void tboot_probe(void) -{ -} - -static inline void tboot_create_trampoline(void) -{ -} - -static inline void tboot_shutdown(u32 shutdown_type) -{ -} - -static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, - u32 pm1b_control) -{ -} - -static inline int tboot_wait_for_aps(int num_aps) -{ - return 0; -} - -static inline struct acpi_table_header *tboot_get_dmar_table( - 
struct acpi_table_header *dmar_tbl) -{ - return dmar_tbl; -} - -static inline int tboot_force_iommu(void) -{ - return 0; -} - -#endif /* !CONFIG_INTEL_TXT */ - -#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9de01c5d979..18ce5c04242 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -24,8 +25,6 @@ # include #endif -#include - /* * Power off function, if any */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80d6e9e3248..6ce0d6f38f7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include #include +#include #include