diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.locks | 6 | ||||
-rw-r--r-- | kernel/events/core.c | 278 | ||||
-rw-r--r-- | kernel/events/hw_breakpoint.c | 191 | ||||
-rw-r--r-- | kernel/futex.c | 6 | ||||
-rw-r--r-- | kernel/irq/chip.c | 13 | ||||
-rw-r--r-- | kernel/irq/generic-chip.c | 314 | ||||
-rw-r--r-- | kernel/irq/irqdomain.c | 8 | ||||
-rw-r--r-- | kernel/irq/manage.c | 17 | ||||
-rw-r--r-- | kernel/mutex.c | 384 | ||||
-rw-r--r-- | kernel/power/Kconfig | 1 | ||||
-rw-r--r-- | kernel/rcupdate.c | 29 | ||||
-rw-r--r-- | kernel/rcutiny.c | 21 | ||||
-rw-r--r-- | kernel/rcutiny_plugin.h | 1009 | ||||
-rw-r--r-- | kernel/rcutorture.c | 39 | ||||
-rw-r--r-- | kernel/rcutree.c | 168 | ||||
-rw-r--r-- | kernel/rcutree.h | 15 | ||||
-rw-r--r-- | kernel/rcutree_plugin.h | 81 | ||||
-rw-r--r-- | kernel/rtmutex.c | 13 | ||||
-rw-r--r-- | kernel/softirq.c | 10 | ||||
-rw-r--r-- | kernel/sysctl.c | 12 | ||||
-rw-r--r-- | kernel/wait.c | 88 |
21 files changed, 1258 insertions, 1445 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d100ea..d2b32ac27a3 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y diff --git a/kernel/events/core.c b/kernel/events/core.c index b391907d535..1db3af93370 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' /* * max perf event sample rate */ -#define DEFAULT_MAX_SAMPLE_RATE 100000 -int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; -static int max_samples_per_tick __read_mostly = - DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +#define DEFAULT_MAX_SAMPLE_RATE 100000 +#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) +#define DEFAULT_CPU_TIME_MAX_PERCENT 25 + +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; + +static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; + +static atomic_t perf_sample_allowed_ns __read_mostly = + ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); + +void update_perf_cpu_limits(void) +{ + u64 tmp = perf_sample_period_ns; + + tmp *= sysctl_perf_cpu_time_max_percent; + tmp = do_div(tmp, 100); + atomic_set(&perf_sample_allowed_ns, tmp); +} + +static int perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write, return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + update_perf_cpu_limits(); return 0; } +int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; + +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + update_perf_cpu_limits(); + + return 0; +} + +/* + * perf samples are done in some very critical code paths (NMIs). + * If they take too much CPU time, the system can lock up and not + * get any real work done. This will drop the sample rate when + * we detect that events are taking too long. + */ +#define NR_ACCUMULATED_SAMPLES 128 +DEFINE_PER_CPU(u64, running_sample_length); + +void perf_sample_event_took(u64 sample_len_ns) +{ + u64 avg_local_sample_len; + u64 local_samples_len = __get_cpu_var(running_sample_length); + + if (atomic_read(&perf_sample_allowed_ns) == 0) + return; + + /* decay the counter by 1 average sample */ + local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; + local_samples_len += sample_len_ns; + __get_cpu_var(running_sample_length) = local_samples_len; + + /* + * note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us + * from having to maintain a count. + */ + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) + return; + + if (max_samples_per_tick <= 1) + return; + + max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); + sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + + printk_ratelimited(KERN_WARNING + "perf samples too long (%lld > %d), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, + atomic_read(&perf_sample_allowed_ns), + sysctl_perf_event_sample_rate); + + update_perf_cpu_limits(); +} + static atomic64_t perf_event_id; static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, @@ -655,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event, } #endif +/* + * set default to be dependent on timer tick just + * like original code + */ +#define PERF_CPU_HRTIMER (1000 / HZ) +/* + * function must be called with interrupts disbled + */ +static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +{ + struct perf_cpu_context *cpuctx; + enum hrtimer_restart ret = HRTIMER_NORESTART; + int rotations = 0; + + WARN_ON(!irqs_disabled()); + + cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); + + rotations = perf_rotate_context(cpuctx); + + /* + * arm timer if needed + */ + if (rotations) { + hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + ret = HRTIMER_RESTART; + } + + return ret; +} + +/* CPU is going down */ +void perf_cpu_hrtimer_cancel(int cpu) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (WARN_ON(cpu != smp_processor_id())) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (pmu->task_ctx_nr == perf_sw_context) + continue; + + hrtimer_cancel(&cpuctx->hrtimer); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + int timer; + + /* no multiplexing needed for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + /* + * check default is sane, if not set then force to + * default interval (1/tick) + */ + timer = pmu->hrtimer_interval_ms; + if (timer < 1) + timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + hr->function = perf_cpu_hrtimer_handler; +} + +static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + + /* not for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + if (hrtimer_active(hr)) + return; + + if (!hrtimer_callback_running(hr)) + __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, + 0, HRTIMER_MODE_REL_PINNED, 0); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1503,6 +1689,7 @@ group_sched_in(struct perf_event *group_event, if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1549,6 +1736,8 @@ group_error: pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); + return -EAGAIN; } @@ -1804,8 +1993,10 @@ static int __perf_event_enable(void *info) * If this event can't go on and it's part of a * group, then the whole group has to come off. */ - if (leader != event) + if (leader != event) { group_sched_out(leader, cpuctx, ctx); + perf_cpu_hrtimer_restart(cpuctx); + } if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_EVENT_STATE_ERROR; @@ -2552,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx) * because they're strictly cpu affine and rotate_start is called with IRQs * disabled, while rotate_context is called from IRQ context. */ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) +static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; int rotate = 0, remove = 1; @@ -2591,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) done: if (remove) list_del_init(&cpuctx->rotation_list); + + return rotate; } #ifdef CONFIG_NO_HZ_FULL @@ -2622,10 +2815,6 @@ void perf_event_task_tick(void) ctx = cpuctx->task_ctx; if (ctx) perf_adjust_freq_unthr_context(ctx, throttled); - - if (cpuctx->jiffies_interval == 1 || - !(jiffies % cpuctx->jiffies_interval)) - perf_rotate_context(cpuctx); } } @@ -5036,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); * sign as trigger. */ -static u64 perf_swevent_set_period(struct perf_event *event) +u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; @@ -5979,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } +static ssize_t +perf_event_mux_interval_ms_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); +} + +static ssize_t +perf_event_mux_interval_ms_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pmu *pmu = dev_get_drvdata(dev); + int timer, cpu, ret; + + ret = kstrtoint(buf, 0, &timer); + if (ret) + return ret; + + if (timer < 1) + return -EINVAL; + + /* same value, noting to do */ + if (timer == pmu->hrtimer_interval_ms) + return count; + + pmu->hrtimer_interval_ms = timer; + + /* update all cpuctx for this PMU */ + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + + if (hrtimer_active(&cpuctx->hrtimer)) + hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + } + + return count; +} + +#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) + static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_NULL, + __ATTR_RO(type), + __ATTR_RW(perf_event_mux_interval_ms), + __ATTR_NULL, }; static int pmu_bus_running; @@ -6027,7 +6263,7 @@ free_dev: static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, char *name, int type) +int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret; @@ -6076,7 +6312,9 @@ skip_type: lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; - cpuctx->jiffies_interval = 1; + + __perf_cpu_hrtimer_init(cpuctx, cpu); + INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -6402,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; - /* kernel level capture: check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { @@ -6424,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, */ attr->branch_sample_type = mask; } + /* privileged levels capture (kernel, hv): check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { @@ -7476,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; - default: break; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 20185ea64aa..1559fb0b929 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -46,23 +46,26 @@ #include <linux/smp.h> #include <linux/hw_breakpoint.h> - - /* * Constraints data */ +struct bp_cpuinfo { + /* Number of pinned cpu breakpoints in a cpu */ + unsigned int cpu_pinned; + /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ + unsigned int *tsk_pinned; + /* Number of non-pinned cpu/task breakpoints in a cpu */ + unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ +}; -/* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); - -/* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); - -/* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); - +static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); static int nr_slots[TYPE_MAX]; +static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) +{ + return per_cpu_ptr(bp_cpuinfo + type, cpu); +} + /* Keep track of the breakpoints attached to tasks */ static LIST_HEAD(bp_task_head); @@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { + unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -127,6 +130,13 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) return count; } +static const struct cpumask *cpumask_of_bp(struct perf_event *bp) +{ + if (bp->cpu >= 0) + return cpumask_of(bp->cpu); + return cpu_possible_mask; +} + /* * Report the number of pinned/un-pinned breakpoints we have in * a given cpu (cpu > -1) or in all of them (cpu = -1). @@ -135,25 +145,15 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, enum bp_type_idx type) { - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - slots->pinned += max_task_bp_pinned(cpu, type); - else - slots->pinned += task_bp_pinned(cpu, bp, type); - slots->flexible = per_cpu(nr_bp_flexible[type], cpu); - - return; - } + const struct cpumask *cpumask = cpumask_of_bp(bp); + int cpu; - for_each_possible_cpu(cpu) { - unsigned int nr; + for_each_cpu(cpu, cpumask) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + int nr; - nr = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) + nr = info->cpu_pinned; + if (!bp->hw.bp_target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible[type], cpu); - + nr = info->flexible; if (nr > slots->flexible) slots->flexible = nr; } @@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { - unsigned int *tsk_pinned; - int old_count = 0; - int old_idx = 0; - int idx = 0; - - old_count = task_bp_pinned(cpu, bp, type); - old_idx = old_count - 1; - idx = old_idx + weight; - - /* tsk_pinned[n] is the number of tasks having n breakpoints */ - tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - if (enable) { - tsk_pinned[idx]++; - if (old_count > 0) - tsk_pinned[old_idx]--; - } else { - tsk_pinned[idx]--; - if (old_count > 0) - tsk_pinned[old_idx]++; - } + unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; + int old_idx, new_idx; + + old_idx = task_bp_pinned(cpu, bp, type) - 1; + new_idx = old_idx + weight; + + if (old_idx >= 0) + tsk_pinned[old_idx]--; + if (new_idx >= 0) + tsk_pinned[new_idx]++; } /* @@ -214,33 +203,26 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; + const struct cpumask *cpumask = cpumask_of_bp(bp); + int cpu; - /* Pinned counter cpu profiling */ - if (!tsk) { + if (!enable) + weight = -weight; - if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + /* Pinned counter cpu profiling */ + if (!bp->hw.bp_target) { + get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } /* Pinned counter task profiling */ - - if (!enable) - list_del(&bp->hw.bp_list); - - if (cpu >= 0) { - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } else { - for_each_possible_cpu(cpu) - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } + for_each_cpu(cpu, cpumask) + toggle_bp_task_slot(bp, cpu, type, weight); if (enable) list_add_tail(&bp->hw.bp_list, &bp_task_head); + else + list_del(&bp->hw.bp_list); } /* @@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to a single cpu, check: * - * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu) + * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. @@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to every cpus, check: * - * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *)) + * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks @@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to a single cpu, check: * - * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu) + * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM * - * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * -> Same checks as before. But now the info->flexible, if any, must keep * one register at least (or they will never be fed). * * - If attached to every cpus, check: * - * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) + * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ static int __reserve_bp_slot(struct perf_event *bp) { @@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context) { - struct perf_event * __percpu *cpu_events, **pevent, *bp; - long err; + struct perf_event * __percpu *cpu_events, *bp; + long err = 0; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); @@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered, context); - - *pevent = bp; - if (IS_ERR(bp)) { err = PTR_ERR(bp); - goto fail; + break; } - } - put_online_cpus(); - return cpu_events; - -fail: - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - if (IS_ERR(*pevent)) - break; - unregister_hw_breakpoint(*pevent); + per_cpu(*cpu_events, cpu) = bp; } put_online_cpus(); - free_percpu(cpu_events); + if (likely(!err)) + return cpu_events; + + unregister_wide_hw_breakpoint(cpu_events); return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; - struct perf_event **pevent; - for_each_possible_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - unregister_hw_breakpoint(*pevent); - } + for_each_possible_cpu(cpu) + unregister_hw_breakpoint(per_cpu(*cpu_events, cpu)); + free_percpu(cpu_events); } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); @@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags) if (!(flags & PERF_EF_START)) bp->hw.state = PERF_HES_STOPPED; + if (is_sampling_event(bp)) { + bp->hw.last_period = bp->hw.sample_period; + perf_swevent_set_period(bp); + } + return arch_install_hw_breakpoint(bp); } @@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - unsigned int **task_bp_pinned; int cpu, err_cpu; int i; @@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void) for_each_possible_cpu(cpu) { for (i = 0; i < TYPE_MAX; i++) { - task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); - *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], - GFP_KERNEL); - if (!*task_bp_pinned) + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), + GFP_KERNEL); + if (!info->tsk_pinned) goto err_alloc; } } @@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); + kfree(get_bp_info(err_cpu, i)->tsk_pinned); if (err_cpu == cpu) break; } diff --git a/kernel/futex.c b/kernel/futex.c index b26dcfc02c9..c3a1a55a521 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,8 @@ #include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/sched/rt.h> +#include <linux/hugetlb.h> +#include <linux/freezer.h> #include <asm/futex.h> @@ -365,7 +367,7 @@ again: } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.inode = page_head->mapping->host; - key->shared.pgoff = page_head->index; + key->shared.pgoff = basepage_index(page); } get_futex_key_refs(key); @@ -1807,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - schedule(); + freezable_schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cbd97ce0b00..a3bb14fbe5c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc) irq_state_clr_masked(desc); } +/** + * irq_disable - Mark interupt disabled + * @desc: irq descriptor which should be disabled + * + * If the chip does not implement the irq_disable callback, we + * use a lazy disable approach. That means we mark the interrupt + * disabled, but leave the hardware unmasked. That's an + * optimization because we avoid the hardware access for the + * common case where no interrupt happens after we marked it + * disabled. If an interrupt happens, then the interrupt flow + * handler masks the line at the hardware level and marks it + * pending. + */ void irq_disable(struct irq_desc *desc) { irq_state_set_disabled(desc); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c89295a8f66..1c39eccc1ea 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -7,6 +7,7 @@ #include <linux/irq.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/irqdomain.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/syscore_ops.h> @@ -16,11 +17,6 @@ static LIST_HEAD(gc_list); static DEFINE_RAW_SPINLOCK(gc_lock); -static inline struct irq_chip_regs *cur_regs(struct irq_data *d) -{ - return &container_of(d->chip, struct irq_chip_type, chip)->regs; -} - /** * irq_gc_noop - NOOP function * @d: irq_data @@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d) void irq_gc_mask_disable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); - gc->mask_cache &= ~mask; + irq_reg_writel(mask, gc->reg_base + ct->regs.disable); + *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } /** - * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register + * irq_gc_mask_set_bit - Mask chip via setting bit in mask register * @d: irq_data * * Chip has a single mask register. Values of this register are cached @@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d) void irq_gc_mask_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - gc->mask_cache |= mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + *ct->mask_cache |= mask; + irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); /** - * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register + * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register * @d: irq_data * * Chip has a single mask register. Values of this register are cached @@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d) void irq_gc_mask_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - gc->mask_cache &= ~mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + *ct->mask_cache &= ~mask; + irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); /** * irq_gc_unmask_enable_reg - Unmask chip via enable register @@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d) void irq_gc_unmask_enable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); - gc->mask_cache |= mask; + irq_reg_writel(mask, gc->reg_base + ct->regs.enable); + *ct->mask_cache |= mask; irq_gc_unlock(gc); } @@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); /** * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit @@ -122,10 +126,11 @@ void irq_gc_ack_set_bit(struct irq_data *d) void irq_gc_ack_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = ~(1 << (d->irq - gc->irq_base)); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = ~d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -136,11 +141,12 @@ void irq_gc_ack_clr_bit(struct irq_data *d) void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.mask); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) void irq_gc_eoi(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); + irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); irq_gc_unlock(gc); } /** * irq_gc_set_wake - Set/clr wake bit for an interrupt - * @d: irq_data + * @d: irq_data + * @on: Indicates whether the wake bit should be set or cleared * * For chips where the wake from suspend functionality is not * configured in a separate register and the wakeup active state is @@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d) int irq_gc_set_wake(struct irq_data *d, unsigned int on) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; if (!(mask & gc->wake_enabled)) return -EINVAL; @@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) return 0; } +static void +irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) +{ + raw_spin_lock_init(&gc->lock); + gc->num_ct = num_ct; + gc->irq_base = irq_base; + gc->reg_base = reg_base; + gc->chip_types->chip.name = name; + gc->chip_types->handler = handler; +} + /** * irq_alloc_generic_chip - Allocate a generic chip and initialize it * @name: Name of the irq chip @@ -203,23 +224,185 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, gc = kzalloc(sz, GFP_KERNEL); if (gc) { - raw_spin_lock_init(&gc->lock); - gc->num_ct = num_ct; - gc->irq_base = irq_base; - gc->reg_base = reg_base; - gc->chip_types->chip.name = name; - gc->chip_types->handler = handler; + irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, + handler); } return gc; } EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); +static void +irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) +{ + struct irq_chip_type *ct = gc->chip_types; + u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; + int i; + + for (i = 0; i < gc->num_ct; i++) { + if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { + mskptr = &ct[i].mask_cache_priv; + mskreg = ct[i].regs.mask; + } + ct[i].mask_cache = mskptr; + if (flags & IRQ_GC_INIT_MASK_CACHE) + *mskptr = irq_reg_readl(gc->reg_base + mskreg); + } +} + +/** + * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * @d: irq domain for which to allocate chips + * @irqs_per_chip: Number of interrupts each chip handles + * @num_ct: Number of irq_chip_type instances associated with this + * @name: Name of the irq chip + * @handler: Default flow handler associated with these chips + * @clr: IRQ_* bits to clear in the mapping function + * @set: IRQ_* bits to set in the mapping function + * @gcflags: Generic chip specific setup flags + */ +int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags gcflags) +{ + struct irq_domain_chip_generic *dgc; + struct irq_chip_generic *gc; + int numchips, sz, i; + unsigned long flags; + void *tmp; + + if (d->gc) + return -EBUSY; + + if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) + return -EINVAL; + + numchips = d->revmap_data.linear.size / irqs_per_chip; + if (!numchips) + return -EINVAL; + + /* Allocate a pointer, generic chip and chiptypes for each chip */ + sz = sizeof(*dgc) + numchips * sizeof(gc); + sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type)); + + tmp = dgc = kzalloc(sz, GFP_KERNEL); + if (!dgc) + return -ENOMEM; + dgc->irqs_per_chip = irqs_per_chip; + dgc->num_chips = numchips; + dgc->irq_flags_to_set = set; + dgc->irq_flags_to_clear = clr; + dgc->gc_flags = gcflags; + d->gc = dgc; + + /* Calc pointer to the first generic chip */ + tmp += sizeof(*dgc) + numchips * sizeof(gc); + for (i = 0; i < numchips; i++) { + /* Store the pointer to the generic chip */ + dgc->gc[i] = gc = tmp; + irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, + NULL, handler); + gc->domain = d; + raw_spin_lock_irqsave(&gc_lock, flags); + list_add_tail(&gc->list, &gc_list); + raw_spin_unlock_irqrestore(&gc_lock, flags); + /* Calc pointer to the next generic chip */ + tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + } + return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); + +/** + * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq + * @d: irq domain pointer + * @hw_irq: Hardware interrupt number + */ +struct irq_chip_generic * +irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ + struct irq_domain_chip_generic *dgc = d->gc; + int idx; + + if (!dgc) + return NULL; + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return NULL; + return dgc->gc[idx]; +} +EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); + /* * Separate lockdep class for interrupt chip which can nest irq_desc * lock. */ static struct lock_class_key irq_nested_lock_class; +/* + * irq_map_generic_chip - Map a generic chip for an irq domain + */ +static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) +{ + struct irq_data *data = irq_get_irq_data(virq); + struct irq_domain_chip_generic *dgc = d->gc; + struct irq_chip_generic *gc; + struct irq_chip_type *ct; + struct irq_chip *chip; + unsigned long flags; + int idx; + + if (!d->gc) + return -ENODEV; + + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return -EINVAL; + gc = dgc->gc[idx]; + + idx = hw_irq % dgc->irqs_per_chip; + + if (test_bit(idx, &gc->unused)) + return -ENOTSUPP; + + if (test_bit(idx, &gc->installed)) + return -EBUSY; + + ct = gc->chip_types; + chip = &ct->chip; + + /* We only init the cache for the first mapping of a generic chip */ + if (!gc->installed) { + raw_spin_lock_irqsave(&gc->lock, flags); + irq_gc_init_mask_cache(gc, dgc->gc_flags); + raw_spin_unlock_irqrestore(&gc->lock, flags); + } + + /* Mark the interrupt as installed */ + set_bit(idx, &gc->installed); + + if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) + irq_set_lockdep_class(virq, &irq_nested_lock_class); + + if (chip->irq_calc_mask) + chip->irq_calc_mask(data); + else + data->mask = 1 << idx; + + irq_set_chip_and_handler(virq, chip, ct->handler); + irq_set_chip_data(virq, gc); + irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); + return 0; +} + +struct irq_domain_ops irq_generic_chip_ops = { + .map = irq_map_generic_chip, + .xlate = irq_domain_xlate_onetwocell, +}; +EXPORT_SYMBOL_GPL(irq_generic_chip_ops); + /** * irq_setup_generic_chip - Setup a range of interrupts with a generic chip * @gc: Generic irq chip holding all data @@ -237,15 +420,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int set) { struct irq_chip_type *ct = gc->chip_types; + struct irq_chip *chip = &ct->chip; unsigned int i; raw_spin_lock(&gc_lock); list_add_tail(&gc->list, &gc_list); raw_spin_unlock(&gc_lock); - /* Init mask cache ? */ - if (flags & IRQ_GC_INIT_MASK_CACHE) - gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); + irq_gc_init_mask_cache(gc, flags); for (i = gc->irq_base; msk; msk >>= 1, i++) { if (!(msk & 0x01)) @@ -254,7 +436,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, if (flags & IRQ_GC_INIT_NESTED_LOCK) irq_set_lockdep_class(i, &irq_nested_lock_class); - irq_set_chip_and_handler(i, &ct->chip, ct->handler); + if (!(flags & IRQ_GC_NO_MASK)) { + struct irq_data *d = irq_get_irq_data(i); + + if (chip->irq_calc_mask) + chip->irq_calc_mask(d); + else + d->mask = 1 << (i - gc->irq_base); + } + irq_set_chip_and_handler(i, chip, ct->handler); irq_set_chip_data(i, gc); irq_modify_status(i, clr, set); } @@ -265,7 +455,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip); /** * irq_setup_alt_chip - Switch to alternative chip * @d: irq_data for this interrupt - * @type Flow type to be initialized + * @type: Flow type to be initialized * * Only to be called from chip->irq_set_type() callbacks. */ @@ -317,6 +507,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, } EXPORT_SYMBOL_GPL(irq_remove_generic_chip); +static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) +{ + unsigned int virq; + + if (!gc->domain) + return irq_get_irq_data(gc->irq_base); + + /* + * We don't know which of the irqs has been actually + * installed. Use the first one. + */ + if (!gc->installed) + return NULL; + + virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed)); + return virq ? irq_get_irq_data(virq) : NULL; +} + #ifdef CONFIG_PM static int irq_gc_suspend(void) { @@ -325,8 +533,12 @@ static int irq_gc_suspend(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_suspend) - ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_suspend) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_suspend(data); + } } return 0; } @@ -338,8 +550,12 @@ static void irq_gc_resume(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_resume) - ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_resume) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_resume(data); + } } } #else @@ -354,8 +570,12 @@ static void irq_gc_shutdown(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_pm_shutdown) - ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_pm_shutdown) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_pm_shutdown(data); + } } } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 54a4d522323..1ed8dff17eb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -16,12 +16,6 @@ #include <linux/smp.h> #include <linux/fs.h> -#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. - * ie. legacy 8259, gets irqs 1..15 */ -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ - static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); @@ -698,7 +692,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, /* Set type if specified and different than the current one */ if (type != IRQ_TYPE_NONE && - type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) + type != irq_get_trigger_type(virq)) irq_set_irq_type(virq, type); return virq; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa17855ca65..514bcfd855a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return 0; if (irq_settings_can_request(desc)) { - if (desc->action) - if (irqflags & desc->action->flags & IRQF_SHARED) - canrequest =1; + if (!desc->action || + irqflags & desc->action->flags & IRQF_SHARED) + canrequest = 1; } irq_put_desc_unlock(desc, flags); return canrequest; @@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused) static int irq_thread(void *data) { struct callback_head on_exit_work; - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, @@ -854,8 +851,6 @@ static int irq_thread(void *data) else handler_fn = irq_thread_fn; - sched_setscheduler(current, SCHED_FIFO, ¶m); - init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); @@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (new->thread_fn && !nested) { struct task_struct *t; + static const struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); @@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = PTR_ERR(t); goto out_mput; } + + sched_setscheduler(t, SCHED_FIFO, ¶m); + /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code diff --git a/kernel/mutex.c b/kernel/mutex.c index ad53a664f11..e581ada5faf 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -254,16 +254,165 @@ void __sched mutex_unlock(struct mutex *lock) EXPORT_SYMBOL(mutex_unlock); +/** + * ww_mutex_unlock - release the w/w mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously with any of the + * ww_mutex_lock* functions (with or without an acquire context). It is + * forbidden to release the locks after releasing the acquire context. + * + * This function must not be used in interrupt context. Unlocking + * of a unlocked mutex is not allowed. + */ +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ + if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } + +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(&lock->base); +#endif + __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); +} +EXPORT_SYMBOL(ww_mutex_unlock); + +static inline int __sched +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + + if (!hold_ctx) + return 0; + + if (unlikely(ctx == hold_ctx)) + return -EALREADY; + + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + +/* + * after acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. + * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + unsigned long flags; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + + lock->ctx = ctx; + + /* + * The lock->ctx update should be visible on all cores before + * the atomic read is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* ^^^ */ + + /* + * Check if lock is contended, if not there is nobody to wake up + */ + if (likely(atomic_read(&lock->base.count) == 0)) + return; + + /* + * Uh oh, we raced in fastpath, wake up everyone in this case, + * so they can see the new lock->ctx. + */ + spin_lock_mutex(&lock->base.wait_lock, flags); + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } + spin_unlock_mutex(&lock->base.wait_lock, flags); +} + /* * Lock a mutex (possibly interruptible), slowpath: */ -static inline int __sched +static __always_inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip, + struct ww_acquire_ctx *ww_ctx) { struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + int ret; preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); @@ -298,6 +447,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *owner; struct mspin_node node; + if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + */ + if (ACCESS_ONCE(ww->ctx)) + break; + } + /* * If there's an owner, wait for it to either * release the lock or go to sleep. @@ -312,6 +477,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); + if (!__builtin_constant_p(ww_ctx == NULL)) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } + mutex_set_owner(lock); mspin_unlock(MLOCK(lock), &node); preempt_enable(); @@ -371,15 +543,16 @@ slowpath: * TASK_UNINTERRUPTIBLE case.) */ if (unlikely(signal_pending_state(state, task))) { - mutex_remove_waiter(lock, &waiter, - task_thread_info(task)); - mutex_release(&lock->dep_map, 1, ip); - spin_unlock_mutex(&lock->wait_lock, flags); + ret = -EINTR; + goto err; + } - debug_mutex_free_waiter(&waiter); - preempt_enable(); - return -EINTR; + if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + ret = __mutex_lock_check_stamp(lock, ww_ctx); + if (ret) + goto err; } + __set_task_state(task, state); /* didn't get the lock, go to sleep: */ @@ -394,6 +567,30 @@ done: mutex_remove_waiter(lock, &waiter, current_thread_info()); mutex_set_owner(lock); + if (!__builtin_constant_p(ww_ctx == NULL)) { + struct ww_mutex *ww = container_of(lock, + struct ww_mutex, + base); + struct mutex_waiter *cur; + + /* + * This branch gets optimized out for the common case, + * and is only important for ww_mutex_lock. + */ + + ww_mutex_lock_acquired(ww, ww_ctx); + ww->ctx = ww_ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. + */ + list_for_each_entry(cur, &lock->wait_list, list) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } + } + /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -404,6 +601,14 @@ done: preempt_enable(); return 0; + +err: + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + spin_unlock_mutex(&lock->wait_lock, flags); + debug_mutex_free_waiter(&waiter); + mutex_release(&lock->dep_map, 1, ip); + preempt_enable(); + return ret; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -411,7 +616,8 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -420,7 +626,8 @@ void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + 0, nest, _RET_IP_, NULL); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -429,7 +636,8 @@ int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, + subclass, NULL, _RET_IP_, NULL); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -438,10 +646,68 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_); + subclass, NULL, _RET_IP_, NULL); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +static inline int +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH + unsigned tmp; + + if (ctx->deadlock_inject_countdown-- == 0) { + tmp = ctx->deadlock_inject_interval; + if (tmp > UINT_MAX/4) + tmp = UINT_MAX; + else + tmp = tmp*2 + tmp + tmp/2; + + ctx->deadlock_inject_interval = tmp; + ctx->deadlock_inject_countdown = tmp; + ctx->contending_lock = lock; + + ww_mutex_unlock(lock); + + return -EDEADLK; + } +#endif + + return 0; +} + +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, + 0, &ctx->dep_map, _RET_IP_, ctx); + if (!ret && ctx->acquired > 0) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, + 0, &ctx->dep_map, _RET_IP_, ctx); + + if (!ret && ctx->acquired > 0) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); + #endif /* @@ -494,10 +760,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count) * mutex_lock_interruptible() and mutex_trylock(). */ static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count); +__mutex_lock_killable_slowpath(struct mutex *lock); static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count); +__mutex_lock_interruptible_slowpath(struct mutex *lock); /** * mutex_lock_interruptible - acquire the mutex, interruptible @@ -515,12 +781,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock) int ret; might_sleep(); - ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_interruptible_slowpath); - if (!ret) + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { mutex_set_owner(lock); - - return ret; + return 0; + } else + return __mutex_lock_interruptible_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_interruptible); @@ -530,12 +796,12 @@ int __sched mutex_lock_killable(struct mutex *lock) int ret; might_sleep(); - ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_killable_slowpath); - if (!ret) + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { mutex_set_owner(lock); - - return ret; + return 0; + } else + return __mutex_lock_killable_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_killable); @@ -544,24 +810,39 @@ __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, + NULL, _RET_IP_, NULL); } static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count) +__mutex_lock_killable_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); + return __mutex_lock_common(lock, TASK_KILLABLE, 0, + NULL, _RET_IP_, NULL); +} - return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); +static noinline int __sched +__mutex_lock_interruptible_slowpath(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, + NULL, _RET_IP_, NULL); } static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count) +__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - struct mutex *lock = container_of(lock_count, struct mutex, count); + return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, + NULL, _RET_IP_, ctx); +} - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); +static noinline int __sched +__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, + NULL, _RET_IP_, ctx); } + #endif /* @@ -617,6 +898,45 @@ int __sched mutex_trylock(struct mutex *lock) } EXPORT_SYMBOL(mutex_trylock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + ret = __mutex_fastpath_lock_retval(&lock->base.count); + + if (likely(!ret)) { + ww_mutex_set_context_fastpath(lock, ctx); + mutex_set_owner(&lock->base); + } else + ret = __ww_mutex_lock_slowpath(lock, ctx); + return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + ret = __mutex_fastpath_lock_retval(&lock->base.count); + + if (likely(!ret)) { + ww_mutex_set_context_fastpath(lock, ctx); + mutex_set_owner(&lock->base); + } else + ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); + return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock_interruptible); + +#endif + /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 * @cnt: the atomic which we are to dec diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180..9c39de095ba 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,7 +100,6 @@ config PM_SLEEP_SMP depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP - select HOTPLUG select HOTPLUG_CPU config PM_AUTOSLEEP diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48ab70384a4..cce6ba8bbac 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -104,31 +104,7 @@ void __rcu_read_unlock(void) } EXPORT_SYMBOL_GPL(__rcu_read_unlock); -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (likely(list_empty(¤t->rcu_node_entry))) - return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; - __rcu_read_unlock(); -} - -#else /* #ifdef CONFIG_PREEMPT_RCU */ - -void exit_rcu(void) -{ -} - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -145,9 +121,6 @@ static struct lock_class_key rcu_sched_lock_key; struct lockdep_map rcu_sched_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); EXPORT_SYMBOL_GPL(rcu_sched_lock_map); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC int debug_lockdep_rcu_enabled(void) { diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index a0714a51b6d..aa344111de3 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -44,7 +44,6 @@ /* Forward declarations for rcutiny_plugin.h. */ struct rcu_ctrlblk; -static void invoke_rcu_callbacks(void); static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, @@ -205,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - reset_cpu_stall_ticks(rcp); + RCU_TRACE(reset_cpu_stall_ticks(rcp)); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; @@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); + raise_softirq(RCU_SOFTIRQ); local_irq_restore(flags); } @@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); + raise_softirq(RCU_SOFTIRQ); local_irq_restore(flags); } @@ -252,12 +251,11 @@ void rcu_bh_qs(int cpu) */ void rcu_check_callbacks(int cpu, int user) { - check_cpu_stalls(); + RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); - rcu_preempt_check_callbacks(); } /* @@ -278,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) ACCESS_ONCE(rcp->rcucblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread())); + false)); return; } @@ -290,7 +288,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; - rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); @@ -309,14 +306,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread())); + false)); } static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); } /* @@ -382,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 8a233002fae..0cd385acccf 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -53,958 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC +#include <linux/kernel_stat.h> + int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_RCU_TRACE - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = rcp->jiffies_stall; - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -} - -static void check_cpu_stall_preempt(void); - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ -#ifdef CONFIG_RCU_TRACE - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -#endif /* #ifdef CONFIG_RCU_TRACE */ -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); - RCU_TRACE(check_cpu_stall_preempt()); -} - -#ifdef CONFIG_TINY_PREEMPT_RCU - -#include <linux/delay.h> - -/* Global control variables for preemptible RCU. */ -struct rcu_preempt_ctrlblk { - struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ - struct rcu_head **nexttail; - /* Tasks blocked in a preemptible RCU */ - /* read-side critical section while an */ - /* preemptible-RCU grace period is in */ - /* progress must wait for a later grace */ - /* period. This pointer points to the */ - /* ->next pointer of the last task that */ - /* must wait for a later grace period, or */ - /* to &->rcb.rcucblist if there is no */ - /* such task. */ - struct list_head blkd_tasks; - /* Tasks blocked in RCU read-side critical */ - /* section. Tasks are placed at the head */ - /* of this list and age towards the tail. */ - struct list_head *gp_tasks; - /* Pointer to the first task blocking the */ - /* current grace period, or NULL if there */ - /* is no such task. */ - struct list_head *exp_tasks; - /* Pointer to first task blocking the */ - /* current expedited grace period, or NULL */ - /* if there is no such task. If there */ - /* is no current expedited grace period, */ - /* then there cannot be any such task. */ -#ifdef CONFIG_RCU_BOOST - struct list_head *boost_tasks; - /* Pointer to first task that needs to be */ - /* priority-boosted, or NULL if no priority */ - /* boosting is needed. If there is no */ - /* current or expedited grace period, there */ - /* can be no such task. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - u8 gpnum; /* Current grace period. */ - u8 gpcpu; /* Last grace period blocked by the CPU. */ - u8 completed; /* Last grace period completed. */ - /* If all three are equal, RCU is idle. */ -#ifdef CONFIG_RCU_BOOST - unsigned long boost_time; /* When to start boosting (jiffies) */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#ifdef CONFIG_RCU_TRACE - unsigned long n_grace_periods; -#ifdef CONFIG_RCU_BOOST - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#endif /* #ifdef CONFIG_RCU_TRACE */ -}; - -static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { - .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), - RCU_TRACE(.rcb.name = "rcu_preempt") -}; - -static int rcu_preempted_readers_exp(void); -static void rcu_report_exp_done(void); - -/* - * Return true if the CPU has not yet responded to the current grace period. - */ -static int rcu_cpu_blocking_cur_gp(void) -{ - return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Check for a running RCU reader. Because there is only one CPU, - * there can be but one running RCU reader at a time. ;-) - * - * Returns zero if there are no running readers. Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section. Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section - * - * Returns zero if there are no running readers. Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section. Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section. - */ -static int rcu_preempt_running_reader(void) -{ - return current->rcu_read_lock_nesting; -} - -/* - * Check for preempted RCU readers blocking any grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_any(void) -{ - return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); -} - -/* - * Check for preempted RCU readers blocking the current grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_cgp(void) -{ - return rcu_preempt_ctrlblk.gp_tasks != NULL; -} - -/* - * Return true if another preemptible-RCU grace period is needed. - */ -static int rcu_preempt_needs_another_gp(void) -{ - return *rcu_preempt_ctrlblk.rcb.curtail != NULL; -} - -/* - * Return true if a preemptible-RCU grace period is in progress. - * The caller must disable hardirqs. - */ -static int rcu_preempt_gp_in_progress(void) -{ - return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Advance a ->blkd_tasks-list pointer to the next entry, instead - * returning NULL if at the end of the list. - */ -static struct list_head *rcu_next_node_entry(struct task_struct *t) -{ - struct list_head *np; - - np = t->rcu_node_entry.next; - if (np == &rcu_preempt_ctrlblk.blkd_tasks) - np = NULL; - return np; -} - -#ifdef CONFIG_RCU_TRACE - -#ifdef CONFIG_RCU_BOOST -static void rcu_initiate_boost_trace(void); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Dump additional statistice for TINY_PREEMPT_RCU. - */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ - seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", - rcu_preempt_ctrlblk.rcb.qlen, - rcu_preempt_ctrlblk.n_grace_periods, - rcu_preempt_ctrlblk.gpnum, - rcu_preempt_ctrlblk.gpcpu, - rcu_preempt_ctrlblk.completed, - "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], - "N."[!rcu_preempt_ctrlblk.gp_tasks], - "E."[!rcu_preempt_ctrlblk.exp_tasks]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", - " ", - "B."[!rcu_preempt_ctrlblk.boost_tasks], - rcu_preempt_ctrlblk.n_tasks_boosted, - rcu_preempt_ctrlblk.n_exp_boosts, - rcu_preempt_ctrlblk.n_normal_boosts, - (int)(jiffies & 0xffff), - (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", - " balk", - rcu_preempt_ctrlblk.n_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, - rcu_preempt_ctrlblk.n_balk_boost_tasks, - rcu_preempt_ctrlblk.n_balk_notyet, - rcu_preempt_ctrlblk.n_balk_nos); -#endif /* #ifdef CONFIG_RCU_BOOST */ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -#ifdef CONFIG_RCU_BOOST - -#include "rtmutex_common.h" - -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO - -/* Controls for rcu_kthread() kthread. */ -static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); -static unsigned long have_rcu_kthread_work; - -/* - * Carry out RCU priority boosting on the task indicated by ->boost_tasks, - * and advance ->boost_tasks to the next task in the ->blkd_tasks list. - */ -static int rcu_boost(void) -{ - unsigned long flags; - struct rt_mutex mtx; - struct task_struct *t; - struct list_head *tb; - - if (rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) - return 0; /* Nothing to boost. */ - - local_irq_save(flags); - - /* - * Recheck with irqs disabled: all tasks in need of boosting - * might exit their RCU read-side critical sections on their own - * if we are preempted just before disabling irqs. - */ - if (rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) { - local_irq_restore(flags); - return 0; - } - - /* - * Preferentially boost tasks blocking expedited grace periods. - * This cannot starve the normal grace periods because a second - * expedited grace period must boost all blocked tasks, including - * those blocking the pre-existing normal grace period. - */ - if (rcu_preempt_ctrlblk.exp_tasks != NULL) { - tb = rcu_preempt_ctrlblk.exp_tasks; - RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); - } else { - tb = rcu_preempt_ctrlblk.boost_tasks; - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); - } - RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - - /* - * We boost task t by manufacturing an rt_mutex that appears to - * be held by task t. We leave a pointer to that rt_mutex where - * task t can find it, and task t will release the mutex when it - * exits its outermost RCU read-side critical section. Then - * simply acquiring this artificial rt_mutex will boost task - * t's priority. (Thanks to tglx for suggesting this approach!) - */ - t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; - local_irq_restore(flags); - rt_mutex_lock(&mtx); - rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - - return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || - ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; -} - -/* - * Check to see if it is now time to start boosting RCU readers blocking - * the current grace period, and, if so, tell the rcu_kthread_task to - * start boosting them. If there is an expedited boost in progress, - * we wait for it to complete. - * - * If there are no blocked readers blocking the current grace period, - * return 0 to let the caller know, otherwise return 1. Note that this - * return value is independent of whether or not boosting was done. - */ -static int rcu_initiate_boost(void) -{ - if (!rcu_preempt_blocked_readers_cgp() && - rcu_preempt_ctrlblk.exp_tasks == NULL) { - RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); - return 0; - } - if (rcu_preempt_ctrlblk.exp_tasks != NULL || - (rcu_preempt_ctrlblk.gp_tasks != NULL && - rcu_preempt_ctrlblk.boost_tasks == NULL && - ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { - if (rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_preempt_ctrlblk.boost_tasks = - rcu_preempt_ctrlblk.gp_tasks; - invoke_rcu_callbacks(); - } else { - RCU_TRACE(rcu_initiate_boost_trace()); - } - return 1; -} - -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) - -/* - * Do priority-boost accounting for the start of a new grace period. - */ -static void rcu_preempt_boost_start_gp(void) -{ - rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * If there is no RCU priority boosting, we don't initiate boosting, - * but we do indicate whether there are blocked readers blocking the - * current grace period. - */ -static int rcu_initiate_boost(void) -{ - return rcu_preempt_blocked_readers_cgp(); -} - -/* - * If there is no RCU priority boosting, nothing to do at grace-period start. - */ -static void rcu_preempt_boost_start_gp(void) -{ -} - -#endif /* else #ifdef CONFIG_RCU_BOOST */ - -/* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. - * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. - * - * Because this is a single-CPU implementation, the only way a grace - * period can end is if the CPU is in a quiescent state. The reason is - * that a blocked preemptible-RCU reader can exit its critical section - * only if the CPU is running it at the time. Therefore, when the - * last task blocking the current grace period exits its RCU read-side - * critical section, neither the CPU nor blocked tasks will be stopping - * the current grace period. (In contrast, SMP implementations - * might have CPUs running in RCU read-side critical sections that - * block later grace periods -- but this is not possible given only - * one CPU.) - */ -static void rcu_preempt_cpu_qs(void) -{ - /* Record both CPU and task as having responded to current GP. */ - rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - - /* If there is no GP then there is nothing more to do. */ - if (!rcu_preempt_gp_in_progress()) - return; - /* - * Check up on boosting. If there are readers blocking the - * current grace period, leave. - */ - if (rcu_initiate_boost()) - return; - - /* Advance callbacks. */ - rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; - rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; - rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; - - /* If there are no blocked readers, next GP is done instantly. */ - if (!rcu_preempt_blocked_readers_any()) - rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; - - /* If there are done callbacks, cause them to be invoked. */ - if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - invoke_rcu_callbacks(); -} - -/* - * Start a new RCU grace period if warranted. Hard irqs must be disabled. - */ -static void rcu_preempt_start_gp(void) -{ - if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { - - /* Official start of GP. */ - rcu_preempt_ctrlblk.gpnum++; - RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); - reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); - - /* Any blocked RCU readers block new GP. */ - if (rcu_preempt_blocked_readers_any()) - rcu_preempt_ctrlblk.gp_tasks = - rcu_preempt_ctrlblk.blkd_tasks.next; - - /* Set up for RCU priority boosting. */ - rcu_preempt_boost_start_gp(); - - /* If there is no running reader, CPU is done with GP. */ - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); - } -} - -/* - * We have entered the scheduler, and the current task might soon be - * context-switched away from. If this task is in an RCU read-side - * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the blkd_tasks list. - * If the task started after the current grace period began, as recorded - * by ->gpcpu, we enqueue at the beginning of the list. Otherwise - * before the element referenced by ->gp_tasks (or at the tail if - * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. - * The task will dequeue itself when it exits the outermost enclosing - * RCU read-side critical section. Therefore, the current grace period - * cannot be permitted to complete until the ->gp_tasks pointer becomes - * NULL. - * - * Caller must disable preemption. - */ -void rcu_preempt_note_context_switch(void) -{ - struct task_struct *t = current; - unsigned long flags; - - local_irq_save(flags); /* must exclude scheduler_tick(). */ - if (rcu_preempt_running_reader() > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { - - /* Possibly blocking in an RCU read-side critical section. */ - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; - - /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. - */ - list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); - if (rcu_cpu_blocking_cur_gp()) - rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; - } else if (rcu_preempt_running_reader() < 0 && - t->rcu_read_unlock_special) { - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). - */ - rcu_read_unlock_special(t); - } - - /* - * Either we were not in an RCU read-side critical section to - * begin with, or we have now recorded that critical section - * globally. Either way, we can now note a quiescent state - * for this CPU. Again, if we were in an RCU read-side critical - * section, and if that critical section was blocking the current - * grace period, then the fact that the task has been enqueued - * means that current grace period continues to be blocked. - */ - rcu_preempt_cpu_qs(); - local_irq_restore(flags); -} - -/* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. - */ -void rcu_read_unlock_special(struct task_struct *t) -{ - int empty; - int empty_exp; - unsigned long flags; - struct list_head *np; -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ - int special; - - /* - * NMI handlers cannot block and cannot safely manipulate state. - * They therefore cannot possibly be special, so just leave. - */ - if (in_nmi()) - return; - - local_irq_save(flags); - - /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. - */ - special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) - rcu_preempt_cpu_qs(); - - /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { - local_irq_restore(flags); - return; - } - - /* Clean up if blocked during RCU read-side critical section. */ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; - - /* - * Remove this task from the ->blkd_tasks list and adjust - * any pointers that might have been referencing it. - */ - empty = !rcu_preempt_blocked_readers_cgp(); - empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; - np = rcu_next_node_entry(t); - list_del_init(&t->rcu_node_entry); - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) - rcu_preempt_ctrlblk.gp_tasks = np; - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) - rcu_preempt_ctrlblk.exp_tasks = np; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) - rcu_preempt_ctrlblk.boost_tasks = np; -#endif /* #ifdef CONFIG_RCU_BOOST */ - - /* - * If this was the last task on the current list, and if - * we aren't waiting on the CPU, report the quiescent state - * and start a new grace period if needed. - */ - if (!empty && !rcu_preempt_blocked_readers_cgp()) { - rcu_preempt_cpu_qs(); - rcu_preempt_start_gp(); - } - - /* - * If this was the last task on the expedited lists, - * then we need wake up the waiting task. - */ - if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_report_exp_done(); - } -#ifdef CONFIG_RCU_BOOST - /* Unboost self if was boosted. */ - if (t->rcu_boost_mutex != NULL) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - rt_mutex_unlock(rbmp); - } -#endif /* #ifdef CONFIG_RCU_BOOST */ - local_irq_restore(flags); -} - -/* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the rcu_preempt_ctrlblk structure, which is - * checked elsewhere. This is called from the scheduling-clock interrupt. - * - * Caller must disable hard irqs. - */ -static void rcu_preempt_check_callbacks(void) -{ - struct task_struct *t = current; - - if (rcu_preempt_gp_in_progress() && - (!rcu_preempt_running_reader() || - !rcu_cpu_blocking_cur_gp())) - rcu_preempt_cpu_qs(); - if (&rcu_preempt_ctrlblk.rcb.rcucblist != - rcu_preempt_ctrlblk.rcb.donetail) - invoke_rcu_callbacks(); - if (rcu_preempt_gp_in_progress() && - rcu_cpu_blocking_cur_gp() && - rcu_preempt_running_reader() > 0) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; -} - -/* - * TINY_PREEMPT_RCU has an extra callback-list tail pointer to - * update, so this is invoked from rcu_process_callbacks() to - * handle that case. Of course, it is invoked for all flavors of - * RCU, but RCU callbacks can appear only on one of the lists, and - * neither ->nexttail nor ->donetail can possibly be NULL, so there - * is no need for an explicit check. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ - if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) - rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; -} - -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); -} - -/* - * Queue a preemptible -RCU callback for invocation after a grace period. - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - local_irq_save(flags); - *rcu_preempt_ctrlblk.nexttail = head; - rcu_preempt_ctrlblk.nexttail = &head->next; - RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); - rcu_preempt_start_gp(); /* checks to see if GP needed. */ - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void) -{ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - if (!rcu_scheduler_active) - return; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - - WARN_ON_ONCE(rcu_preempt_running_reader()); - if (!rcu_preempt_blocked_readers_any()) - return; - - /* Once we get past the fastpath checks, same code as rcu_barrier(). */ - if (rcu_expedited) - synchronize_rcu_expedited(); - else - rcu_barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(void) -{ - return rcu_preempt_ctrlblk.exp_tasks != NULL; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. - */ -static void rcu_report_exp_done(void) -{ - wake_up(&sync_rcu_preempt_exp_wq); -} - -/* - * Wait for an rcu-preempt grace period, but expedite it. The basic idea - * is to rely in the fact that there is but one CPU, and that it is - * illegal for a task to invoke synchronize_rcu_expedited() while in a - * preemptible-RCU read-side critical section. Therefore, any such - * critical sections must correspond to blocked tasks, which must therefore - * be on the ->blkd_tasks list. So just record the current head of the - * list in the ->exp_tasks pointer, and wait for all tasks including and - * after the task pointed to by ->exp_tasks to drain. - */ -void synchronize_rcu_expedited(void) -{ - unsigned long flags; - struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; - unsigned long snap; - - barrier(); /* ensure prior action seen before grace period. */ - - WARN_ON_ONCE(rcu_preempt_running_reader()); - - /* - * Acquire lock so that there is only one preemptible RCU grace - * period in flight. Of course, if someone does the expedited - * grace period for us while we are acquiring the lock, just leave. - */ - snap = sync_rcu_preempt_exp_count + 1; - mutex_lock(&sync_rcu_preempt_exp_mutex); - if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) - goto unlock_mb_ret; /* Others did our work for us. */ - - local_irq_save(flags); - - /* - * All RCU readers have to already be on blkd_tasks because - * we cannot legally be executing in an RCU read-side critical - * section. - */ - - /* Snapshot current head of ->blkd_tasks list. */ - rpcp->exp_tasks = rpcp->blkd_tasks.next; - if (rpcp->exp_tasks == &rpcp->blkd_tasks) - rpcp->exp_tasks = NULL; - - /* Wait for tail of ->blkd_tasks list to drain. */ - if (!rcu_preempted_readers_exp()) { - local_irq_restore(flags); - } else { - rcu_initiate_boost(); - local_irq_restore(flags); - wait_event(sync_rcu_preempt_exp_wq, - !rcu_preempted_readers_exp()); - } - - /* Clean up and exit. */ - barrier(); /* ensure expedited GP seen before counter increment. */ - sync_rcu_preempt_exp_count++; -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); - barrier(); /* ensure subsequent action seen after grace period. */ -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* - * Does preemptible RCU need the CPU to stay out of dynticks mode? - */ -int rcu_preempt_needs_cpu(void) -{ - return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; -} - -#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_TRACE - -/* - * Because preemptible RCU does not exist, it is not necessary to - * dump out its statistics. - */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to check. - */ -static void rcu_preempt_check_callbacks(void) -{ -} - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to remove. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ -} - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - -#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_BOOST - -/* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. - */ -static void invoke_rcu_callbacks(void) -{ - have_rcu_kthread_work = 1; - if (rcu_kthread_task != NULL) - wake_up(&rcu_kthread_wq); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return rcu_kthread_task == current; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * This kthread invokes RCU callbacks whose grace periods have - * elapsed. It is awakened as needed, and takes the place of the - * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. - * This is a kthread, but it is never stopped, at least not until - * the system goes down. - */ -static int rcu_kthread(void *arg) -{ - unsigned long work; - unsigned long morework; - unsigned long flags; - - for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); - morework = rcu_boost(); - local_irq_save(flags); - work = have_rcu_kthread_work; - have_rcu_kthread_work = morework; - local_irq_restore(flags); - if (work) - rcu_process_callbacks(NULL); - schedule_timeout_interruptible(1); /* Leave CPU for others. */ - } - - return 0; /* Not reached, but needed to shut gcc up. */ -} - -/* - * Spawn the kthread that invokes RCU callbacks. - */ -static int __init rcu_spawn_kthreads(void) -{ - struct sched_param sp; - - rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); - sp.sched_priority = RCU_BOOST_PRIO; - sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); - return 0; -} -early_initcall(rcu_spawn_kthreads); - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* Hold off callback invocation until early_initcall() time. */ -static int rcu_scheduler_fully_active __read_mostly; - -/* - * Start up softirq processing of callbacks. - */ -void invoke_rcu_callbacks(void) -{ - if (rcu_scheduler_fully_active) - raise_softirq(RCU_SOFTIRQ); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * There is no callback kthread, so this thread is never it. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static int __init rcu_scheduler_really_started(void) -{ - rcu_scheduler_fully_active = 1; - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */ - return 0; -} -early_initcall(rcu_scheduler_really_started); - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#include <linux/kernel_stat.h> /* * During boot, we forgive RCU lockdep issues. After this function is @@ -1020,25 +72,6 @@ void __init rcu_scheduler_starting(void) #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_BOOST - -static void rcu_initiate_boost_trace(void) -{ - if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) - rcu_preempt_ctrlblk.n_balk_blkd_tasks++; - else if (rcu_preempt_ctrlblk.gp_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; - else if (rcu_preempt_ctrlblk.boost_tasks != NULL) - rcu_preempt_ctrlblk.n_balk_boost_tasks++; - else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) - rcu_preempt_ctrlblk.n_balk_notyet++; - else - rcu_preempt_ctrlblk.n_balk_nos++; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) { unsigned long flags; @@ -1053,7 +86,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) */ static int show_tiny_stats(struct seq_file *m, void *unused) { - show_tiny_preempt_stats(m); seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); return 0; @@ -1103,11 +135,40 @@ MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); -static void check_cpu_stall_preempt(void) +static void check_cpu_stall(struct rcu_ctrlblk *rcp) { -#ifdef CONFIG_TINY_PREEMPT_RCU - check_cpu_stall(&rcu_preempt_ctrlblk.rcb); -#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); } #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e1f3a8c9672..b1fa5510388 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = { .name = "srcu_sync" }; -static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock_raw(&srcu_ctl); -} - -static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock_raw(&srcu_ctl, idx); -} - -static struct rcu_torture_ops srcu_raw_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = srcu_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw" -}; - -static struct rcu_torture_ops srcu_raw_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw_sync" -}; - static void srcu_torture_synchronize_expedited(void) { synchronize_srcu_expedited(&srcu_ctl); @@ -1983,7 +1945,6 @@ rcu_torture_init(void) { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, - &srcu_raw_ops, &srcu_raw_sync_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 35380019f0f..cf3adc6fe00 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -218,8 +218,8 @@ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); -static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; -static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); @@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", + pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { @@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) smp_processor_id(), (long)(jiffies - rsp->gp_start), rsp->gpnum, rsp->completed, totqlen); if (ndetected == 0) - printk(KERN_ERR "INFO: Stall ended before state dump start\n"); + pr_err("INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) rcu_dump_cpu_stacks(rsp); @@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); + pr_err("INFO: %s self-detected stall on CPU", rsp->name); print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); @@ -985,65 +985,6 @@ void rcu_cpu_stall_reset(void) } /* - * Update CPU-local rcu_data state to record the newly noticed grace period. - * This is used both when we started the grace period and when we notice - * that someone else started the grace period. The caller must hold the - * ->lock of the leaf rcu_node structure corresponding to the current CPU, - * and must have irqs disabled. - */ -static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - if (rdp->gpnum != rnp->gpnum) { - /* - * If the current grace period is waiting for this CPU, - * set up to detect a quiescent state, otherwise don't - * go looking for one. - */ - rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - rdp->passed_quiesce = 0; - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); - zero_cpu_stall_ticks(rdp); - } -} - -static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __note_new_gpnum(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Did someone else start a new RCU grace period start since we last - * checked? Update local state appropriately if so. Must be called - * on the CPU corresponding to rdp. - */ -static int -check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - if (rdp->gpnum != rsp->gpnum) { - note_new_gpnum(rsp, rdp); - ret = 1; - } - local_irq_restore(flags); - return ret; -} - -/* * Initialize the specified rcu_data structure's callback list to empty. */ static void init_callback_list(struct rcu_data *rdp) @@ -1313,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. In addition, the corresponding leaf rcu_node structure's - * ->lock must be held by the caller, with irqs disabled. + * Update CPU-local rcu_data state to record the beginnings and ends of + * grace periods. The caller must hold the ->lock of the leaf rcu_node + * structure corresponding to the current CPU, and must have irqs disabled. */ -static void -__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - /* Did another grace period end? */ + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed) { - /* No, so just accelerate recent callbacks. */ + /* No grace period end, so just accelerate recent callbacks. */ rcu_accelerate_cbs(rsp, rnp, rdp); } else { @@ -1335,68 +1274,40 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); + } + if (rdp->gpnum != rnp->gpnum) { /* - * If we were in an extended quiescent state, we may have - * missed some grace periods that others CPUs handled on - * our behalf. Catch up with this state to avoid noting - * spurious new grace periods. If another grace period - * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. Of course, any quiescent - * states we found for the old GP are now invalid. - */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { - rdp->gpnum = rdp->completed; - rdp->passed_quiesce = 0; - } - - /* - * If RCU does not need a quiescent state from this CPU, - * then make sure that this CPU doesn't go looking for one. + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. */ - if ((rnp->qsmask & rdp->grpmask) == 0) - rdp->qs_pending = 0; + rdp->gpnum = rnp->gpnum; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + zero_cpu_stall_ticks(rdp); } } -/* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_node *rnp; local_irq_save(flags); rnp = rdp->mynode; - if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ + if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && + rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - __rcu_process_gp_end(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* - * Do per-CPU grace-period initialization for running CPU. The caller - * must hold the lock of the leaf rcu_node structure corresponding to - * this CPU. - */ -static void -rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Prior grace period ended, so advance callbacks for current CPU. */ - __rcu_process_gp_end(rsp, rnp, rdp); - - /* Set state so that this CPU will detect the next quiescent state. */ - __note_new_gpnum(rsp, rnp, rdp); -} - -/* * Initialize a new grace period. */ static int rcu_gp_init(struct rcu_state *rsp) @@ -1444,7 +1355,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WARN_ON_ONCE(rnp->completed != rsp->completed); ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, @@ -1527,7 +1438,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) - __rcu_process_gp_end(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); @@ -1805,9 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) { - /* If there is now a new grace period, record and return. */ - if (check_for_new_grace_period(rsp, rdp)) - return; + /* Check for grace-period ends and beginnings. */ + note_gp_changes(rsp, rdp); /* * Does this CPU still need to do its part for current grace period? @@ -2271,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* Handle the end of a grace period that some other CPU ended. */ - rcu_process_gp_end(rsp, rdp); - /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -2358,8 +2265,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); + note_gp_changes(rsp, rdp); /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { @@ -3265,11 +3171,25 @@ static void __init rcu_init_one(struct rcu_state *rsp, */ static void __init rcu_init_geometry(void) { + ulong d; int i; int j; int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; + /* + * Initialize any unspecified boot parameters. + * The default values of jiffies_till_first_fqs and + * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS + * value, which is a function of HZ, then adding one for each + * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. + */ + d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; + if (jiffies_till_first_fqs == ULONG_MAX) + jiffies_till_first_fqs = d; + if (jiffies_till_next_fqs == ULONG_MAX) + jiffies_till_next_fqs = d; + /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4df503470e4..4a39d364493 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -343,12 +343,17 @@ struct rcu_data { #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) + /* For jiffies_till_first_fqs and */ + /* and jiffies_till_next_fqs. */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ +#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */ + /* delay between bouts of */ + /* quiescent-state forcing. */ + +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */ + /* at least one scheduling clock */ + /* irq before ratting on them. */ #define rcu_wait(cond) \ do { \ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3db5a375d8d..63098a59216 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5]; static void __init rcu_bootup_announce_oddness(void) { #ifdef CONFIG_RCU_TRACE - printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU debugfs-based tracing is enabled.\n"); #endif #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) - printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", CONFIG_RCU_FANOUT); #endif #ifdef CONFIG_RCU_FANOUT_EXACT - printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); + pr_info("\tHierarchical RCU autobalancing is disabled.\n"); #endif #ifdef CONFIG_RCU_FAST_NO_HZ - printk(KERN_INFO - "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); + pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); #endif #ifdef CONFIG_PROVE_RCU - printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); + pr_info("\tRCU lockdep checking is enabled.\n"); #endif #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE - printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); + pr_info("\tRCU torture testing starts during boot.\n"); #endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); + pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); #endif #if defined(CONFIG_RCU_CPU_STALL_INFO) - printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); + pr_info("\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); + pr_info("\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) - printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU #ifndef CONFIG_RCU_NOCB_CPU_NONE if (!have_rcu_nocb_mask) { @@ -92,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void) have_rcu_nocb_mask = true; } #ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tExperimental no-CBs CPU 0\n"); + pr_info("\tOffload RCU callbacks from CPU 0\n"); cpumask_set_cpu(0, rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ #ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tExperimental no-CBs for all CPUs\n"); + pr_info("\tOffload RCU callbacks from all CPUs\n"); cpumask_setall(rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); + pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) - pr_info("\tExperimental polled no-CBs CPUs.\n"); + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); } #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ } @@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); + pr_info("Preemptible hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) static void rcu_print_task_stall_begin(struct rcu_node *rnp) { - printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", rnp->level, rnp->grplo, rnp->grphi); } static void rcu_print_task_stall_end(void) { - printk(KERN_CONT "\n"); + pr_cont("\n"); } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ @@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - printk(KERN_CONT " P%d", t->pid); + pr_cont(" P%d", t->pid); ndetected++; } rcu_print_task_stall_end(); @@ -933,6 +932,24 @@ static void __init __rcu_init_preempt(void) rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (likely(list_empty(¤t->rcu_node_entry))) + return; + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + __rcu_read_unlock(); +} + #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static struct rcu_state *rcu_state = &rcu_sched_state; @@ -942,7 +959,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state; */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Hierarchical RCU implementation.\n"); + pr_info("Hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -1101,6 +1118,14 @@ static void __init __rcu_init_preempt(void) { } +/* + * Because preemptible RCU does not exist, tasks cannot possibly exit + * while in preemptible RCU read-side critical sections. + */ +void exit_rcu(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST @@ -1629,7 +1654,7 @@ static bool rcu_try_advance_all_cbs(void) */ if (rdp->completed != rnp->completed && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) - rcu_process_gp_end(rsp, rdp); + note_gp_changes(rsp, rdp); if (cpu_has_callbacks_ready_to_invoke(rdp)) cbs_ready = true; @@ -1883,7 +1908,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) /* Initiate the stall-info list. */ static void print_cpu_stall_info_begin(void) { - printk(KERN_CONT "\n"); + pr_cont("\n"); } /* @@ -1914,7 +1939,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, @@ -1925,7 +1950,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) /* Terminate the stall-info list. */ static void print_cpu_stall_info_end(void) { - printk(KERN_ERR "\t"); + pr_err("\t"); } /* Zero ->ticks_this_gp for all flavors of RCU. */ @@ -1948,17 +1973,17 @@ static void increment_cpu_stall_ticks(void) static void print_cpu_stall_info_begin(void) { - printk(KERN_CONT " {"); + pr_cont(" {"); } static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) { - printk(KERN_CONT " %d", cpu); + pr_cont(" %d", cpu); } static void print_cpu_stall_info_end(void) { - printk(KERN_CONT "} "); + pr_cont("} "); } static void zero_cpu_stall_ticks(struct rcu_data *rdp) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 1e09308bf2a..0dd6aec1cb6 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -145,6 +145,19 @@ int max_lock_depth = 1024; /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. + * + * @task: the task owning the mutex (owner) for which a chain walk is probably + * needed + * @deadlock_detect: do we have to carry out deadlock detection? + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @orig_waiter: rt_mutex_waiter struct for the task that has just donated + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter + * * Returns 0 or -EDEADLK. */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, diff --git a/kernel/softirq.c b/kernel/softirq.c index 3d6833f125d..ca25e6e704a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_DISABLE_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); @@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt) WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == cnt) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); + trace_softirqs_on(_RET_IP_); sub_preempt_count(cnt); } @@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) void local_bh_enable(void) { - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + _local_bh_enable_ip(_RET_IP_); } EXPORT_SYMBOL(local_bh_enable); @@ -229,8 +228,7 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0f..4ce13c3cedb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -120,7 +120,6 @@ extern int blk_iopoll_enabled; /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; -static int neg_one = -1; #endif static int zero; @@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dowatchdog, - .extra1 = &neg_one, + .extra1 = &zero, .extra2 = &sixty, }, { @@ -1044,6 +1043,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_proc_update_handler, }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_KMEMCHECK { diff --git a/kernel/wait.c b/kernel/wait.c index 6698e0c04ea..ce0daa320a2 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit) return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; } EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + val = q->key.flags; + if (atomic_read(val) == 0) + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq, &q->wait); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wait = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + wait_queue_head_t *wq = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wait, p); + + return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @word: The word being waited on, a kernel virtual address + * @bit: The bit of the word being waited on + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); |