From 33c3d6c61debcc0d295fe65521cfbc45409936c7 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 9 Feb 2010 14:43:59 -0500 Subject: sched: Cleanup pre_schedule_rt Since [commit 9a897c5a: sched: RT-balance, replace hooks with pre/post schedule and wakeup methods] we only call pre_schedule_rt() if prev is an RT task. So the condition rt_task(prev) is always true and the 'unlikely' annotation is simply incorrect. Signed-off-by: Yong Zhang Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79f7e9..1ab66a227b5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1474,7 +1474,7 @@ skip: static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) + if (rq->rt.highest_prio.curr > prev->prio) pull_rt_task(rq); } -- cgit v1.2.3-70-g09d2 From 8e54a2c036d8c47195f094af1628834f4c55844a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 11:28:30 -0500 Subject: sched: Change pick_next_task_rt from unlikely to likely The if (unlikely(!rt_rq->rt_nr_running)) test in pick_next_task_rt() tests if there is another rt task ready to run. If so, then pick it. In most systems, only one RT task runs at a time most of the time. Running the annotated branch profiler on a system doing average work "running firefox, evolution, xchat, distcc builds, etc", it showed the following: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 324344 135104992 99 _pick_next_task_rt sched_rt.c 1064 99% of the time the condition is true. When an RT task schedules out, it is unlikely that another RT task is waiting to run on that same run queue. Simply remove the unlikely() annotation. Acked-by: Gregory Haskins Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1ab66a227b5..c2266c43e99 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1062,7 +1062,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) rt_rq = &rq->rt; - if (unlikely(!rt_rq->rt_nr_running)) + if (!rt_rq->rt_nr_running) return NULL; if (rt_rq_throttled(rt_rq)) -- cgit v1.2.3-70-g09d2 From 63f01241176d7cbc976385aec32f0a209b0bc36a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 14:48:10 -0500 Subject: sched: Remove unlikely() from rt_policy() in sched.c The rt_policy() function has an unlikely() annotation suggesting that the policy it is checking is rarely of RT priority (SCHED_FIFO or SCHED_RR). According to the annotated branch profiler it is incorrect most of the time: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 36667 654674 94 rt_policy sched.c 126 This makes sense because rt_policy() is used by sched_setscheduler() and nice(). Although users may use sys_nice a bit, all RT users use sched_setscheduler() to set their RT priority, including kernel threads. The above numbers were from a normal desktop computer running firefox, evolution and xchat, which was also part of a distcc compile farm. 
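For reference across these three patches, likely() and unlikely() are thin wrappers around gcc's __builtin_expect(); the sketch below approximates their definitions in include/linux/compiler.h when branch profiling is disabled (when CONFIG_PROFILE_ANNOTATED_BRANCHES is enabled, the macros are instead redefined to record the correct/incorrect counts shown in the tables above):

	#define likely(x)	__builtin_expect(!!(x), 1)
	#define unlikely(x)	__builtin_expect(!!(x), 0)

With __builtin_expect() the compiler lays out the expected path as the straight-line fall-through, so a hint that is wrong 94-99% of the time pessimizes the common case; dropping the annotation lets the compiler and the CPU's branch predictor use their defaults.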
Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d09ac..269a0450281 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -123,7 +123,7 @@ static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + if (policy == SCHED_FIFO || policy == SCHED_RR) return 1; return 0; } -- cgit v1.2.3-70-g09d2 From e69c634190dc724ef2d845ace8d783031d3e492e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 17:10:31 -0500 Subject: sched: Remove unlikely() from ttwu_post_activation The unlikely() used in ttwu_post_activation() tests if the rq->idle_stamp is set. But since this is for a wakeup, and wakeups happen when tasks block on IO, and blocking tasks on IO may put the system into idle, this can actually be a common occurrence. Running the annotated branch profiler on an average desktop running firefox, evolution, xchat and distcc, the report shows: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 34884862 146110926 80 ttwu_post_activation sched.c 2309 80% of the time, this unlikely is incorrect. Best not to assume what the result is, and just remove the branch annotation. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 269a0450281..6d24b2e8d82 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,7 +2458,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); - if (unlikely(rq->idle_stamp)) { + if (rq->idle_stamp) { u64 delta = rq->clock - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; -- cgit v1.2.3-70-g09d2 From 9c5a2ba70251ecaab18c7a83e38b3c620223476c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 18:01:44 +0200 Subject: workqueue: separate out drain_workqueue() from destroy_workqueue() There are users which want to drain a workqueue without destroying it. Separate out the drain functionality from destroy_workqueue() into drain_workqueue() and make it accessible to workqueue users. To guarantee forward-progress, only chain queueing is allowed while drain is in progress. If a new work item which isn't chained from the running or pending work items is queued while draining is in progress, WARN_ON_ONCE() is triggered. Signed-off-by: Tejun Heo Cc: James Bottomley --- include/linux/workqueue.h | 3 +- kernel/workqueue.c | 81 +++++++++++++++++++++++++++++++---------------- 2 files changed, 55 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 57b31b3d83b..2be2887c695 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -255,7 +255,7 @@ enum { WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ - WQ_DYING = 1 << 6, /* internal: workqueue is dying */ + WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ WQ_RESCUER = 1 << 7, /* internal: workqueue has rescuer */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? 
*/ @@ -355,6 +355,7 @@ extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern void flush_workqueue(struct workqueue_struct *wq); +extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); extern int schedule_work(struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e3378e8d3a5..25c8afeaeae 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t; * per-CPU workqueues: */ struct workqueue_struct { - unsigned int flags; /* I: WQ_* flags */ + unsigned int flags; /* W: WQ_* flags */ union { struct cpu_workqueue_struct __percpu *pcpu; struct cpu_workqueue_struct *single; @@ -240,6 +240,7 @@ struct workqueue_struct { mayday_mask_t mayday_mask; /* cpus requesting rescue */ struct worker *rescuer; /* I: rescue worker */ + int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP @@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DYING) && + if (unlikely(wq->flags & WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; @@ -2381,6 +2382,54 @@ out_unlock: } EXPORT_SYMBOL_GPL(flush_workqueue); +/** + * drain_workqueue - drain a workqueue + * @wq: workqueue to drain + * + * Wait until the workqueue becomes empty. While draining is in progress, + * only chain queueing is allowed. IOW, only currently pending or running + * work items on @wq can queue further work items on it. @wq is flushed + * repeatedly until it becomes empty. The number of flushing is determined + * by the depth of chaining and should be relatively short. Whine if it + * takes too long. + */ +void drain_workqueue(struct workqueue_struct *wq) +{ + unsigned int flush_cnt = 0; + unsigned int cpu; + + /* + * __queue_work() needs to test whether there are drainers, is much + * hotter than drain_workqueue() and already looks at @wq->flags. + * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + */ + spin_lock(&workqueue_lock); + if (!wq->nr_drainers++) + wq->flags |= WQ_DRAINING; + spin_unlock(&workqueue_lock); +reflush: + flush_workqueue(wq); + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq->nr_active && list_empty(&cwq->delayed_works)) + continue; + + if (++flush_cnt == 10 || + (flush_cnt % 100 == 0 && flush_cnt <= 1000)) + pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", + wq->name, flush_cnt); + goto reflush; + } + + spin_lock(&workqueue_lock); + if (!--wq->nr_drainers) + wq->flags &= ~WQ_DRAINING; + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(drain_workqueue); + static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, bool wait_executing) { @@ -3011,34 +3060,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); */ void destroy_workqueue(struct workqueue_struct *wq) { - unsigned int flush_cnt = 0; unsigned int cpu; - /* - * Mark @wq dying and drain all pending works. Once WQ_DYING is - * set, only chain queueing is allowed. IOW, only currently - * pending or running work items on @wq can queue further work - * items on it. @wq is flushed repeatedly until it becomes empty. 
- * The number of flushing is detemined by the depth of chaining and - * should be relatively short. Whine if it takes too long. - */ - wq->flags |= WQ_DYING; -reflush: - flush_workqueue(wq); - - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - - if (!cwq->nr_active && list_empty(&cwq->delayed_works)) - continue; - - if (++flush_cnt == 10 || - (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - printk(KERN_WARNING "workqueue %s: flush on " - "destruction isn't complete after %u tries\n", - wq->name, flush_cnt); - goto reflush; - } + /* drain it before proceeding with destruction */ + drain_workqueue(wq); /* * wq list is used to freeze wq, remove from list after -- cgit v1.2.3-70-g09d2 From 075e0b00857e166dcc3e39037a1fc5a90acac709 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:40 +0200 Subject: perf: Optimize ctx_sched_out() Oleg noted that ctx_sched_out() disables the PMU even though it might not actually do anything; avoid the needless PMU-disabling. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.665385503@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d863b3c057b..4d9a1f01428 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1760,7 +1760,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_event *event; raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; @@ -1770,6 +1769,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active) goto out; + perf_pmu_disable(ctx->pmu); if (event_type & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); @@ -1779,8 +1779,8 @@ static void ctx_sched_out(struct perf_event_context *ctx, list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } -out: perf_pmu_enable(ctx->pmu); +out: raw_spin_unlock(&ctx->lock); } -- cgit v1.2.3-70-g09d2 From 9137fb28ac74d05eb66d1d8e6778eaa14e6fed43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:41 +0200 Subject: perf: Clean up 'ctx' reference counting Small cleanup to how we refcount in find_get_context(); this also allows us to use put_ctx() to free things instead of using kfree(). 
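To illustrate the get/put idiom this cleanup standardizes on, here is a simplified sketch (assumed shape only; the real put_ctx() also drops the parent-context and task references before the context is freed):

	static void get_ctx(struct perf_event_context *ctx)
	{
		WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
	}

	static void put_ctx(struct perf_event_context *ctx)
	{
		if (atomic_dec_and_test(&ctx->refcount))
			kfree(ctx);	/* simplified: parent/task refs dropped here too */
	}

Routing the error paths through put_ctx() means a context is torn down the same way no matter where allocation or installation fails, instead of pairing a bare kfree() with hand-rolled put_task_struct() calls.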
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.719340481@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d9a1f01428..d665ac4242f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2835,16 +2835,12 @@ retry: unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { + } else { ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; - get_ctx(ctx); - err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -2856,14 +2852,14 @@ retry: else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; else { + get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { - put_task_struct(task); - kfree(ctx); + put_ctx(ctx); if (err == -EAGAIN) goto retry; -- cgit v1.2.3-70-g09d2 From facc43071cc0d4821c176d7d34570714eb348df9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:42 +0200 Subject: perf: Optimize event scheduling locking Currently we only hold one ctx->lock at a time, which results in us flipping back and forth between cpuctx->ctx.lock and task_ctx->lock. Avoid this and gain large atomic regions by holding both locks. We nest the task lock inside the cpu lock, since with task scheduling we might have to change task ctx while holding the cpu ctx lock. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.769881865@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 61 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d665ac4242f..d243af954dc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -200,6 +200,22 @@ __get_cpu_context(struct perf_event_context *ctx) return this_cpu_ptr(ctx->pmu->pmu_cpu_context); } +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + #ifdef CONFIG_CGROUP_PERF /* @@ -340,11 +356,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - perf_pmu_disable(cpuctx->ctx.pmu); - /* * perf_cgroup_events says at least one * context on this CPU has cgroup events. @@ -353,6 +366,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) * events for a context. 
*/ if (cpuctx->ctx.nr_cgroups > 0) { + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); if (mode & PERF_CGROUP_SWOUT) { cpu_ctx_sched_out(cpuctx, EVENT_ALL); @@ -372,9 +387,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode) cpuctx->cgrp = perf_cgroup_from_task(task); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } - - perf_pmu_enable(cpuctx->ctx.pmu); } rcu_read_unlock(); @@ -1759,15 +1774,14 @@ static void ctx_sched_out(struct perf_event_context *ctx, { struct perf_event *event; - raw_spin_lock(&ctx->lock); ctx->is_active = 0; if (likely(!ctx->nr_events)) - goto out; + return; + update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) - goto out; + return; perf_pmu_disable(ctx->pmu); if (event_type & EVENT_PINNED) { @@ -1780,8 +1794,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, group_sched_out(event, cpuctx, ctx); } perf_pmu_enable(ctx->pmu); -out: - raw_spin_unlock(&ctx->lock); } /* @@ -1929,8 +1941,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, rcu_read_unlock(); if (do_switch) { + raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; + raw_spin_unlock(&ctx->lock); } } @@ -2056,10 +2070,9 @@ ctx_sched_in(struct perf_event_context *ctx, { u64 now; - raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) - goto out; + return; now = perf_clock(); ctx->timestamp = now; @@ -2074,9 +2087,6 @@ ctx_sched_in(struct perf_event_context *ctx, /* Then walk through the lower prio flexible groups */ if (event_type & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); - -out: - raw_spin_unlock(&ctx->lock); } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, @@ -2110,6 +2120,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (cpuctx->task_ctx == ctx) return; + perf_ctx_lock(cpuctx, ctx); perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -2124,12 +2135,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, cpuctx->task_ctx = ctx; + perf_pmu_enable(ctx->pmu); + perf_ctx_unlock(cpuctx, ctx); + /* * Since these rotations are per-cpu, we need to ensure the * cpu-context we got scheduled on is actually rotating. */ perf_pmu_rotate_start(ctx->pmu); - perf_pmu_enable(ctx->pmu); } /* @@ -2269,7 +2282,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) u64 interrupts, now; s64 delta; - raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2301,7 +2313,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (delta > 0) perf_adjust_period(event, period, delta); } - raw_spin_unlock(&ctx->lock); } /* @@ -2309,16 +2320,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) */ static void rotate_ctx(struct perf_event_context *ctx) { - raw_spin_lock(&ctx->lock); - /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. 
*/ if (!ctx->rotate_disable) list_rotate_left(&ctx->flexible_groups); - - raw_spin_unlock(&ctx->lock); } /* @@ -2345,6 +2352,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) rotate = 1; } + perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); perf_ctx_adjust_freq(&cpuctx->ctx, interval); if (ctx) @@ -2370,6 +2378,7 @@ done: list_del_init(&cpuctx->rotation_list); perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } void perf_event_task_tick(void) @@ -2424,9 +2433,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * in. */ perf_cgroup_sched_out(current); - task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); + task_ctx_sched_out(ctx, EVENT_ALL); list_for_each_entry(event, &ctx->pinned_groups, group_entry) { ret = event_enable_on_exec(event, ctx); @@ -5982,6 +5991,7 @@ free_dev: } static struct lock_class_key cpuctx_mutex; +static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, char *name, int type) { @@ -6032,6 +6042,7 @@ skip_type: cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; @@ -6776,7 +6787,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * our context. */ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); - task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Take the context lock here so that if find_get_context is @@ -6784,6 +6794,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); + task_ctx_sched_out(child_ctx, EVENT_ALL); child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get -- cgit v1.2.3-70-g09d2 From 04dc2dbbfe1c6f81b996d4dab255da75f9efbb4a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:43 +0200 Subject: perf: Remove task_ctx_sched_in() Make task_ctx_sched_*() imply EVENT_ALL, since anything less will not actually have scheduled the task in/out at all. Since there's no site that schedules all of a task in (due to the interleave with flexible cpuctx) we can remove this function. 
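For context across these perf patches, the event_type_t values are bit flags and EVENT_ALL simply combines the two classes; a sketch consistent with kernel/events/core.c of this era:

	enum event_type_t {
		EVENT_FLEXIBLE	= 0x1,
		EVENT_PINNED	= 0x2,
		EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
	};

So a task_ctx_sched_out() that implies EVENT_ALL always schedules out both the pinned and the flexible groups of the task context.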
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.817893268@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d243af954dc..66b3dd80940 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1979,8 +1979,7 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task); } -static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) +static void task_ctx_sched_out(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); @@ -1990,7 +1989,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; } @@ -2098,19 +2097,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, event_type, task); } -static void task_ctx_sched_in(struct perf_event_context *ctx, - enum event_type_t event_type) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; - - ctx_sched_in(ctx, cpuctx, event_type, NULL); - cpuctx->task_ctx = ctx; -} - static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { @@ -2363,7 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_out(ctx, EVENT_FLEXIBLE); + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx); if (ctx) @@ -2371,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); if (ctx) - task_ctx_sched_in(ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, current); done: if (remove) @@ -2435,7 +2421,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) perf_cgroup_sched_out(current); raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx, EVENT_ALL); + task_ctx_sched_out(ctx); list_for_each_entry(event, &ctx->pinned_groups, group_entry) { ret = event_enable_on_exec(event, ctx); @@ -6794,7 +6780,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx); child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get -- cgit v1.2.3-70-g09d2 From 2c29ef0fef8aaff1f91263fc75c749d659da6972 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:44 +0200 Subject: perf: Simplify and fix __perf_install_in_context() Currently __perf_install_in_context() will try to schedule in the event irrespective of our event scheduling rules; that is, we normally schedule CPU-pinned, TASK-pinned, CPU-flexible, then TASK-flexible, but when creating a new event we simply try to schedule it on top of whatever is already on the PMU. This can lead to errors for pinned events. Therefore, simplify things and simply schedule everything out, add the event to the corresponding context, and schedule everything back in. 
This also nicely handles the case where, with __ARCH_WANT_INTERRUPTS_ON_CTXSW, the IPI can come right in the middle of schedule(), before we have managed to call perf_event_task_sched_in(). Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.870894224@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 80 +++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 66b3dd80940..60b333ae0bc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1469,8 +1469,12 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *tsk); +static void task_ctx_sched_out(struct perf_event_context *ctx); +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task); /* * Cross CPU call to install and enable a performance event * @@ -1481,20 +1485,31 @@ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - struct perf_event *leader = event->group_leader; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; + struct perf_event_context *task_ctx = cpuctx->task_ctx; + struct task_struct *task = current; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); /* - * In case we're installing a new context to an already running task, - * could also happen before perf_event_task_sched_in() on architectures - * which do context switches with IRQs enabled. + * If there was an active task_ctx schedule it out. */ - if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx, ctx->task); + if (task_ctx) { + task_ctx_sched_out(task_ctx); + /* + * If the context we're installing events in is not the + * active task_ctx, flip them. + */ + if (ctx->task && task_ctx != ctx) { + raw_spin_unlock(&cpuctx->ctx.lock); + raw_spin_lock(&ctx->lock); + cpuctx->task_ctx = task_ctx = ctx; + } + task = task_ctx->task; + } + cpu_ctx_sched_out(cpuctx, EVENT_ALL); - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); /* * update cgrp time only if current cgrp @@ -1505,43 +1520,18 @@ static int __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); - if (!event_filter_match(event)) - goto unlock; - - /* - * Don't put the event on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE || - (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) - goto unlock; - /* - * An exclusive event can't go on if there are already active - * hardware events, and no hardware event can go on if there - * is already an exclusive event on. + * Schedule everything back in */ - if (!group_can_go_on(event, cpuctx, 1)) - err = -EEXIST; - else - err = event_sched_in(event, cpuctx, ctx); - - if (err) { - /* - * This event couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the event group is pinned then put it in error state. 
- */ - if (leader != event) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + if (task_ctx) + ctx_sched_in(task_ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + if (task_ctx) + ctx_sched_in(task_ctx, cpuctx, EVENT_FLEXIBLE, task); -unlock: - raw_spin_unlock(&ctx->lock); + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, task_ctx); return 0; } -- cgit v1.2.3-70-g09d2 From db24d33e08b88e990991760a44d72006a5dc6102 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:45 +0200 Subject: perf: Change and simplify ctx::is_active semantics Instead of tracking if a context is active or not, track which events of the context are active. By making it a bitmask of EVENT_PINNED|EVENT_FLEXIBLE we can simplify some of the scheduling routines since it can avoid adding events that are already active. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.930282378@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 60b333ae0bc..71c2d44ff95 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1763,8 +1763,9 @@ static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_event *event; + int is_active = ctx->is_active; - ctx->is_active = 0; + ctx->is_active &= ~event_type; if (likely(!ctx->nr_events)) return; @@ -1774,12 +1775,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; perf_pmu_disable(ctx->pmu); - if (event_type & EVENT_PINNED) { + if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if (event_type & EVENT_FLEXIBLE) { + if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2058,8 +2059,9 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task) { u64 now; + int is_active = ctx->is_active; - ctx->is_active = 1; + ctx->is_active |= event_type; if (likely(!ctx->nr_events)) return; @@ -2070,11 +2072,11 @@ ctx_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (event_type & EVENT_PINNED) + if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (event_type & EVENT_FLEXIBLE) + if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) ctx_flexible_sched_in(ctx, cpuctx); } -- cgit v1.2.3-70-g09d2 From dce5855bba5df9e87bb04584d505c1f1b103c652 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:46 +0200 Subject: perf: Collect the schedule-in rules in one function This was scattered out - refactor it into a single function. No change in functionality. 
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192141.979862055@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 71c2d44ff95..802f3b24eee 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1476,6 +1476,18 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task); +static void perf_event_sched_in(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + struct task_struct *task) +{ + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + if (ctx) + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + if (ctx) + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); +} + /* * Cross CPU call to install and enable a performance event * @@ -1523,12 +1535,7 @@ static int __perf_install_in_context(void *info) /* * Schedule everything back in */ - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); - if (task_ctx) - ctx_sched_in(task_ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); - if (task_ctx) - ctx_sched_in(task_ctx, cpuctx, EVENT_FLEXIBLE, task); + perf_event_sched_in(cpuctx, task_ctx, task); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, task_ctx); @@ -2107,9 +2114,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + perf_event_sched_in(cpuctx, ctx, task); cpuctx->task_ctx = ctx; @@ -2347,9 +2352,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); - if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, current); + perf_event_sched_in(cpuctx, ctx, current); done: if (remove) -- cgit v1.2.3-70-g09d2 From e03a9a55b4e45377af9ca3d464135f9ea280b8f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:47 +0200 Subject: perf: Change close() semantics for group events In order to always call list_del_event() on the correct cpu if the event is part of an active context and avoid having to do two IPIs, change the close() semantics slightly. The current perf_event_disable() call would disable a whole group if the event that's being closed is the group leader, whereas the new code keeps the group siblings enabled. People should not rely on this behaviour and I don't think they do, but in case we find they do, the fix is easy and we have to take the double IPI cost. Signed-off-by: Peter Zijlstra Cc: Vince Weaver Link: http://lkml.kernel.org/r/20110409192142.038377551@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 802f3b24eee..c378062da27 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2920,12 +2920,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - /* - * Remove from the PMU, can't get re-enabled since we got - * here because the last ref went. 
- */ - perf_event_disable(event); - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: @@ -2942,8 +2936,8 @@ int perf_event_release_kernel(struct perf_event *event) mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); - list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); + perf_remove_from_context(event); mutex_unlock(&ctx->mutex); free_event(event); -- cgit v1.2.3-70-g09d2 From 64ce312618ef0e11d88def80effcefd1b59fdb1e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 9 Apr 2011 21:17:48 +0200 Subject: perf: De-schedule a task context when removing the last event Since perf_install_in_context() will now install a context when we add the first event, we can de-schedule the context when the last event is removed. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110409192142.090431763@chello.nl Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c378062da27..cc5d57d1d0b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1120,6 +1120,10 @@ static int __perf_remove_from_context(void *info) raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); list_del_event(event, ctx); + if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + ctx->is_active = 0; + cpuctx->task_ctx = NULL; + } raw_spin_unlock(&ctx->lock); return 0; -- cgit v1.2.3-70-g09d2 From 0b1007c3578569469a6fab6ae5cca918ccdc3ee1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:13:59 +0200 Subject: ptrace: remove silly wait_trap variable from ptrace_attach() Remove local variable wait_trap which determines whether to wait for !TRAPPING or not and simply wait for it if attach was successful. -v2: Oleg pointed out wait should happen iff attach was successful. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Signed-off-by: Oleg Nesterov --- kernel/ptrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2df115790cd..4f689cb739a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -184,7 +184,6 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) static int ptrace_attach(struct task_struct *task) { - bool wait_trap = false; int retval; audit_ptrace(task); @@ -246,7 +245,6 @@ static int ptrace_attach(struct task_struct *task) if (task_is_stopped(task)) { task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; signal_wake_up(task, 1); - wait_trap = true; } spin_unlock(&task->sighand->siglock); @@ -257,7 +255,7 @@ unlock_tasklist: unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: - if (wait_trap) + if (!retval) wait_event(current->signal->wait_chldexit, !(task->group_stop & GROUP_STOP_TRAPPING)); return retval; -- cgit v1.2.3-70-g09d2 From a8f072c1d624a627b67f2ace2f0c25d856ef4e54 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:13:59 +0200 Subject: job control: rename signal->group_stop and flags to jobctl and update them signal->group_stop currently hosts mostly group stop related flags; however, it's gonna be used for wider purposes and the GROUP_STOP_ flag prefix becomes confusing. Rename signal->group_stop to signal->jobctl and rename all GROUP_STOP_* flags to JOBCTL_*. Bit position macros JOBCTL_*_BIT are defined and JOBCTL_* flags are defined in terms of them to allow using bitops later. 
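As a brief hypothetical illustration of the bitops point: the mask macros suit ordinary flag tests, while the bitop and bit-wait primitives take a bit *number*, so defining the flags in terms of the *_BIT values keeps both forms available (sketch only; sleep_fn is a made-up callback name, and the kernel's bitops formally operate on unsigned long words):

	#define JOBCTL_TRAPPING_BIT	21
	#define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)

	if (task->jobctl & JOBCTL_TRAPPING)	/* mask form: plain flag test */
		return;

	/* bit-number form: wait for the flag to clear (hypothetical usage) */
	wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, sleep_fn, TASK_UNINTERRUPTIBLE);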
While at it, reassign JOBCTL_TRAPPING to bit 21 to better accommodate future additions. This doesn't cause any functional change. -v2: JOBCTL_*_BIT macros added as suggested by Linus. Signed-off-by: Tejun Heo Cc: Linus Torvalds Signed-off-by: Oleg Nesterov --- fs/exec.c | 2 +- include/linux/sched.h | 22 ++++++++----- kernel/ptrace.c | 12 +++---- kernel/signal.c | 91 ++++++++++++++++++++++++++------------------------- 4 files changed, 67 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index ea5f748906a..8986bb0f9dc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1772,7 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code) t = start; do { - task_clear_group_stop_pending(t); + task_clear_jobctl_stop_pending(t); if (t != current && t->mm) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a8621c4be1..b0dd064eb4f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1282,7 +1282,7 @@ struct task_struct { int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ - unsigned int group_stop; /* GROUP_STOP_*, siglock protected */ + unsigned int jobctl; /* JOBCTL_*, siglock protected */ /* ??? */ unsigned int personality; unsigned did_exec:1; @@ -1803,15 +1803,21 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define used_math() tsk_used_math(current) /* - * task->group_stop flags + * task->jobctl flags */ -#define GROUP_STOP_SIGMASK 0xffff /* signr of the last group stop */ -#define GROUP_STOP_PENDING (1 << 16) /* task should stop for group stop */ -#define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ -#define GROUP_STOP_TRAPPING (1 << 18) /* switching from STOPPED to TRACED */ -#define GROUP_STOP_DEQUEUED (1 << 19) /* stop signal dequeued */ +#define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ -extern void task_clear_group_stop_pending(struct task_struct *task); +#define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ +#define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ +#define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ +#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ +#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) +#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) +#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) +extern void task_clear_jobctl_stop_pending(struct task_struct *task); #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4f689cb739a..134f34cb142 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -77,13 +77,13 @@ void __ptrace_unlink(struct task_struct *child) spin_lock(&child->sighand->siglock); /* - * Reinstate GROUP_STOP_PENDING if group stop is in effect and + * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and * @child isn't dead. */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || child->signal->group_stop_count)) - child->group_stop |= GROUP_STOP_PENDING; + child->jobctl |= JOBCTL_STOP_PENDING; /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * is in TASK_TRACED; otherwise, we might unduly disrupt * TASK_KILLABLE sleeps. 
*/ - if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) + if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) signal_wake_up(child, task_is_traced(child)); spin_unlock(&child->sighand->siglock); @@ -226,7 +226,7 @@ static int ptrace_attach(struct task_struct *task) spin_lock(&task->sighand->siglock); /* - * If the task is already STOPPED, set GROUP_STOP_PENDING and + * If the task is already STOPPED, set JOBCTL_STOP_PENDING and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING * will be cleared if the child completes the transition or any * event which clears the group stop states happens. We'll wait @@ -243,7 +243,7 @@ static int ptrace_attach(struct task_struct *task) * in and out of STOPPED are protected by siglock. */ if (task_is_stopped(task)) { - task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; + task->jobctl |= JOBCTL_STOP_PENDING | JOBCTL_TRAPPING; signal_wake_up(task, 1); } @@ -257,7 +257,7 @@ unlock_creds: out: if (!retval) wait_event(current->signal->wait_chldexit, - !(task->group_stop & GROUP_STOP_TRAPPING)); + !(task->jobctl & JOBCTL_TRAPPING)); return retval; } diff --git a/kernel/signal.c b/kernel/signal.c index 86c32b884f8..ab6851c0646 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if ((t->group_stop & GROUP_STOP_PENDING) || + if ((t->jobctl & JOBCTL_STOP_PENDING) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); @@ -224,27 +224,28 @@ static inline void print_dropped_signal(int sig) } /** - * task_clear_group_stop_trapping - clear group stop trapping bit + * task_clear_jobctl_trapping - clear jobctl trapping bit * @task: target task * - * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it - * and wake up the ptracer. Note that we don't need any further locking. - * @task->siglock guarantees that @task->parent points to the ptracer. + * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. + * Clear it and wake up the ptracer. Note that we don't need any further + * locking. @task->siglock guarantees that @task->parent points to the + * ptracer. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ -static void task_clear_group_stop_trapping(struct task_struct *task) +static void task_clear_jobctl_trapping(struct task_struct *task) { - if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { - task->group_stop &= ~GROUP_STOP_TRAPPING; + if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_TRAPPING; __wake_up_sync_key(&task->parent->signal->wait_chldexit, TASK_UNINTERRUPTIBLE, 1, task); } } /** - * task_clear_group_stop_pending - clear pending group stop + * task_clear_jobctl_stop_pending - clear pending group stop * @task: target task * * Clear group stop states for @task. @@ -252,19 +253,19 @@ static void task_clear_group_stop_trapping(struct task_struct *task) * CONTEXT: * Must be called with @task->sighand->siglock held. 
*/ -void task_clear_group_stop_pending(struct task_struct *task) +void task_clear_jobctl_stop_pending(struct task_struct *task) { - task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | - GROUP_STOP_DEQUEUED); + task->jobctl &= ~(JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME | + JOBCTL_STOP_DEQUEUED); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * - * @task has GROUP_STOP_PENDING set and is participating in a group stop. + * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if - * %GROUP_STOP_CONSUME was set. If the consumption completes the group + * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group * stop, the appropriate %SIGNAL_* flags are set. * * CONTEXT: @@ -277,11 +278,11 @@ void task_clear_group_stop_pending(struct task_struct *task) static bool task_participate_group_stop(struct task_struct *task) { struct signal_struct *sig = task->signal; - bool consume = task->group_stop & GROUP_STOP_CONSUME; + bool consume = task->jobctl & JOBCTL_STOP_CONSUME; - WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); + WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); - task_clear_group_stop_pending(task); + task_clear_jobctl_stop_pending(task); if (!consume) return false; @@ -604,7 +605,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - current->group_stop |= GROUP_STOP_DEQUEUED; + current->jobctl |= JOBCTL_STOP_DEQUEUED; } if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* @@ -809,7 +810,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); t = p; do { - task_clear_group_stop_pending(t); + task_clear_jobctl_stop_pending(t); rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); wake_up_state(t, __TASK_STOPPED); } while_each_thread(p, t); @@ -925,7 +926,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { - task_clear_group_stop_pending(t); + task_clear_jobctl_stop_pending(t); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); @@ -1160,7 +1161,7 @@ int zap_other_threads(struct task_struct *p) p->signal->group_stop_count = 0; while_each_thread(p, t) { - task_clear_group_stop_pending(t); + task_clear_jobctl_stop_pending(t); count++; /* Don't bother with already dead threads */ @@ -1738,7 +1739,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * clear now. We act as if SIGCONT is received after TASK_TRACED * is entered - ignore it. */ - if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) + if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) gstop_done = task_participate_group_stop(current); current->last_siginfo = info; @@ -1751,12 +1752,12 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) set_current_state(TASK_TRACED); /* - * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and + * We're committing to trapping. Clearing JOBCTL_TRAPPING and * transition to TASK_TRACED should be atomic with respect to - * siglock. This hsould be done after the arch hook as siglock is + * siglock. 
This should be done after the arch hook as siglock is * released and regrabbed across it. */ - task_clear_group_stop_trapping(current); + task_clear_jobctl_trapping(current); spin_unlock_irq(&current->sighand->siglock); read_lock(&tasklist_lock); @@ -1792,9 +1793,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * * If @gstop_done, the ptracer went away between group stop * completion and here. During detach, it would have set - * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED - * in do_signal_stop() on return, so notifying the real - * parent of the group stop completion is enough. + * JOBCTL_STOP_PENDING on us and we'll re-enter + * TASK_STOPPED in do_signal_stop() on return, so notifying + * the real parent of the group stop completion is enough. */ if (gstop_done) do_notify_parent_cldstop(current, false, why); @@ -1856,14 +1857,14 @@ static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - if (!(current->group_stop & GROUP_STOP_PENDING)) { - unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; + if (!(current->jobctl & JOBCTL_STOP_PENDING)) { + unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; - /* signr will be recorded in task->group_stop for retries */ - WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); + /* signr will be recorded in task->jobctl for retries */ + WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); - if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || + if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) return 0; /* @@ -1890,19 +1891,19 @@ static int do_signal_stop(int signr) else WARN_ON_ONCE(!task_ptrace(current)); - current->group_stop &= ~GROUP_STOP_SIGMASK; - current->group_stop |= signr | gstop; + current->jobctl &= ~JOBCTL_STOP_SIGMASK; + current->jobctl |= signr | gstop; sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) { - t->group_stop &= ~GROUP_STOP_SIGMASK; + t->jobctl &= ~JOBCTL_STOP_SIGMASK; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { - t->group_stop |= signr | gstop; + t->jobctl |= signr | gstop; sig->group_stop_count++; signal_wake_up(t, 0); } @@ -1943,23 +1944,23 @@ retry: spin_lock_irq(&current->sighand->siglock); } else { - ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, + ptrace_stop(current->jobctl & JOBCTL_STOP_SIGMASK, CLD_STOPPED, 0, NULL); current->exit_code = 0; } /* - * GROUP_STOP_PENDING could be set if another group stop has + * JOBCTL_STOP_PENDING could be set if another group stop has * started since being woken up or ptrace wants us to transit * between TASK_STOPPED and TRACED. Retry group stop. 
*/ - if (current->group_stop & GROUP_STOP_PENDING) { - WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); + if (current->jobctl & JOBCTL_STOP_PENDING) { + WARN_ON_ONCE(!(current->jobctl & JOBCTL_STOP_SIGMASK)); goto retry; } /* PTRACE_ATTACH might have raced with task killing, clear trapping */ - task_clear_group_stop_trapping(current); + task_clear_jobctl_trapping(current); spin_unlock_irq(&current->sighand->siglock); @@ -2078,8 +2079,8 @@ relock: if (unlikely(signr != 0)) ka = return_ka; else { - if (unlikely(current->group_stop & - GROUP_STOP_PENDING) && do_signal_stop(0)) + if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && + do_signal_stop(0)) goto relock; signr = dequeue_signal(current, &current->blocked, @@ -2253,7 +2254,7 @@ void exit_signals(struct task_struct *tsk) signotset(&unblocked); retarget_shared_pending(tsk, &unblocked); - if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && + if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: -- cgit v1.2.3-70-g09d2 From 755e276b3326f300585435d2f3876e66e248c476 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:13:59 +0200 Subject: ptrace: ptrace_check_attach(): rename @kill to @ignore_state and add comments PTRACE_INTERRUPT is going to be added, which should also skip the task_is_traced() check in ptrace_check_attach(). Rename @kill to @ignore_state and make it bool. Add a function comment while at it. This patch doesn't introduce any behavior difference. Signed-off-by: Tejun Heo Signed-off-by: Oleg Nesterov --- include/linux/ptrace.h | 2 +- kernel/ptrace.c | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 9178d5cc0b0..e93ef1a54fc 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -105,7 +105,7 @@ extern long arch_ptrace(struct task_struct *child, long request, extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern void ptrace_disable(struct task_struct *); -extern int ptrace_check_attach(struct task_struct *task, int kill); +extern int ptrace_check_attach(struct task_struct *task, bool ignore_state); extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 134f34cb142..eb191116edf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -97,10 +97,24 @@ void __ptrace_unlink(struct task_struct *child) spin_unlock(&child->sighand->siglock); } -/* - * Check that we have indeed attached to the thing.. +/** + * ptrace_check_attach - check whether ptracee is ready for ptrace operation + * @child: ptracee to check for + * @ignore_state: don't check whether @child is currently %TASK_TRACED + * + * Check whether @child is being ptraced by %current and ready for further + * ptrace operations. If @ignore_state is %false, @child also should be in + * %TASK_TRACED state and on return the child is guaranteed to be traced + * and not executing. If @ignore_state is %true, @child can be in any + * state. + * + * CONTEXT: + * Grabs and releases tasklist_lock and @child->sighand->siglock. + * + * RETURNS: + * 0 on success, -ESRCH if %child is not ready. 
*/ -int ptrace_check_attach(struct task_struct *child, int kill) +int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; @@ -119,13 +133,13 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ spin_lock_irq(&child->sighand->siglock); WARN_ON_ONCE(task_is_stopped(child)); - if (task_is_traced(child) || kill) + if (task_is_traced(child) || ignore_state) ret = 0; spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !kill) + if (!ret && !ignore_state) ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ -- cgit v1.2.3-70-g09d2 From 81be24b8cdeb69e62f9d1b6b425fd9ffdd37f581 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:13:59 +0200 Subject: ptrace: relocate set_current_state(TASK_TRACED) in ptrace_stop() In ptrace_stop(), after the arch hook is done, the task state and jobctl bits are updated while holding siglock. The ordering requirement there is that TASK_TRACED is set before JOBCTL_TRAPPING is cleared, to prevent a ptracer waiting on TRAPPING from waking up before TRACED is actually set and seeing TASK_RUNNING in wait(2). Move set_current_state(TASK_TRACED) to the top of the block and reorganize comments. This makes the ordering more obvious (TASK_TRACED before other updates) and helps future updates to group stop participation. This patch doesn't cause any functional change. Signed-off-by: Tejun Heo Signed-off-by: Oleg Nesterov --- kernel/signal.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ab6851c0646..62a6c3bb9f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1732,6 +1732,18 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) return; } + /* + * We're committing to trapping. TRACED should be visible before + * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). + * Also, transition to TRACED and updates to ->jobctl should be + * atomic with respect to siglock and should be done after the arch + * hook as siglock is released and regrabbed across it. + */ + set_current_state(TASK_TRACED); + + current->last_siginfo = info; + current->exit_code = exit_code; + /* * If @why is CLD_STOPPED, we're trapping to participate in a group * stop. Do the bookkeeping. Note that if SIGCONT was delievered @@ -1742,21 +1754,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) gstop_done = task_participate_group_stop(current); - current->last_siginfo = info; - current->exit_code = exit_code; - - /* - * TRACED should be visible before TRAPPING is cleared; otherwise, - * the tracer might fail do_wait(). - */ - set_current_state(TASK_TRACED); - - /* - * We're committing to trapping. Clearing JOBCTL_TRAPPING and - * transition to TASK_TRACED should be atomic with respect to - * siglock. This should be done after the arch hook as siglock is 
- */ + /* entering a trap, clear TRAPPING */ task_clear_jobctl_trapping(current); spin_unlock_irq(&current->sighand->siglock); -- cgit v1.2.3-70-g09d2 From 3759a0d94c18764247b66511d1038f2b93aa95de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:14:00 +0200 Subject: job control: introduce JOBCTL_PENDING_MASK and task_clear_jobctl_pending() This patch introduces JOBCTL_PENDING_MASK and replaces task_clear_jobctl_stop_pending() with task_clear_jobctl_pending() which takes an extra @mask argument. JOBCTL_PENDING_MASK is currently equal to JOBCTL_STOP_PENDING but future patches will add more bits. recalc_sigpending_tsk() is updated to use JOBCTL_PENDING_MASK instead. task_clear_jobctl_pending() takes @mask, which must be a subset of JOBCTL_PENDING_MASK, and clears the relevant jobctl bits. If JOBCTL_STOP_PENDING is set, other STOP bits are cleared together. All task_clear_jobctl_stop_pending() users are updated to call task_clear_jobctl_pending() with JOBCTL_STOP_PENDING, which is functionally identical to task_clear_jobctl_stop_pending(). This patch doesn't cause any functional change. Signed-off-by: Tejun Heo Signed-off-by: Oleg Nesterov --- fs/exec.c | 2 +- include/linux/sched.h | 5 ++++- kernel/signal.c | 27 +++++++++++++++++---------- 3 files changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c index 8986bb0f9dc..4402105287c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1772,7 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code) t = start; do { - task_clear_jobctl_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (t != current && t->mm) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); diff --git a/include/linux/sched.h b/include/linux/sched.h index b0dd064eb4f..5a958b17f9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1817,7 +1817,10 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -extern void task_clear_jobctl_stop_pending(struct task_struct *task); +#define JOBCTL_PENDING_MASK JOBCTL_STOP_PENDING + +extern void task_clear_jobctl_pending(struct task_struct *task, + unsigned int mask); #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/signal.c index 62a6c3bb9f0..288d952fa3b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if ((t->jobctl & JOBCTL_STOP_PENDING) || + if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); @@ -245,18 +245,25 @@ static void task_clear_jobctl_trapping(struct task_struct *task) } /** - * task_clear_jobctl_stop_pending - clear pending group stop + * task_clear_jobctl_pending - clear jobctl pending bits * @task: target task + * @mask: pending bits to clear * - * Clear group stop states for @task. + * Clear @mask from @task->jobctl. @mask must be subset of + * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other + * STOP bits are cleared together. * * CONTEXT: * Must be called with @task->sighand->siglock held.
 */ -void task_clear_jobctl_stop_pending(struct task_struct *task) +void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) { - task->jobctl &= ~(JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME | - JOBCTL_STOP_DEQUEUED); + BUG_ON(mask & ~JOBCTL_PENDING_MASK); + + if (mask & JOBCTL_STOP_PENDING) + mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; + + task->jobctl &= ~mask; } /** @@ -282,7 +289,7 @@ static bool task_participate_group_stop(struct task_struct *task) WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); - task_clear_jobctl_stop_pending(task); + task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); if (!consume) return false; @@ -810,7 +817,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); t = p; do { - task_clear_jobctl_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); wake_up_state(t, __TASK_STOPPED); } while_each_thread(p, t); @@ -926,7 +933,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { - task_clear_jobctl_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); @@ -1161,7 +1168,7 @@ int zap_other_threads(struct task_struct *p) p->signal->group_stop_count = 0; while_each_thread(p, t) { - task_clear_jobctl_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); count++; /* Don't bother with already dead threads */ -- cgit v1.2.3-70-g09d2 From 6dfca32984237a8a011b5bf367e53341a265b2a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:14:00 +0200 Subject: job control: make task_clear_jobctl_pending() clear TRAPPING automatically JOBCTL_TRAPPING indicates that ptracer is waiting for tracee to (re)transit into TRACED. task_clear_jobctl_trapping() must be called when either tracee enters TRACED or the transition is cancelled for some reason. The former is achieved by explicitly calling task_clear_jobctl_trapping() in ptrace_stop() and the latter by calling it at the end of do_signal_stop(). Calling task_clear_jobctl_trapping() at the end of do_signal_stop() limits the scope in which TRAPPING can be used and is fragile in that seemingly unrelated changes to tracee's control flow can lead to stuck TRAPPING. We already have task_clear_jobctl_pending() calls on those cancelling events to clear JOBCTL_STOP_PENDING. Cancellations can be handled by making those call sites use JOBCTL_PENDING_MASK instead and updating task_clear_jobctl_pending() such that task_clear_jobctl_trapping() is called automatically if no stop/trap is pending. This patch makes the above changes and removes the fallback task_clear_jobctl_trapping() call from do_signal_stop().
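For illustration, the pairing this creates looks roughly like the following (simplified sketch, not the literal kernel code; at this point in the series the attach side still waits on wait_chldexit):

	/* ptracer side, tail of ptrace_attach() */
	wait_event(current->signal->wait_chldexit,
		   !(task->jobctl & JOBCTL_TRAPPING));

	/* tracee side: every cancelling event now funnels through here */
	task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
	/* once no stop/trap is left pending, TRAPPING is cleared and the
	 * waiting ptracer is woken, without a per-call-site fallback */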
Signed-off-by: Tejun Heo Signed-off-by: Oleg Nesterov --- fs/exec.c | 2 +- kernel/signal.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c index 4402105287c..a9f2b3631bd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1772,7 +1772,7 @@ static int zap_process(struct task_struct *start, int exit_code) t = start; do { - task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); if (t != current && t->mm) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); diff --git a/kernel/signal.c index 288d952fa3b..637a171b65b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -253,6 +253,9 @@ static void task_clear_jobctl_trapping(struct task_struct *task) * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other * STOP bits are cleared together. * + * If clearing of @mask leaves no stop or trap pending, this function calls + * task_clear_jobctl_trapping(). + * * CONTEXT: * Must be called with @task->sighand->siglock held. */ @@ -264,6 +267,9 @@ void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; task->jobctl &= ~mask; + + if (!(task->jobctl & JOBCTL_PENDING_MASK)) + task_clear_jobctl_trapping(task); } /** @@ -933,7 +939,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { - task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); @@ -1168,7 +1174,7 @@ int zap_other_threads(struct task_struct *p) p->signal->group_stop_count = 0; while_each_thread(p, t) { - task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); count++; /* Don't bother with already dead threads */ @@ -1964,9 +1970,6 @@ retry: goto retry; } - /* PTRACE_ATTACH might have raced with task killing, clear trapping */ - task_clear_jobctl_trapping(current); - spin_unlock_irq(&current->sighand->siglock); tracehook_finish_jctl(); -- cgit v1.2.3-70-g09d2 From 7dd3db54e77d21eb95e145f19ba53f68250d0e73 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:14:00 +0200 Subject: job control: introduce task_set_jobctl_pending() task->jobctl currently hosts JOBCTL_STOP_PENDING and will host TRAP pending bits too. Setting pending conditions on a dying task may make the task unkillable. Currently, each setting site is responsible for checking for the condition, but with the to-be-added job control traps this becomes too fragile. This patch adds task_set_jobctl_pending() which should be used when setting task->jobctl bits to schedule a stop or trap. The function performs the following to ease setting pending bits. * Sanity checks. * If a fatal signal is pending or PF_EXITING is set, no bit is set. * STOP_SIGMASK is automatically cleared if a new value is being set. do_signal_stop() and ptrace_attach() are updated to use task_set_jobctl_pending() instead of setting STOP_PENDING explicitly. The surrounding structures are adjusted to fit task_set_jobctl_pending() better, but there should be no userland-visible behavior difference.
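As a rough usage sketch (simplified from the do_signal_stop() hunk below, not verbatim), a thread only counts toward group_stop_count when the helper confirms the bits were actually set:

	if (task_set_jobctl_pending(t, signr | gstop)) {
		sig->group_stop_count++;
		signal_wake_up(t, 0);
	}
	/* a dying thread (fatal signal pending or PF_EXITING set) is
	 * skipped automatically by task_set_jobctl_pending() */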
Signed-off-by: Tejun Heo Cc: Oleg Nesterov Signed-off-by: Oleg Nesterov --- include/linux/sched.h | 2 ++ kernel/ptrace.c | 6 +++--- kernel/signal.c | 46 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 45 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a958b17f9f..5157bd9eee3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1819,6 +1819,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define JOBCTL_PENDING_MASK JOBCTL_STOP_PENDING +extern bool task_set_jobctl_pending(struct task_struct *task, + unsigned int mask); extern void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index eb191116edf..0c37d999c8b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -256,10 +256,10 @@ static int ptrace_attach(struct task_struct *task) * The following task_is_stopped() test is safe as both transitions * in and out of STOPPED are protected by siglock. */ - if (task_is_stopped(task)) { - task->jobctl |= JOBCTL_STOP_PENDING | JOBCTL_TRAPPING; + if (task_is_stopped(task) && + task_set_jobctl_pending(task, + JOBCTL_STOP_PENDING | JOBCTL_TRAPPING)) signal_wake_up(task, 1); - } spin_unlock(&task->sighand->siglock); diff --git a/kernel/signal.c b/kernel/signal.c index 637a171b65b..9ab91c516c3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -223,6 +223,39 @@ static inline void print_dropped_signal(int sig) current->comm, current->pid, sig); } +/** + * task_set_jobctl_pending - set jobctl pending bits + * @task: target task + * @mask: pending bits to set + * + * Clear @mask from @task->jobctl. @mask must be subset of + * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | + * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is + * cleared. If @task is already being killed or exiting, this function + * becomes noop. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + * + * RETURNS: + * %true if @mask is set, %false if made noop because @task was dying. + */ +bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) +{ + BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | + JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); + BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); + + if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) + return false; + + if (mask & JOBCTL_STOP_SIGMASK) + task->jobctl &= ~JOBCTL_STOP_SIGMASK; + + task->jobctl |= mask; + return true; +} + /** * task_clear_jobctl_trapping - clear jobctl trapping bit * @task: target task @@ -1902,19 +1935,20 @@ static int do_signal_stop(int signr) else WARN_ON_ONCE(!task_ptrace(current)); - current->jobctl &= ~JOBCTL_STOP_SIGMASK; - current->jobctl |= signr | gstop; - sig->group_stop_count = 1; + sig->group_stop_count = 0; + + if (task_set_jobctl_pending(current, signr | gstop)) + sig->group_stop_count++; + for (t = next_thread(current); t != current; t = next_thread(t)) { - t->jobctl &= ~JOBCTL_STOP_SIGMASK; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. 
 */ - if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { - t->jobctl |= signr | gstop; + if (!task_is_stopped(t) && + task_set_jobctl_pending(t, signr | gstop)) { sig->group_stop_count++; signal_wake_up(t, 0); } -- cgit v1.2.3-70-g09d2 From 62c124ff3bcdb414af635c2bf822c9e4f2a5abfa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:14:00 +0200 Subject: ptrace: use bit_waitqueue for TRAPPING instead of wait_chldexit ptracer->signal->wait_chldexit was used to wait for TRAPPING; however, ->wait_chldexit was already complicated by waker-side filtering even without the TRAPPING wait on top of it. Also, it unnecessarily made TRAPPING clearing depend on the current ptrace relationship - if the ptracee is detached, the wakeup is lost. There is no reason to use signal->wait_chldexit here. We're just waiting for the JOBCTL_TRAPPING bit to clear and, given the relatively infrequent use of ptrace, bit_waitqueue can serve it perfectly. This patch makes the JOBCTL_TRAPPING wait use bit_waitqueue instead of signal->wait_chldexit. -v2: Use JOBCTL_*_BIT macros instead of ilog2() as suggested by Linus. Signed-off-by: Tejun Heo Cc: Linus Torvalds Signed-off-by: Oleg Nesterov --- kernel/ptrace.c | 10 ++++++++-- kernel/signal.c | 3 +-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c index 0c37d999c8b..7f05f3a1267 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,6 +25,12 @@ #include +static int ptrace_trapping_sleep_fn(void *flags) +{ + schedule(); + return 0; +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. */ @@ -270,8 +276,8 @@ unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: if (!retval) - wait_event(current->signal->wait_chldexit, - !(task->jobctl & JOBCTL_TRAPPING)); + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, + ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); return retval; } diff --git a/kernel/signal.c index 9ab91c516c3..172a4c79f12 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -272,8 +272,7 @@ static void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; - __wake_up_sync_key(&task->parent->signal->wait_chldexit, - TASK_UNINTERRUPTIBLE, 1, task); + wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } -- cgit v1.2.3-70-g09d2 From dd1d6772692316fe35094085c5e4d9a370ad3462 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Jun 2011 11:14:00 +0200 Subject: signal: remove three noop tracehooks Remove the following three noop tracehooks in signal.c. * tracehook_force_sigpending() * tracehook_get_signal() * tracehook_finish_jctl() The code area is about to be updated and these hooks don't do anything other than obfuscating the logic. Signed-off-by: Tejun Heo Signed-off-by: Oleg Nesterov --- include/linux/tracehook.h | 52 ----------------------------------------------- kernel/signal.c | 44 +++++++++++++-------------------------- 2 files changed, 14 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracehook.h index e95f5236611..15745cdd32c 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -425,58 +425,6 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task, return (task_ptrace(task) & PT_PTRACED) != 0; } -/** - * tracehook_force_sigpending - let tracing force signal_pending(current) on - * - * Called when recomputing our signal_pending() flag.
Return nonzero - * to force the signal_pending() flag on, so that tracehook_get_signal() - * will be called before the next return to user mode. - * - * Called with @current->sighand->siglock held. - */ -static inline int tracehook_force_sigpending(void) - { - return 0; -} - -/** - * tracehook_get_signal - deliver synthetic signal to traced task - * @task: @current - * @regs: task_pt_regs(@current) - * @info: details of synthetic signal - * @return_ka: sigaction for synthetic signal - * - * Return zero to check for a real pending signal normally. - * Return -1 after releasing the siglock to repeat the check. - * Return a signal number to induce an artificial signal delivery, - * setting *@info and *@return_ka to specify its details and behavior. - * - * The @return_ka->sa_handler value controls the disposition of the - * signal, no matter the signal number. For %SIG_DFL, the return value - * is a representative signal to indicate the behavior (e.g. %SIGTERM - * for death, %SIGQUIT for core dump, %SIGSTOP for job control stop, - * %SIGTSTP for stop unless in an orphaned pgrp), but the signal number - * reported will be @info->si_signo instead. - * - * Called with @task->sighand->siglock held, before dequeuing pending signals. - */ -static inline int tracehook_get_signal(struct task_struct *task, - struct pt_regs *regs, - siginfo_t *info, - struct k_sigaction *return_ka) -{ - return 0; -} - -/** - * tracehook_finish_jctl - report about return from job control stop - * - * This is called by do_signal_stop() after wakeup. - */ -static inline void tracehook_finish_jctl(void) -{ -} - #define DEATH_REAP -1 #define DEATH_DELAYED_GROUP_LEADER -2 diff --git a/kernel/signal.c index 172a4c79f12..c99b8b5c0be 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (unlikely(tracehook_force_sigpending())) - set_thread_flag(TIF_SIGPENDING); - else if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (!recalc_sigpending_tsk(current) && !freezing(current)) clear_thread_flag(TIF_SIGPENDING); } @@ -2005,8 +2003,6 @@ retry: spin_unlock_irq(&current->sighand->siglock); - tracehook_finish_jctl(); - return 1; } @@ -2109,37 +2105,25 @@ relock: for (;;) { struct k_sigaction *ka; - /* - * Tracing can induce an artificial signal and choose sigaction. - * The return value in @signr determines the default action, - * but @info->si_signo is the signal number we will report. - */ - signr = tracehook_get_signal(current, regs, info, return_ka); - if (unlikely(signr < 0)) + + if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && + do_signal_stop(0)) goto relock; - if (unlikely(signr != 0)) - ka = return_ka; - else { - if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && - do_signal_stop(0)) - goto relock; - signr = dequeue_signal(current, &current->blocked, - info); + signr = dequeue_signal(current, &current->blocked, info); - if (!signr) - break; /* will return 0 */ + if (!signr) + break; /* will return 0 */ - if (signr != SIGKILL) { - signr = ptrace_signal(signr, info, - regs, cookie); - if (!signr) - continue; - } - - ka = &sighand->action[signr-1]; + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, + regs, cookie); + if (!signr) + continue; } + ka = &sighand->action[signr-1]; + /* Trace actually delivered signals.
*/ trace_signal_deliver(signr, info, ka); -- cgit v1.2.3-70-g09d2 From 0b5e1c5255e7ee8670e077e8224e5c2281229a5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Jun 2011 11:15:33 +0200 Subject: printk: Release console_sem after logbuf_lock Release console_sem after unlocking the logbuf_lock so that we don't generate wakeups while holding logbuf_lock. This avoids some lock inversion troubles once we remove the lockdep_off bits between logbuf_lock and rq->lock (prints while holding rq->lock vs doing wakeups while holding logbuf_lock). There's of course still an actual deadlock where the printk()s under rq->lock will issue a wakeup from the up() call, but lockdep won't warn about that since semaphores are not tracked. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-j8swthl12u73h4znbvitljzd@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/printk.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 35185392173..751e7b84e9e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -782,7 +782,7 @@ static inline int can_use_console(unsigned int cpu) static int console_trylock_for_printk(unsigned int cpu) __releases(&logbuf_lock) { - int retval = 0; + int retval = 0, wake = 0; if (console_trylock()) { retval = 1; @@ -795,12 +795,14 @@ static int console_trylock_for_printk(unsigned int cpu) */ if (!can_use_console(cpu)) { console_locked = 0; - up(&console_sem); + wake = 1; retval = 0; } } printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); + if (wake) + up(&console_sem); return retval; } static const char recursion_bug_msg [] = @@ -1271,8 +1273,8 @@ void console_unlock(void) if (unlikely(exclusive_console)) exclusive_console = NULL; - up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); + up(&console_sem); if (wake_klogd) wake_up_klogd(); } -- cgit v1.2.3-70-g09d2 From b58f6b0dd3d677338b9065388cc2cc942b86338e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Jun 2011 00:23:28 +0200 Subject: perf, core: Fix initial task_ctx/event installation A lost Quilt refresh of 2c29ef0fef8 (perf: Simplify and fix __perf_install_in_context()) is causing grief and lockups, reported by Jiri Olsa. When installing an event in a task context, there's a number of issues: - there might not be an existing task context, in which case we should install the now current context; - there might already be a context, not the current one, in which case we should de-schedule the old and install the new; these cases were dealt with in the lost refresh, however there is one further case that was found in testing: - there might already be a context, the current one, in which case we should still de-schedule, and should take care to re-install it (note that task_ctx_sched_out() clears cpuctx->task_ctx). 
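In rough pseudo-C, ignoring the lock juggling, the install path then has to cover all three cases along these lines (illustrative sketch only; the actual hunk follows):

	if (task_ctx)
		task_ctx_sched_out(task_ctx);	/* also clears cpuctx->task_ctx */
	if (ctx->task && task_ctx != ctx)
		task_ctx = ctx;			/* flip to the context being installed */
	if (task_ctx)
		cpuctx->task_ctx = task_ctx;	/* re-install, covering the case where
						 * it was the current one all along */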
Reported-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1307399008.2497.971.camel@laptop Signed-off-by: Ingo Molnar --- kernel/events/core.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c index ba89f40abe6..5e8c7b1389b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1505,25 +1505,31 @@ static int __perf_install_in_context(void *info) struct perf_event_context *task_ctx = cpuctx->task_ctx; struct task_struct *task = current; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_ctx_lock(cpuctx, task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); /* * If there was an active task_ctx schedule it out. */ - if (task_ctx) { + if (task_ctx) task_ctx_sched_out(task_ctx); - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. - */ - if (ctx->task && task_ctx != ctx) { - raw_spin_unlock(&cpuctx->ctx.lock); - raw_spin_lock(&ctx->lock); - cpuctx->task_ctx = task_ctx = ctx; - } + + /* + * If the context we're installing events in is not the + * active task_ctx, flip them. + */ + if (ctx->task && task_ctx != ctx) { + if (task_ctx) + raw_spin_unlock(&task_ctx->lock); + raw_spin_lock(&ctx->lock); + task_ctx = ctx; + } + + if (task_ctx) { + cpuctx->task_ctx = task_ctx; task = task_ctx->task; } + cpu_ctx_sched_out(cpuctx, EVENT_ALL); update_context_time(ctx); -- cgit v1.2.3-70-g09d2 From 2da8c8bc44b572cbf623629ff736608dc7968436 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Jun 2011 22:53:39 +0200 Subject: sched: Remove pointless in_atomic() definition check It's really supposed to be defined here. If it's not, then we actually want the build to crash so that we know about it, rather than keeping it silent. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c index fd18f395a1b..01d9536aaa8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8028,7 +8028,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { -#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || @@ -8050,7 +8049,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) if (irqs_disabled()) print_irqtrace_events(current); dump_stack(); -#endif } EXPORT_SYMBOL(__might_sleep); #endif -- cgit v1.2.3-70-g09d2 From 2ce9738bac1b386f46e8478fd2c263460e7c2b09 Mon Sep 17 00:00:00 2001 From: "eparis@redhat" Date: Thu, 2 Jun 2011 21:20:51 +1000 Subject: cgroupfs: use init_cred when populating new cgroupfs mount We recently found that in some configurations SELinux was blocking the ability for cgroupfs to be mounted. The reason is that cgroupfs creates files and directories during the get_sb() call and also uses lookup_one_len() during that same get_sb() call. This is a problem since the security subsystem cannot initialize the superblock and the inodes in that filesystem until after the get_sb() call returns. Thus we leave the inodes in an uninitialized state during get_sb(). For the vast majority of filesystems this is not an issue, but since cgroupfs uses lookup_one_len() it does search permission checks on the directories in the path it walks.
Since the inode security state is not set up, SELinux does these checks as if the inodes were 'unlabeled.' Many 'normal' userspace processes do not have permission to interact with unlabeled inodes. The solution presented here is to do the permission checks of the path walk and inode creation as the kernel rather than as the task that called mount. Since the kernel has permission to read/write/create unlabeled inodes, the get_sb() call will complete successfully and the SELinux code will be able to initialize the superblock and those inodes created during the get_sb() call. This appears to be the same solution used by other filesystems such as devtmpfs to solve the same issue and should thus have no negative impact on other LSMs which currently work. Signed-off-by: Eric Paris Acked-by: Paul Menage Signed-off-by: James Morris --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c index 2731d115d72..81a867851fe 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -27,9 +27,11 @@ */ #include +#include #include #include #include +#include #include #include #include @@ -1514,6 +1516,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; struct cgroupfs_root *existing_root; + const struct cred *cred; int i; BUG_ON(sb->s_root != NULL); @@ -1593,7 +1596,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); + cred = override_creds(&init_cred); cgroup_populate_dir(root_cgrp); + revert_creds(cred); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); } else { -- cgit v1.2.3-70-g09d2 From 76369139ceb955deefc509e6e12ce9d6ce50ccab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 May 2011 19:55:04 +0200 Subject: perf: Split up buffer handling from core code And create the internal perf events header.
v2: Keep an internal inlined perf_output_copy() Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Borislav Petkov Cc: Stephane Eranian Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1305827704-5607-1-git-send-email-fweisbec@gmail.com [ v3: use clearer 'ring_buffer' and 'rb' naming ] Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 33 +-- kernel/events/Makefile | 2 +- kernel/events/core.c | 568 ++++++-------------------------------------- kernel/events/internal.h | 97 ++++++++ kernel/events/ring_buffer.c | 399 +++++++++++++++++++++++++++++++ 5 files changed, 572 insertions(+), 527 deletions(-) create mode 100644 kernel/events/internal.h create mode 100644 kernel/events/ring_buffer.c (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3412684ce5d..779f6ed54d5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -680,33 +680,6 @@ enum perf_event_active_state { }; struct file; - -#define PERF_BUFFER_WRITABLE 0x01 - -struct perf_buffer { - atomic_t refcount; - struct rcu_head rcu_head; -#ifdef CONFIG_PERF_USE_VMALLOC - struct work_struct work; - int page_order; /* allocation order */ -#endif - int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ - - atomic_t poll; /* POLL_ for wakeups */ - - local_t head; /* write position */ - local_t nest; /* nested writers */ - local_t events; /* event limit */ - local_t wakeup; /* wakeup stamp */ - local_t lost; /* nr records lost */ - - long watermark; /* wakeup watermark */ - - struct perf_event_mmap_page *user_page; - void *data_pages[0]; -}; - struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, int, @@ -745,6 +718,8 @@ struct perf_cgroup { }; #endif +struct ring_buffer; + /** * struct perf_event - performance event kernel representation: */ @@ -834,7 +809,7 @@ struct perf_event { atomic_t mmap_count; int mmap_locked; struct user_struct *mmap_user; - struct perf_buffer *buffer; + struct ring_buffer *rb; /* poll related */ wait_queue_head_t waitq; @@ -945,7 +920,7 @@ struct perf_cpu_context { struct perf_output_handle { struct perf_event *event; - struct perf_buffer *buffer; + struct ring_buffer *rb; unsigned long wakeup; unsigned long size; void *addr; diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 1ce23d3d839..89e5e8aa4c3 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y := core.o +obj-y := core.o ring_buffer.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/core.c b/kernel/events/core.c index 5e8c7b1389b..5e70f62752a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,6 +36,8 @@ #include #include +#include "internal.h" + #include struct remote_function_call { @@ -2886,7 +2888,7 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void perf_buffer_put(struct perf_buffer *buffer); +static void ring_buffer_put(struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2909,9 +2911,9 @@ static void free_event(struct perf_event *event) } } - if (event->buffer) { - perf_buffer_put(event->buffer); - event->buffer = NULL; + if (event->rb) { + ring_buffer_put(event->rb); + event->rb = NULL; } if (is_cgroup_event(event)) @@ -3139,13 +3141,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table 
*wait) { struct perf_event *event = file->private_data; - struct perf_buffer *buffer; + struct ring_buffer *rb; unsigned int events = POLL_HUP; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) - events = atomic_xchg(&buffer->poll, 0); + rb = rcu_dereference(event->rb); + if (rb) + events = atomic_xchg(&rb->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -3356,14 +3358,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_buffer *buffer; + struct ring_buffer *rb; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) + rb = rcu_dereference(event->rb); + if (!rb) goto unlock; - userpg = buffer->user_page; + userpg = rb->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -3390,220 +3392,10 @@ unlock: rcu_read_unlock(); } -static unsigned long perf_data_size(struct perf_buffer *buffer); - -static void -perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) -{ - long max_size = perf_data_size(buffer); - - if (watermark) - buffer->watermark = min(max_size, watermark); - - if (!buffer->watermark) - buffer->watermark = max_size / 2; - - if (flags & PERF_BUFFER_WRITABLE) - buffer->writable = 1; - - atomic_set(&buffer->refcount, 1); -} - -#ifndef CONFIG_PERF_USE_VMALLOC - -/* - * Back perf_mmap() with regular GFP_KERNEL-0 pages. - */ - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > buffer->nr_pages) - return NULL; - - if (pgoff == 0) - return virt_to_page(buffer->user_page); - - return virt_to_page(buffer->data_pages[pgoff - 1]); -} - -static void *perf_mmap_alloc_page(int cpu) -{ - struct page *page; - int node; - - node = (cpu == -1) ? cpu : cpu_to_node(cpu); - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) - return NULL; - - return page_address(page); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - int i; - - size = sizeof(struct perf_buffer); - size += nr_pages * sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - buffer->user_page = perf_mmap_alloc_page(cpu); - if (!buffer->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - buffer->data_pages[i] = perf_mmap_alloc_page(cpu); - if (!buffer->data_pages[i]) - goto fail_data_pages; - } - - buffer->nr_pages = nr_pages; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)buffer->data_pages[i]); - - free_page((unsigned long)buffer->user_page); - -fail_user_page: - kfree(buffer); - -fail: - return NULL; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - int i; - - perf_mmap_free_page((unsigned long)buffer->user_page); - for (i = 0; i < buffer->nr_pages; i++) - perf_mmap_free_page((unsigned long)buffer->data_pages[i]); - kfree(buffer); -} - -static inline int page_order(struct perf_buffer *buffer) -{ - return 0; -} - -#else - -/* - * Back perf_mmap() with vmalloc memory. - * - * Required for architectures that have d-cache aliasing issues. 
- */ - -static inline int page_order(struct perf_buffer *buffer) -{ - return buffer->page_order; -} - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > (1UL << page_order(buffer))) - return NULL; - - return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); -} - -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - page->mapping = NULL; -} - -static void perf_buffer_free_work(struct work_struct *work) -{ - struct perf_buffer *buffer; - void *base; - int i, nr; - - buffer = container_of(work, struct perf_buffer, work); - nr = 1 << page_order(buffer); - - base = buffer->user_page; - for (i = 0; i < nr + 1; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - - vfree(base); - kfree(buffer); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - schedule_work(&buffer->work); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - void *all_buf; - - size = sizeof(struct perf_buffer); - size += sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - INIT_WORK(&buffer->work, perf_buffer_free_work); - - all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); - if (!all_buf) - goto fail_all_buf; - - buffer->user_page = all_buf; - buffer->data_pages[0] = all_buf + PAGE_SIZE; - buffer->page_order = ilog2(nr_pages); - buffer->nr_pages = 1; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_all_buf: - kfree(buffer); - -fail: - return NULL; -} - -#endif - -static unsigned long perf_data_size(struct perf_buffer *buffer) -{ - return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); -} - static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *buffer; + struct ring_buffer *rb; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -3613,14 +3405,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) + rb = rcu_dereference(event->rb); + if (!rb) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); + vmf->page = perf_mmap_to_page(rb, vmf->pgoff); if (!vmf->page) goto unlock; @@ -3635,35 +3427,35 @@ unlock: return ret; } -static void perf_buffer_free_rcu(struct rcu_head *rcu_head) +static void rb_free_rcu(struct rcu_head *rcu_head) { - struct perf_buffer *buffer; + struct ring_buffer *rb; - buffer = container_of(rcu_head, struct perf_buffer, rcu_head); - perf_buffer_free(buffer); + rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb_free(rb); } -static struct perf_buffer *perf_buffer_get(struct perf_event *event) +static struct ring_buffer *ring_buffer_get(struct perf_event *event) { - struct perf_buffer *buffer; + struct ring_buffer *rb; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) { - if (!atomic_inc_not_zero(&buffer->refcount)) - buffer = NULL; + rb = rcu_dereference(event->rb); + if (rb) { + if (!atomic_inc_not_zero(&rb->refcount)) + rb = NULL; } rcu_read_unlock(); - return buffer; + return rb; } -static void perf_buffer_put(struct perf_buffer *buffer) +static void ring_buffer_put(struct ring_buffer *rb) { - if (!atomic_dec_and_test(&buffer->refcount)) + if 
(!atomic_dec_and_test(&rb->refcount)) return; - call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); + call_rcu(&rb->rcu_head, rb_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3678,16 +3470,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->buffer); + unsigned long size = perf_data_size(event->rb); struct user_struct *user = event->mmap_user; - struct perf_buffer *buffer = event->buffer; + struct ring_buffer *rb = event->rb; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->buffer, NULL); + rcu_assign_pointer(event->rb, NULL); mutex_unlock(&event->mmap_mutex); - perf_buffer_put(buffer); + ring_buffer_put(rb); free_uid(user); } } @@ -3705,7 +3497,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_buffer *buffer; + struct ring_buffer *rb; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -3714,7 +3506,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the - * same buffer. + * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; @@ -3726,7 +3518,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have buffer pages ensure they're a power-of-two number, so we + * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. 
*/ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -3740,9 +3532,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->buffer) { - if (event->buffer->nr_pages == nr_pages) - atomic_inc(&event->buffer->refcount); + if (event->rb) { + if (event->rb->nr_pages == nr_pages) + atomic_inc(&event->rb->refcount); else ret = -EINVAL; goto unlock; @@ -3772,18 +3564,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->buffer); + WARN_ON(event->rb); if (vma->vm_flags & VM_WRITE) - flags |= PERF_BUFFER_WRITABLE; + flags |= RING_BUFFER_WRITABLE; - buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + rb = rb_alloc(nr_pages, event->attr.wakeup_watermark, event->cpu, flags); - if (!buffer) { + if (!rb) { ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->buffer, buffer); + rcu_assign_pointer(event->rb, rb); atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; @@ -3882,117 +3674,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -/* - * Output - */ -static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!buffer->writable) - return true; - - mask = perf_data_size(buffer) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->buffer->poll, POLL_IN); - - if (handle->nmi) { - handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); - } else - perf_event_wakeup(handle->event); -} - -/* - * We need to ensure a later event_id doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_get_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - - preempt_disable(); - local_inc(&buffer->nest); - handle->wakeup = local_read(&buffer->wakeup); -} - -static void perf_output_put_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - unsigned long head; - -again: - head = local_read(&buffer->head); - - /* - * IRQ/NMI can happen here, which means we can miss a head update. - */ - - if (!local_dec_and_test(&buffer->nest)) - goto out; - - /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the buffer->head read and this - * write. - */ - buffer->user_page->data_head = head; - - /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read buffer->head. 
- */ - if (unlikely(head != local_read(&buffer->head))) { - local_inc(&buffer->nest); - goto again; - } - - if (handle->wakeup != local_read(&buffer->wakeup)) - perf_output_wakeup(handle); - -out: - preempt_enable(); -} - -__always_inline void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct perf_buffer *buffer = handle->buffer; - - handle->page++; - handle->page &= buffer->nr_pages - 1; - handle->addr = buffer->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(buffer); - } - } while (len); -} - static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4023,9 +3704,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } } -static void perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) +void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) { if (event->attr.sample_id_all) __perf_event_header__init_id(header, data, event); @@ -4052,121 +3733,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, perf_output_put(handle, data->cpu_entry); } -static void perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample) +void perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int nmi, int sample) -{ - struct perf_buffer *buffer; - unsigned long tail, offset, head; - int have_lost; - struct perf_sample_data sample_data; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited events we send all the output towards the parent. - */ - if (event->parent) - event = event->parent; - - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto out; - - handle->buffer = buffer; - handle->event = event; - handle->nmi = nmi; - handle->sample = sample; - - if (!buffer->nr_pages) - goto out; - - have_lost = local_read(&buffer->lost); - if (have_lost) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; - } - - perf_output_get_handle(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. 
- */ - tail = ACCESS_ONCE(buffer->user_page->data_tail); - smp_rmb(); - offset = head = local_read(&buffer->head); - head += size; - if (unlikely(!perf_output_space(buffer, tail, offset, head))) - goto fail; - } while (local_cmpxchg(&buffer->head, offset, head) != offset); - - if (head - local_read(&buffer->wakeup) > buffer->watermark) - local_add(buffer->watermark, &buffer->wakeup); - - handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); - handle->page &= buffer->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); - handle->addr = buffer->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; - - if (have_lost) { - lost_event.header.type = PERF_RECORD_LOST; - lost_event.header.misc = 0; - lost_event.id = event->id; - lost_event.lost = local_xchg(&buffer->lost, 0); - - perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); - } - - return 0; - -fail: - local_inc(&buffer->lost); - perf_output_put_handle(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_event *event = handle->event; - struct perf_buffer *buffer = handle->buffer; - - int wakeup_events = event->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = local_inc_return(&buffer->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &buffer->events); - local_inc(&buffer->wakeup); - } - } - - perf_output_put_handle(handle); - rcu_read_unlock(); -} - static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -4187,7 +3761,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); } /* @@ -4217,7 +3791,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); list_for_each_entry(sub, &leader->sibling_list, group_entry) { n = 0; @@ -4229,7 +3803,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); } } @@ -4309,7 +3883,7 @@ void perf_output_sample(struct perf_output_handle *handle, size *= sizeof(u64); - perf_output_copy(handle, data->callchain, size); + __output_copy(handle, data->callchain, size); } else { u64 nr = 0; perf_output_put(handle, nr); @@ -4319,8 +3893,8 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_RAW) { if (data->raw) { perf_output_put(handle, data->raw->size); - perf_output_copy(handle, data->raw->data, - data->raw->size); + __output_copy(handle, data->raw->data, + data->raw->size); } else { struct { u32 size; @@ -4617,7 +4191,7 @@ static void perf_event_comm_output(struct perf_event *event, comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); - perf_output_copy(&handle, comm_event->comm, + __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); @@ 
-4763,7 +4337,7 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.tid = perf_event_tid(event, current); perf_output_put(&handle, mmap_event->event_id); - perf_output_copy(&handle, mmap_event->file_name, + __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); @@ -4819,7 +4393,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (file) { /* - * d_path works from the end of the buffer backwards, so we + * d_path works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ @@ -6346,7 +5920,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_buffer *buffer = NULL, *old_buffer = NULL; + struct ring_buffer *rb = NULL, *old_rb = NULL; int ret = -EINVAL; if (!output_event) @@ -6363,7 +5937,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto out; /* - * If its not a per-cpu buffer, it must be the same task. + * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; @@ -6375,20 +5949,20 @@ set: goto unlock; if (output_event) { - /* get the buffer we want to redirect to */ - buffer = perf_buffer_get(output_event); - if (!buffer) + /* get the rb we want to redirect to */ + rb = ring_buffer_get(output_event); + if (!rb) goto unlock; } - old_buffer = event->buffer; - rcu_assign_pointer(event->buffer, buffer); + old_rb = event->rb; + rcu_assign_pointer(event->rb, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_buffer) - perf_buffer_put(old_buffer); + if (old_rb) + ring_buffer_put(old_rb); out: return ret; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h new file mode 100644 index 00000000000..114f27f3a62 --- /dev/null +++ b/kernel/events/internal.h @@ -0,0 +1,97 @@ +#ifndef _KERNEL_EVENTS_INTERNAL_H +#define _KERNEL_EVENTS_INTERNAL_H + +#define RING_BUFFER_WRITABLE 0x01 + +struct ring_buffer { + atomic_t refcount; + struct rcu_head rcu_head; +#ifdef CONFIG_PERF_USE_VMALLOC + struct work_struct work; + int page_order; /* allocation order */ +#endif + int nr_pages; /* nr of data pages */ + int writable; /* are we writable */ + + atomic_t poll; /* POLL_ for wakeups */ + + local_t head; /* write position */ + local_t nest; /* nested writers */ + local_t events; /* event limit */ + local_t wakeup; /* wakeup stamp */ + local_t lost; /* nr records lost */ + + long watermark; /* wakeup watermark */ + + struct perf_event_mmap_page *user_page; + void *data_pages[0]; +}; + + +extern void rb_free(struct ring_buffer *rb); +extern struct ring_buffer * +rb_alloc(int nr_pages, long watermark, int cpu, int flags); +extern void perf_event_wakeup(struct perf_event *event); + +extern void +perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event); +extern void +perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample); + +extern struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); + +#ifdef CONFIG_PERF_USE_VMALLOC +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. 
+ */ + +static inline int page_order(struct ring_buffer *rb) +{ + return rb->page_order; +} + +#else + +static inline int page_order(struct ring_buffer *rb) +{ + return 0; +} +#endif + +static unsigned long perf_data_size(struct ring_buffer *rb) +{ + return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); +} + +static inline void +__output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + do { + unsigned long size = min_t(unsigned long, handle->size, len); + + memcpy(handle->addr, buf, size); + + len -= size; + handle->addr += size; + buf += size; + handle->size -= size; + if (!handle->size) { + struct ring_buffer *rb = handle->rb; + + handle->page++; + handle->page &= rb->nr_pages - 1; + handle->addr = rb->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(rb); + } + } while (len); +} + +#endif /* _KERNEL_EVENTS_INTERNAL_H */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c new file mode 100644 index 00000000000..fde52595d8f --- /dev/null +++ b/kernel/events/ring_buffer.c @@ -0,0 +1,399 @@ +/* + * Performance events ring-buffer code: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include + +#include "internal.h" + +static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!rb->writable) + return true; + + mask = perf_data_size(rb) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->rb->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + irq_work_queue(&handle->event->pending); + } else + perf_event_wakeup(handle->event); +} + +/* + * We need to ensure a later event_id doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_get_handle(struct perf_output_handle *handle) +{ + struct ring_buffer *rb = handle->rb; + + preempt_disable(); + local_inc(&rb->nest); + handle->wakeup = local_read(&rb->wakeup); +} + +static void perf_output_put_handle(struct perf_output_handle *handle) +{ + struct ring_buffer *rb = handle->rb; + unsigned long head; + +again: + head = local_read(&rb->head); + + /* + * IRQ/NMI can happen here, which means we can miss a head update. + */ + + if (!local_dec_and_test(&rb->nest)) + goto out; + + /* + * Publish the known good head. Rely on the full barrier implied + * by atomic_dec_and_test() order the rb->head read and this + * write. + */ + rb->user_page->data_head = head; + + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read rb->head. 
+ */ + if (unlikely(head != local_read(&rb->head))) { + local_inc(&rb->nest); + goto again; + } + + if (handle->wakeup != local_read(&rb->wakeup)) + perf_output_wakeup(handle); + +out: + preempt_enable(); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct ring_buffer *rb; + unsigned long tail, offset, head; + int have_lost; + struct perf_sample_data sample_data; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + rb = rcu_dereference(event->rb); + if (!rb) + goto out; + + handle->rb = rb; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!rb->nr_pages) + goto out; + + have_lost = local_read(&rb->lost); + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; + } + + perf_output_get_handle(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(rb->user_page->data_tail); + smp_rmb(); + offset = head = local_read(&rb->head); + head += size; + if (unlikely(!perf_output_space(rb, tail, offset, head))) + goto fail; + } while (local_cmpxchg(&rb->head, offset, head) != offset); + + if (head - local_read(&rb->wakeup) > rb->watermark) + local_add(rb->watermark, &rb->wakeup); + + handle->page = offset >> (PAGE_SHIFT + page_order(rb)); + handle->page &= rb->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); + handle->addr = rb->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.id = event->id; + lost_event.lost = local_xchg(&rb->lost, 0); + + perf_output_put(handle, lost_event); + perf_event__output_id_sample(event, handle, &sample_data); + } + + return 0; + +fail: + local_inc(&rb->lost); + perf_output_put_handle(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + __output_copy(handle, buf, len); +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct ring_buffer *rb = handle->rb; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = local_inc_return(&rb->events); + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } + } + + perf_output_put_handle(handle); + rcu_read_unlock(); +} + +static void +ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) +{ + long max_size = perf_data_size(rb); + + if (watermark) + rb->watermark = min(max_size, watermark); + + if (!rb->watermark) + rb->watermark = max_size / 2; + + if (flags & RING_BUFFER_WRITABLE) + rb->writable = 1; + + atomic_set(&rb->refcount, 1); +} + +#ifndef CONFIG_PERF_USE_VMALLOC + +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. 
+ */ + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (pgoff > rb->nr_pages) + return NULL; + + if (pgoff == 0) + return virt_to_page(rb->user_page); + + return virt_to_page(rb->data_pages[pgoff - 1]); +} + +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; + + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); +} + +struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct ring_buffer *rb; + unsigned long size; + int i; + + size = sizeof(struct ring_buffer); + size += nr_pages * sizeof(void *); + + rb = kzalloc(size, GFP_KERNEL); + if (!rb) + goto fail; + + rb->user_page = perf_mmap_alloc_page(cpu); + if (!rb->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + rb->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!rb->data_pages[i]) + goto fail_data_pages; + } + + rb->nr_pages = nr_pages; + + ring_buffer_init(rb, watermark, flags); + + return rb; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)rb->data_pages[i]); + + free_page((unsigned long)rb->user_page); + +fail_user_page: + kfree(rb); + +fail: + return NULL; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +void rb_free(struct ring_buffer *rb) +{ + int i; + + perf_mmap_free_page((unsigned long)rb->user_page); + for (i = 0; i < rb->nr_pages; i++) + perf_mmap_free_page((unsigned long)rb->data_pages[i]); + kfree(rb); +} + +#else + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (pgoff > (1UL << page_order(rb))) + return NULL; + + return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void rb_free_work(struct work_struct *work) +{ + struct ring_buffer *rb; + void *base; + int i, nr; + + rb = container_of(work, struct ring_buffer, work); + nr = 1 << page_order(rb); + + base = rb->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); + kfree(rb); +} + +void rb_free(struct ring_buffer *rb) +{ + schedule_work(&rb->work); +} + +struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct ring_buffer *rb; + unsigned long size; + void *all_buf; + + size = sizeof(struct ring_buffer); + size += sizeof(void *); + + rb = kzalloc(size, GFP_KERNEL); + if (!rb) + goto fail; + + INIT_WORK(&rb->work, rb_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + rb->user_page = all_buf; + rb->data_pages[0] = all_buf + PAGE_SIZE; + rb->page_order = ilog2(nr_pages); + rb->nr_pages = 1; + + ring_buffer_init(rb, watermark, flags); + + return rb; + +fail_all_buf: + kfree(rb); + +fail: + return NULL; +} + +#endif -- cgit v1.2.3-70-g09d2 From 28f65c11f2ffb3957259dece647a24f8ad2e241b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 9 Jun 2011 09:13:32 -0700 Subject: treewide: Convert uses of struct resource to resource_size(ptr) Several fixes as well where the +1 was missing. Done via coccinelle scripts like: @@ struct resource *ptr; @@ - ptr->end - ptr->start + 1 + resource_size(ptr) and some grep and typing. Mostly uncompiled, no cross-compilers. 
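For reference, a minimal sketch (not part of this patch) of what the conversion relies on: resource_size() is the inclusive-range length helper from include/linux/ioport.h, so the "+ 1" that open-coded call sites kept dropping lives in exactly one place:

static inline resource_size_t resource_size(const struct resource *res)
{
	/* struct resource ranges are inclusive of both ends, hence the + 1 */
	return res->end - res->start + 1;
}

A representative before/after, with "res" standing in for any driver's memory resource:

-	base = ioremap(res->start, res->end - res->start + 1);
+	base = ioremap(res->start, resource_size(res));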
Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- arch/arm/common/scoop.c | 2 +- arch/arm/mach-at91/at91sam9261_devices.c | 2 +- arch/arm/mach-mv78xx0/pcie.c | 8 ++++---- arch/arm/mach-u300/core.c | 2 +- arch/arm/plat-mxc/pwm.c | 8 ++++---- arch/arm/plat-s5p/sysmmu.c | 6 +++--- arch/arm/plat-samsung/pm-check.c | 2 +- arch/avr32/kernel/setup.c | 10 +++++----- arch/avr32/mach-at32ap/extint.c | 2 +- arch/avr32/mach-at32ap/hsmc.c | 2 +- arch/avr32/mach-at32ap/intc.c | 2 +- arch/avr32/mach-at32ap/pio.c | 2 +- arch/microblaze/pci/pci-common.c | 2 +- arch/mips/pci/pci-rc32434.c | 2 +- arch/mips/pci/pci-vr41xx.c | 2 +- arch/mips/powertv/asic/asic_devices.c | 10 ++++------ arch/powerpc/include/asm/macio.h | 2 +- arch/powerpc/kernel/machine_kexec.c | 4 ++-- arch/powerpc/kernel/pci-common.c | 2 +- arch/powerpc/platforms/52xx/mpc52xx_pci.c | 8 ++++---- arch/powerpc/platforms/83xx/km83xx.c | 2 +- arch/powerpc/platforms/83xx/mpc832x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc834x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc836x_mds.c | 2 +- arch/powerpc/platforms/83xx/usb.c | 2 +- arch/powerpc/platforms/85xx/sbc8560.c | 2 +- arch/powerpc/platforms/85xx/xes_mpc85xx.c | 2 +- arch/powerpc/platforms/cell/celleb_scc_epci.c | 8 ++++---- arch/powerpc/platforms/cell/celleb_scc_pciex.c | 2 +- arch/powerpc/platforms/cell/spu_manage.c | 2 +- arch/powerpc/platforms/chrp/pci.c | 2 +- arch/powerpc/platforms/pasemi/dma_lib.c | 2 +- arch/powerpc/platforms/powermac/nvram.c | 4 ++-- arch/powerpc/platforms/powermac/pci.c | 6 ++---- arch/powerpc/platforms/powermac/time.c | 2 +- arch/powerpc/sysdev/axonram.c | 2 +- arch/powerpc/sysdev/cpm1.c | 2 +- arch/powerpc/sysdev/cpm_common.c | 2 +- arch/powerpc/sysdev/dart_iommu.c | 2 +- arch/powerpc/sysdev/fsl_msi.c | 2 +- arch/powerpc/sysdev/fsl_pci.c | 12 ++++++------ arch/powerpc/sysdev/fsl_rio.c | 2 +- arch/powerpc/sysdev/ipic.c | 2 +- arch/powerpc/sysdev/mmio_nvram.c | 2 +- arch/powerpc/sysdev/mpc8xx_pic.c | 2 +- arch/powerpc/sysdev/mv64x60_udbg.c | 4 ++-- arch/powerpc/sysdev/ppc4xx_pci.c | 16 ++++++++-------- arch/powerpc/sysdev/qe_lib/qe_ic.c | 2 +- arch/powerpc/sysdev/qe_lib/qe_io.c | 2 +- arch/powerpc/sysdev/xics/icp-native.c | 2 +- arch/sh/kernel/io_trapped.c | 8 ++++---- arch/sh/kernel/machine_kexec.c | 2 +- arch/sparc/kernel/ioport.c | 12 ++++++------ arch/sparc/kernel/pci.c | 6 ++---- arch/tile/kernel/setup.c | 3 +-- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/probe_roms.c | 2 +- drivers/char/bsr.c | 2 +- drivers/char/xilinx_hwicap/xilinx_hwicap.c | 2 +- drivers/dma/mv_xor.c | 5 ++--- drivers/edac/cell_edac.c | 2 +- drivers/edac/mpc85xx_edac.c | 12 ++++++------ drivers/gpio/gpio-u300.c | 8 ++++---- drivers/ide/palm_bk3710.c | 2 +- drivers/ide/tx4939ide.c | 4 ++-- drivers/input/serio/sa1111ps2.c | 6 ++---- drivers/media/video/davinci/vpif.c | 2 +- drivers/media/video/omap24xxcam.c | 5 ++--- drivers/message/i2o/iop.c | 8 ++++---- drivers/mfd/tc6387xb.c | 2 +- drivers/misc/atmel-ssc.c | 2 +- drivers/misc/atmel_pwm.c | 2 +- drivers/mmc/host/dw_mmc.c | 2 +- drivers/mtd/maps/bfin-async-flash.c | 2 +- drivers/mtd/maps/ixp2000.c | 11 ++++++----- drivers/mtd/maps/pxa2xx-flash.c | 2 +- drivers/mtd/nand/atmel_nand.c | 4 ++-- drivers/mtd/nand/bcm_umi_nand.c | 2 +- drivers/mtd/nand/mpc5121_nfc.c | 2 +- drivers/net/bcm63xx_enet.c | 8 ++++---- drivers/net/can/softing/softing_main.c | 2 +- drivers/net/davinci_emac.c | 6 +++--- drivers/net/ethoc.c | 2 +- drivers/net/fec_mpc52xx.c | 7 ++++--- drivers/net/fs_enet/mii-bitbang.c | 4 ++-- drivers/net/fs_enet/mii-fec.c | 2 
+- drivers/net/gianfar_ptp.c | 2 +- drivers/net/ibm_newemac/core.c | 2 +- drivers/net/macb.c | 2 +- drivers/net/mv643xx_eth.c | 2 +- drivers/net/pxa168_eth.c | 2 +- drivers/net/sb1250-mac.c | 2 +- drivers/parport/parport_ax88796.c | 2 +- drivers/pci/hotplug/shpchp_sysfs.c | 21 +++++++++------------ drivers/pcmcia/at91_cf.c | 7 +++---- drivers/pcmcia/electra_cf.c | 4 ++-- drivers/pcmcia/rsrc_iodyn.c | 6 +++--- drivers/pcmcia/rsrc_nonstatic.c | 6 +++--- drivers/pnp/pnpacpi/rsparser.c | 10 +++++----- drivers/pnp/pnpbios/rsparser.c | 12 ++++++------ drivers/rtc/rtc-at32ap700x.c | 2 +- drivers/rtc/rtc-cmos.c | 6 +++--- drivers/rtc/rtc-ds1286.c | 2 +- drivers/rtc/rtc-ds1511.c | 2 +- drivers/rtc/rtc-ds1742.c | 2 +- drivers/rtc/rtc-m48t35.c | 2 +- drivers/rtc/rtc-m48t59.c | 2 +- drivers/rtc/rtc-mrst.c | 5 ++--- drivers/rtc/rtc-puv3.c | 5 ++--- drivers/rtc/rtc-s3c.c | 5 ++--- drivers/staging/generic_serial/ser_a2232.c | 3 ++- drivers/staging/gma500/psb_gtt.c | 8 ++++---- drivers/tty/serial/bfin_5xx.c | 5 ++--- drivers/tty/serial/imx.c | 5 ++--- drivers/tty/serial/m32r_sio.c | 2 +- drivers/tty/serial/omap-serial.c | 6 +++--- drivers/tty/serial/pxa.c | 2 +- drivers/tty/serial/sunsu.c | 2 +- drivers/tty/serial/vt8500_serial.c | 3 +-- drivers/uio/uio_pdrv.c | 2 +- drivers/uio/uio_pdrv_genirq.c | 2 +- drivers/usb/gadget/atmel_usba_udc.c | 2 +- drivers/usb/gadget/fsl_udc_core.c | 6 +++--- drivers/usb/host/ehci-ath79.c | 2 +- drivers/usb/host/ehci-cns3xxx.c | 2 +- drivers/usb/host/ehci-fsl.c | 2 +- drivers/usb/host/ehci-grlib.c | 2 +- drivers/usb/host/ehci-ixp4xx.c | 2 +- drivers/usb/host/ehci-octeon.c | 2 +- drivers/usb/host/ehci-pmcmsp.c | 10 +++++----- drivers/usb/host/ehci-ppc-of.c | 2 +- drivers/usb/host/ehci-w90x900.c | 2 +- drivers/usb/host/ehci-xilinx-of.c | 2 +- drivers/usb/host/fhci-hcd.c | 2 +- drivers/usb/host/ohci-ath79.c | 4 ++-- drivers/usb/host/ohci-cns3xxx.c | 2 +- drivers/usb/host/ohci-da8xx.c | 2 +- drivers/usb/host/ohci-octeon.c | 2 +- drivers/usb/host/ohci-ppc-of.c | 2 +- drivers/usb/host/ohci-ppc-soc.c | 2 +- drivers/usb/host/ohci-sa1111.c | 2 +- drivers/usb/host/ohci-sm501.c | 11 +++++------ drivers/usb/host/ohci-tmio.c | 6 +++--- drivers/usb/host/oxu210hp-hcd.c | 2 +- drivers/usb/host/uhci-grlib.c | 2 +- drivers/usb/host/whci/init.c | 2 +- drivers/uwb/whc-rc.c | 2 +- drivers/video/atmel_lcdfb.c | 4 ++-- drivers/video/aty/atyfb_base.c | 7 ++++--- drivers/video/au1100fb.c | 2 +- drivers/video/cobalt_lcdfb.c | 2 +- drivers/video/controlfb.c | 4 ++-- drivers/video/mb862xx/mb862xxfbdrv.c | 4 ++-- drivers/video/msm/mdp.c | 3 +-- drivers/video/msm/msm_fb.c | 7 +++---- drivers/video/nuc900fb.c | 2 +- drivers/video/platinumfb.c | 5 ++--- drivers/video/pxa168fb.c | 2 +- include/linux/dio.h | 2 +- include/linux/pnp.h | 2 +- include/linux/zorro.h | 2 +- kernel/kexec.c | 2 +- sound/aoa/soundbus/i2sbus/core.c | 9 ++++----- sound/atmel/abdac.c | 2 +- sound/atmel/ac97c.c | 2 +- sound/ppc/pmac.c | 9 +++------ sound/soc/fsl/fsl_ssi.c | 2 +- sound/soc/fsl/mpc5200_dma.c | 2 +- 168 files changed, 308 insertions(+), 333 deletions(-) (limited to 'kernel') diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c index c11af1e4bad..a07b0e763a8 100644 --- a/arch/arm/common/scoop.c +++ b/arch/arm/common/scoop.c @@ -193,7 +193,7 @@ static int __devinit scoop_probe(struct platform_device *pdev) spin_lock_init(&devptr->scoop_lock); inf = pdev->dev.platform_data; - devptr->base = ioremap(mem->start, mem->end - mem->start + 1); + devptr->base = ioremap(mem->start, resource_size(mem)); if (!devptr->base) { 
ret = -ENOMEM; diff --git a/arch/arm/mach-at91/at91sam9261_devices.c b/arch/arm/mach-at91/at91sam9261_devices.c index 3eb4538fcee..14dd666850b 100644 --- a/arch/arm/mach-at91/at91sam9261_devices.c +++ b/arch/arm/mach-at91/at91sam9261_devices.c @@ -525,7 +525,7 @@ void __init at91_add_device_lcdc(struct atmel_lcdfb_info *data) if (ARRAY_SIZE(lcdc_resources) > 2) { void __iomem *fb; struct resource *fb_res = &lcdc_resources[2]; - size_t fb_len = fb_res->end - fb_res->start + 1; + size_t fb_len = resource_size(fb_res); fb = ioremap(fb_res->start, fb_len); if (fb) { diff --git a/arch/arm/mach-mv78xx0/pcie.c b/arch/arm/mach-mv78xx0/pcie.c index a560439dcc3..f27c7d2fa9f 100644 --- a/arch/arm/mach-mv78xx0/pcie.c +++ b/arch/arm/mach-mv78xx0/pcie.c @@ -129,12 +129,12 @@ static void __init mv78xx0_pcie_preinit(void) struct pcie_port *pp = pcie_port + i; mv78xx0_setup_pcie_io_win(win++, pp->res[0].start, - pp->res[0].end - pp->res[0].start + 1, - pp->maj, pp->min); + resource_size(&pp->res[0]), + pp->maj, pp->min); mv78xx0_setup_pcie_mem_win(win++, pp->res[1].start, - pp->res[1].end - pp->res[1].start + 1, - pp->maj, pp->min); + resource_size(&pp->res[1]), + pp->maj, pp->min); } } diff --git a/arch/arm/mach-u300/core.c b/arch/arm/mach-u300/core.c index 513d6abec1f..399c89f14df 100644 --- a/arch/arm/mach-u300/core.c +++ b/arch/arm/mach-u300/core.c @@ -1791,7 +1791,7 @@ static void __init u300_assign_physmem(void) 0 == res->start) { res->start = curr_start; res->end += curr_start; - curr_start += (res->end - res->start + 1); + curr_start += resource_size(res); printk(KERN_INFO "core.c: Mapping RAM " \ "%#x-%#x to device %s:%s\n", diff --git a/arch/arm/plat-mxc/pwm.c b/arch/arm/plat-mxc/pwm.c index 7a61ef8f471..761c3c940a6 100644 --- a/arch/arm/plat-mxc/pwm.c +++ b/arch/arm/plat-mxc/pwm.c @@ -214,14 +214,14 @@ static int __devinit mxc_pwm_probe(struct platform_device *pdev) goto err_free_clk; } - r = request_mem_region(r->start, r->end - r->start + 1, pdev->name); + r = request_mem_region(r->start, resource_size(r), pdev->name); if (r == NULL) { dev_err(&pdev->dev, "failed to request memory resource\n"); ret = -EBUSY; goto err_free_clk; } - pwm->mmio_base = ioremap(r->start, r->end - r->start + 1); + pwm->mmio_base = ioremap(r->start, resource_size(r)); if (pwm->mmio_base == NULL) { dev_err(&pdev->dev, "failed to ioremap() registers\n"); ret = -ENODEV; @@ -236,7 +236,7 @@ static int __devinit mxc_pwm_probe(struct platform_device *pdev) return 0; err_free_mem: - release_mem_region(r->start, r->end - r->start + 1); + release_mem_region(r->start, resource_size(r)); err_free_clk: clk_put(pwm->clk); err_free: @@ -260,7 +260,7 @@ static int __devexit mxc_pwm_remove(struct platform_device *pdev) iounmap(pwm->mmio_base); r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(r->start, r->end - r->start + 1); + release_mem_region(r->start, resource_size(r)); clk_put(pwm->clk); diff --git a/arch/arm/plat-s5p/sysmmu.c b/arch/arm/plat-s5p/sysmmu.c index 54f5eddc921..e1cbc728c77 100644 --- a/arch/arm/plat-s5p/sysmmu.c +++ b/arch/arm/plat-s5p/sysmmu.c @@ -232,8 +232,8 @@ static int s5p_sysmmu_probe(struct platform_device *pdev) goto err_res; } - mem = request_mem_region(res->start, - ((res->end) - (res->start)) + 1, pdev->name); + mem = request_mem_region(res->start, resource_size(res), + pdev->name); if (!mem) { dev_err(dev, "Failed to request the memory region of %s.\n", sysmmu_ips_name[i]); @@ -241,7 +241,7 @@ static int s5p_sysmmu_probe(struct platform_device *pdev) goto err_res; } - 
sysmmusfrs[i] = ioremap(res->start, res->end - res->start + 1); + sysmmusfrs[i] = ioremap(res->start, resource_size(res)); if (!sysmmusfrs[i]) { dev_err(dev, "Failed to ioremap() for %s.\n", sysmmu_ips_name[i]); diff --git a/arch/arm/plat-samsung/pm-check.c b/arch/arm/plat-samsung/pm-check.c index 6b733fafe7c..3cbd62666b1 100644 --- a/arch/arm/plat-samsung/pm-check.c +++ b/arch/arm/plat-samsung/pm-check.c @@ -72,7 +72,7 @@ static void s3c_pm_run_sysram(run_fn_t fn, u32 *arg) static u32 *s3c_pm_countram(struct resource *res, u32 *val) { - u32 size = (u32)(res->end - res->start)+1; + u32 size = (u32)resource_size(res); size += CHECK_CHUNKSIZE-1; size /= CHECK_CHUNKSIZE; diff --git a/arch/avr32/kernel/setup.c b/arch/avr32/kernel/setup.c index bb0974cce4a..b4247f47806 100644 --- a/arch/avr32/kernel/setup.c +++ b/arch/avr32/kernel/setup.c @@ -444,7 +444,7 @@ static unsigned long __init find_bootmap_pfn(const struct resource *mem) { unsigned long bootmap_pages, bootmap_len; - unsigned long node_pages = PFN_UP(mem->end - mem->start + 1); + unsigned long node_pages = PFN_UP(resource_size(mem)); unsigned long bootmap_start; bootmap_pages = bootmem_bootmap_pages(node_pages); @@ -541,10 +541,10 @@ static void __init setup_bootmem(void) */ if (res->start >= PFN_PHYS(first_pfn) && res->end < PFN_PHYS(max_pfn)) - reserve_bootmem_node( - NODE_DATA(node), res->start, - res->end - res->start + 1, - BOOTMEM_DEFAULT); + reserve_bootmem_node(NODE_DATA(node), + res->start, + resource_size(res), + BOOTMEM_DEFAULT); } node_set_online(node); diff --git a/arch/avr32/mach-at32ap/extint.c b/arch/avr32/mach-at32ap/extint.c index fbc2aeaebdd..cfb298d6630 100644 --- a/arch/avr32/mach-at32ap/extint.c +++ b/arch/avr32/mach-at32ap/extint.c @@ -204,7 +204,7 @@ static int __init eic_probe(struct platform_device *pdev) } eic->first_irq = EIM_IRQ_BASE + 32 * pdev->id; - eic->regs = ioremap(regs->start, regs->end - regs->start + 1); + eic->regs = ioremap(regs->start, resource_size(regs)); if (!eic->regs) { dev_dbg(&pdev->dev, "failed to map regs\n"); goto err_ioremap; diff --git a/arch/avr32/mach-at32ap/hsmc.c b/arch/avr32/mach-at32ap/hsmc.c index f7672d3e86b..f66245e6e63 100644 --- a/arch/avr32/mach-at32ap/hsmc.c +++ b/arch/avr32/mach-at32ap/hsmc.c @@ -245,7 +245,7 @@ static int hsmc_probe(struct platform_device *pdev) hsmc->pclk = pclk; hsmc->mck = mck; - hsmc->regs = ioremap(regs->start, regs->end - regs->start + 1); + hsmc->regs = ioremap(regs->start, resource_size(regs)); if (!hsmc->regs) goto out_disable_clocks; diff --git a/arch/avr32/mach-at32ap/intc.c b/arch/avr32/mach-at32ap/intc.c index 3e3646186c9..6c700431e34 100644 --- a/arch/avr32/mach-at32ap/intc.c +++ b/arch/avr32/mach-at32ap/intc.c @@ -107,7 +107,7 @@ void __init init_IRQ(void) clk_enable(pclk); - intc0.regs = ioremap(regs->start, regs->end - regs->start + 1); + intc0.regs = ioremap(regs->start, resource_size(regs)); if (!intc0.regs) { printk(KERN_EMERG "intc: failed to map registers (0x%08lx)\n", (unsigned long)regs->start); diff --git a/arch/avr32/mach-at32ap/pio.c b/arch/avr32/mach-at32ap/pio.c index 2e0aa853a4b..9b39dea6682 100644 --- a/arch/avr32/mach-at32ap/pio.c +++ b/arch/avr32/mach-at32ap/pio.c @@ -461,7 +461,7 @@ void __init at32_init_pio(struct platform_device *pdev) clk_enable(pio->clk); pio->pdev = pdev; - pio->regs = ioremap(regs->start, regs->end - regs->start + 1); + pio->regs = ioremap(regs->start, resource_size(regs)); /* start with irqs disabled and acked */ pio_writel(pio, IDR, ~0UL); diff --git a/arch/microblaze/pci/pci-common.c 
b/arch/microblaze/pci/pci-common.c index 53599067d2f..dd3fae06ece 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -89,7 +89,7 @@ void pcibios_free_controller(struct pci_controller *phb) static resource_size_t pcibios_io_size(const struct pci_controller *hose) { - return hose->io_resource.end - hose->io_resource.start + 1; + return resource_size(&hose->io_resource); } int pcibios_vaddr_is_ioport(void __iomem *address) diff --git a/arch/mips/pci/pci-rc32434.c b/arch/mips/pci/pci-rc32434.c index f31218e17d3..764362ce5e4 100644 --- a/arch/mips/pci/pci-rc32434.c +++ b/arch/mips/pci/pci-rc32434.c @@ -215,7 +215,7 @@ static int __init rc32434_pci_init(void) rc32434_pcibridge_init(); io_map_base = ioremap(rc32434_res_pci_io1.start, - rc32434_res_pci_io1.end - rc32434_res_pci_io1.start + 1); + resource_size(&rc32434_res_pci_io1)); if (!io_map_base) return -ENOMEM; diff --git a/arch/mips/pci/pci-vr41xx.c b/arch/mips/pci/pci-vr41xx.c index 56525711f8b..444b8d8004a 100644 --- a/arch/mips/pci/pci-vr41xx.c +++ b/arch/mips/pci/pci-vr41xx.c @@ -305,7 +305,7 @@ static int __init vr41xx_pciu_init(void) struct resource *res = vr41xx_pci_controller.io_resource; master = setup->master_io; io_map_base = ioremap(master->bus_base_address, - res->end - res->start + 1); + resource_size(res)); if (!io_map_base) return -EBUSY; diff --git a/arch/mips/powertv/asic/asic_devices.c b/arch/mips/powertv/asic/asic_devices.c index e56fa61b399..bce1872249b 100644 --- a/arch/mips/powertv/asic/asic_devices.c +++ b/arch/mips/powertv/asic/asic_devices.c @@ -394,23 +394,21 @@ void __init platform_alloc_bootmem(void) /* Loop through looking for resources that want a particular address */ for (i = 0; gp_resources[i].flags != 0; i++) { - int size = gp_resources[i].end - gp_resources[i].start + 1; + int size = resource_size(&gp_resources[i]); if ((gp_resources[i].start != 0) && ((gp_resources[i].flags & IORESOURCE_MEM) != 0)) { reserve_bootmem(dma_to_phys(gp_resources[i].start), size, 0); - total += gp_resources[i].end - - gp_resources[i].start + 1; + total += resource_size(&gp_resources[i]); pr_info("reserve resource %s at %08x (%u bytes)\n", gp_resources[i].name, gp_resources[i].start, - gp_resources[i].end - - gp_resources[i].start + 1); + resource_size(&gp_resources[i])); } } /* Loop through assigning addresses for those that are left */ for (i = 0; gp_resources[i].flags != 0; i++) { - int size = gp_resources[i].end - gp_resources[i].start + 1; + int size = resource_size(&gp_resources[i]); if ((gp_resources[i].start == 0) && ((gp_resources[i].flags & IORESOURCE_MEM) != 0)) { void *mem = alloc_bootmem_pages(size); diff --git a/arch/powerpc/include/asm/macio.h b/arch/powerpc/include/asm/macio.h index 7ab82c825a0..27af7f8bbb8 100644 --- a/arch/powerpc/include/asm/macio.h +++ b/arch/powerpc/include/asm/macio.h @@ -76,7 +76,7 @@ static inline unsigned long macio_resource_len(struct macio_dev *dev, int resour struct resource *res = &dev->resource[resource_no]; if (res->start == 0 || res->end == 0 || res->end < res->start) return 0; - return res->end - res->start + 1; + return resource_size(res); } extern int macio_enable_devres(struct macio_dev *dev); diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 7ee50f0547c..6658a158995 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -126,7 +126,7 @@ void __init reserve_crashkernel(void) /* We might have got these values via the command line or the * device tree, either
way sanitise them now. */ - crash_size = crashk_res.end - crashk_res.start + 1; + crash_size = resource_size(&crashk_res); #ifndef CONFIG_RELOCATABLE if (crashk_res.start != KDUMP_KERNELBASE) @@ -222,7 +222,7 @@ static void __init export_crashk_values(struct device_node *node) if (crashk_res.start != 0) { prom_add_property(node, &crashk_base_prop); - crashk_size = crashk_res.end - crashk_res.start + 1; + crashk_size = resource_size(&crashk_res); prom_add_property(node, &crashk_size_prop); } } diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 893af2a9cd0..3764e37205f 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -107,7 +107,7 @@ static resource_size_t pcibios_io_size(const struct pci_controller *hose) #ifdef CONFIG_PPC64 return hose->pci_io_size; #else - return hose->io_resource.end - hose->io_resource.start + 1; + return resource_size(&hose->io_resource); #endif } diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pci.c b/arch/powerpc/platforms/52xx/mpc52xx_pci.c index da110bd8834..5f5e6930908 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pci.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pci.c @@ -264,7 +264,7 @@ mpc52xx_pci_setup(struct pci_controller *hose, (unsigned long long)res->flags); out_be32(&pci_regs->iw0btar, MPC52xx_PCI_IWBTAR_TRANSLATION(res->start, res->start, - res->end - res->start + 1)); + resource_size(res))); iwcr0 = MPC52xx_PCI_IWCR_ENABLE | MPC52xx_PCI_IWCR_MEM; if (res->flags & IORESOURCE_PREFETCH) iwcr0 |= MPC52xx_PCI_IWCR_READ_MULTI; @@ -278,7 +278,7 @@ mpc52xx_pci_setup(struct pci_controller *hose, res->start, res->end, res->flags); out_be32(&pci_regs->iw1btar, MPC52xx_PCI_IWBTAR_TRANSLATION(res->start, res->start, - res->end - res->start + 1)); + resource_size(res))); iwcr1 = MPC52xx_PCI_IWCR_ENABLE | MPC52xx_PCI_IWCR_MEM; if (res->flags & IORESOURCE_PREFETCH) iwcr1 |= MPC52xx_PCI_IWCR_READ_MULTI; @@ -300,7 +300,7 @@ mpc52xx_pci_setup(struct pci_controller *hose, out_be32(&pci_regs->iw2btar, MPC52xx_PCI_IWBTAR_TRANSLATION(hose->io_base_phys, res->start, - res->end - res->start + 1)); + resource_size(res))); iwcr2 = MPC52xx_PCI_IWCR_ENABLE | MPC52xx_PCI_IWCR_IO; /* Set all the IWCR fields at once; they're in the same reg */ @@ -402,7 +402,7 @@ mpc52xx_add_bridge(struct device_node *node) hose->ops = &mpc52xx_pci_ops; - pci_regs = ioremap(rsrc.start, rsrc.end - rsrc.start + 1); + pci_regs = ioremap(rsrc.start, resource_size(&rsrc)); if (!pci_regs) return -ENOMEM; diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c index a2b9b9ef124..f8fa2fc3129 100644 --- a/arch/powerpc/platforms/83xx/km83xx.c +++ b/arch/powerpc/platforms/83xx/km83xx.c @@ -101,7 +101,7 @@ static void __init mpc83xx_km_setup_arch(void) __func__); return; } - base = ioremap(res.start, res.end - res.start + 1); + base = ioremap(res.start, resource_size(&res)); /* * IMMR + 0x14A8[4:5] = 11 (clk delay for UCC 2) diff --git a/arch/powerpc/platforms/83xx/mpc832x_mds.c b/arch/powerpc/platforms/83xx/mpc832x_mds.c index ec0b401bc9c..93e60f1f21a 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc832x_mds.c @@ -68,7 +68,7 @@ static void __init mpc832x_sys_setup_arch(void) struct resource res; of_address_to_resource(np, 0, &res); - bcsr_regs = ioremap(res.start, res.end - res.start +1); + bcsr_regs = ioremap(res.start, resource_size(&res)); of_node_put(np); } diff --git a/arch/powerpc/platforms/83xx/mpc834x_mds.c b/arch/powerpc/platforms/83xx/mpc834x_mds.c 
index d0a634b056c..c1b1dc50b32 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc834x_mds.c @@ -53,7 +53,7 @@ static int mpc834xemds_usb_cfg(void) struct resource res; of_address_to_resource(np, 0, &res); - bcsr_regs = ioremap(res.start, res.end - res.start + 1); + bcsr_regs = ioremap(res.start, resource_size(&res)); of_node_put(np); } if (!bcsr_regs) diff --git a/arch/powerpc/platforms/83xx/mpc836x_mds.c b/arch/powerpc/platforms/83xx/mpc836x_mds.c index 09e9d6fb741..81c052b1353 100644 --- a/arch/powerpc/platforms/83xx/mpc836x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc836x_mds.c @@ -76,7 +76,7 @@ static void __init mpc836x_mds_setup_arch(void) struct resource res; of_address_to_resource(np, 0, &res); - bcsr_regs = ioremap(res.start, res.end - res.start +1); + bcsr_regs = ioremap(res.start, resource_size(&res)); of_node_put(np); } diff --git a/arch/powerpc/platforms/83xx/usb.c b/arch/powerpc/platforms/83xx/usb.c index 2c64164722d..1ad748bb39b 100644 --- a/arch/powerpc/platforms/83xx/usb.c +++ b/arch/powerpc/platforms/83xx/usb.c @@ -171,7 +171,7 @@ int mpc831x_usb_cfg(void) of_node_put(np); return ret; } - usb_regs = ioremap(res.start, res.end - res.start + 1); + usb_regs = ioremap(res.start, resource_size(&res)); /* Using on-chip PHY */ if (prop && (!strcmp(prop, "utmi_wide") || diff --git a/arch/powerpc/platforms/85xx/sbc8560.c b/arch/powerpc/platforms/85xx/sbc8560.c index d2dfd465fbf..09ced722175 100644 --- a/arch/powerpc/platforms/85xx/sbc8560.c +++ b/arch/powerpc/platforms/85xx/sbc8560.c @@ -285,7 +285,7 @@ static int __init sbc8560_bdrstcr_init(void) printk(KERN_INFO "sbc8560: Found BRSTCR at i/o 0x%x\n", res.start); - brstcr = ioremap(res.start, res.end - res.start); + brstcr = ioremap(res.start, resource_size(&res)); if(!brstcr) printk(KERN_WARNING "sbc8560: ioremap of brstcr failed.\n"); diff --git a/arch/powerpc/platforms/85xx/xes_mpc85xx.c b/arch/powerpc/platforms/85xx/xes_mpc85xx.c index 0125604d096..a9dc5e79512 100644 --- a/arch/powerpc/platforms/85xx/xes_mpc85xx.c +++ b/arch/powerpc/platforms/85xx/xes_mpc85xx.c @@ -123,7 +123,7 @@ static void xes_mpc85xx_fixups(void) continue; } - l2_base = ioremap(r[0].start, r[0].end - r[0].start + 1); + l2_base = ioremap(r[0].start, resource_size(&r[0])); xes_mpc85xx_configure_l2(l2_base); } diff --git a/arch/powerpc/platforms/cell/celleb_scc_epci.c b/arch/powerpc/platforms/cell/celleb_scc_epci.c index 05b0db3ef63..844c0facb4f 100644 --- a/arch/powerpc/platforms/cell/celleb_scc_epci.c +++ b/arch/powerpc/platforms/cell/celleb_scc_epci.c @@ -393,19 +393,19 @@ static int __init celleb_setup_epci(struct device_node *node, if (of_address_to_resource(node, 0, &r)) goto error; - hose->cfg_addr = ioremap(r.start, (r.end - r.start + 1)); + hose->cfg_addr = ioremap(r.start, resource_size(&r)); if (!hose->cfg_addr) goto error; pr_debug("EPCI: cfg_addr map 0x%016llx->0x%016lx + 0x%016llx\n", - r.start, (unsigned long)hose->cfg_addr, (r.end - r.start + 1)); + r.start, (unsigned long)hose->cfg_addr, resource_size(&r)); if (of_address_to_resource(node, 2, &r)) goto error; - hose->cfg_data = ioremap(r.start, (r.end - r.start + 1)); + hose->cfg_data = ioremap(r.start, resource_size(&r)); if (!hose->cfg_data) goto error; pr_debug("EPCI: cfg_data map 0x%016llx->0x%016lx + 0x%016llx\n", - r.start, (unsigned long)hose->cfg_data, (r.end - r.start + 1)); + r.start, (unsigned long)hose->cfg_data, resource_size(&r)); hose->ops = &celleb_epci_ops; celleb_epci_init(hose); diff --git 
a/arch/powerpc/platforms/cell/celleb_scc_pciex.c b/arch/powerpc/platforms/cell/celleb_scc_pciex.c index a881bbee8de..ae790ac4a58 100644 --- a/arch/powerpc/platforms/cell/celleb_scc_pciex.c +++ b/arch/powerpc/platforms/cell/celleb_scc_pciex.c @@ -494,7 +494,7 @@ static __init int celleb_setup_pciex(struct device_node *node, pr_err("PCIEXC:Failed to get config resource.\n"); return 1; } - phb->cfg_addr = ioremap(r.start, r.end - r.start + 1); + phb->cfg_addr = ioremap(r.start, resource_size(&r)); if (!phb->cfg_addr) { pr_err("PCIEXC:Failed to remap SMMIO region.\n"); return 1; diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index f465d474ad9..4e5c91489c0 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -222,7 +222,7 @@ static int spu_map_resource(struct spu *spu, int nr, return ret; if (phys) *phys = resource.start; - len = resource.end - resource.start + 1; + len = resource_size(&resource); *virt = ioremap(resource.start, len); if (!*virt) return -EINVAL; diff --git a/arch/powerpc/platforms/chrp/pci.c b/arch/powerpc/platforms/chrp/pci.c index 8f67a394b2d..3f65443f171 100644 --- a/arch/powerpc/platforms/chrp/pci.c +++ b/arch/powerpc/platforms/chrp/pci.c @@ -142,7 +142,7 @@ hydra_init(void) return 0; } of_node_put(np); - Hydra = ioremap(r.start, r.end-r.start); + Hydra = ioremap(r.start, resource_size(&r)); printk("Hydra Mac I/O at %llx\n", (unsigned long long)r.start); printk("Hydra Feature_Control was %x", in_le32(&Hydra->Feature_Control)); diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index 321a9b3a2d0..756123bf06a 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -576,7 +576,7 @@ int pasemi_dma_init(void) res.start = 0xfd800000; res.end = res.start + 0x1000; } - dma_status = __ioremap(res.start, res.end-res.start, 0); + dma_status = __ioremap(res.start, resource_size(&res), 0); pci_dev_put(iob_pdev); for (i = 0; i < MAX_TXCH; i++) diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index b1cdcf94aa8..695443bfdb0 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -580,10 +580,10 @@ int __init pmac_nvram_init(void) /* Try to obtain an address */ if (of_address_to_resource(dp, 0, &r1) == 0) { nvram_naddrs = 1; - s1 = (r1.end - r1.start) + 1; + s1 = resource_size(&r1); if (of_address_to_resource(dp, 1, &r2) == 0) { nvram_naddrs = 2; - s2 = (r2.end - r2.start) + 1; + s2 = resource_size(&r2); } } diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c index f33e08d573c..4d4eba32483 100644 --- a/arch/powerpc/platforms/powermac/pci.c +++ b/arch/powerpc/platforms/powermac/pci.c @@ -838,8 +838,7 @@ static void __init setup_u3_ht(struct pci_controller* hose) * into cfg_addr */ hose->cfg_data = ioremap(cfg_res.start, 0x02000000); - hose->cfg_addr = ioremap(self_res.start, - self_res.end - self_res.start + 1); + hose->cfg_addr = ioremap(self_res.start, resource_size(&self_res)); /* * /ht node doesn't expose a "ranges" property, we read the register @@ -1323,8 +1322,7 @@ static void fixup_u4_pcie(struct pci_dev* dev) */ if (r->start >= 0xf0000000 && r->start < 0xf3000000) continue; - if (!region || (r->end - r->start) > - (region->end - region->start)) + if (!region || resource_size(r) > resource_size(region)) region = r; } /* Nothing found, bail */ diff --git 
a/arch/powerpc/platforms/powermac/time.c b/arch/powerpc/platforms/powermac/time.c index 48211ca134c..11c9fce43b5 100644 --- a/arch/powerpc/platforms/powermac/time.c +++ b/arch/powerpc/platforms/powermac/time.c @@ -274,7 +274,7 @@ int __init via_calibrate_decr(void) return 0; } of_node_put(vias); - via = ioremap(rsrc.start, rsrc.end - rsrc.start + 1); + via = ioremap(rsrc.start, resource_size(&rsrc)); if (via == NULL) { printk(KERN_ERR "Failed to map VIA for timer calibration !\n"); return 0; diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index bd0d54060b9..265f0f09395 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -203,7 +203,7 @@ static int axon_ram_probe(struct platform_device *device) goto failed; } - bank->size = resource.end - resource.start + 1; + bank->size = resource_size(&resource); if (bank->size == 0) { dev_err(&device->dev, "No DDR2 memory found for %s%d\n", diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index 350787c83e2..5d7d59a43c4 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -148,7 +148,7 @@ unsigned int cpm_pic_init(void) if (ret) goto end; - cpic_reg = ioremap(res.start, res.end - res.start + 1); + cpic_reg = ioremap(res.start, resource_size(&res)); if (cpic_reg == NULL) goto end; diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c index 2b69aa0315b..d55d0ad0dea 100644 --- a/arch/powerpc/sysdev/cpm_common.c +++ b/arch/powerpc/sysdev/cpm_common.c @@ -115,7 +115,7 @@ int cpm_muram_init(void) max = r.end; rh_attach_region(&cpm_muram_info, r.start - muram_pbase, - r.end - r.start + 1); + resource_size(&r)); } muram_vbase = ioremap(muram_pbase, max - muram_pbase + 1); diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 8e9e06a7ca5..4f2680f431b 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -239,7 +239,7 @@ static int __init dart_init(struct device_node *dart_node) DARTMAP_RPNMASK); /* Map in DART registers */ - dart = ioremap(r.start, r.end - r.start + 1); + dart = ioremap(r.start, resource_size(&r)); if (dart == NULL) panic("DART: Cannot map registers!"); diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 92e78333c47..419a77239bd 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -349,7 +349,7 @@ static int __devinit fsl_of_msi_probe(struct platform_device *dev) goto error_out; } - msi->msi_regs = ioremap(res.start, res.end - res.start + 1); + msi->msi_regs = ioremap(res.start, resource_size(&res)); if (!msi->msi_regs) { dev_err(&dev->dev, "ioremap problem failed\n"); goto error_out; diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 68ca9290df9..ba5cb3fa707 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -64,7 +64,7 @@ static int __init setup_one_atmu(struct ccsr_pci __iomem *pci, { resource_size_t pci_addr = res->start - offset; resource_size_t phys_addr = res->start; - resource_size_t size = res->end - res->start + 1; + resource_size_t size = resource_size(res); u32 flags = 0x80044000; /* enable & mem R/W */ unsigned int i; @@ -108,7 +108,7 @@ static void __init setup_pci_atmu(struct pci_controller *hose, char *name = hose->dn->full_name; pr_debug("PCI memory map start 0x%016llx, size 0x%016llx\n", - (u64)rsrc->start, (u64)rsrc->end - (u64)rsrc->start + 1); + (u64)rsrc->start, (u64)resource_size(rsrc)); if 
(of_device_is_compatible(hose->dn, "fsl,qoriq-pcie-v2.2")) { win_idx = 2; @@ -116,7 +116,7 @@ static void __init setup_pci_atmu(struct pci_controller *hose, end_idx = 3; } - pci = ioremap(rsrc->start, rsrc->end - rsrc->start + 1); + pci = ioremap(rsrc->start, resource_size(rsrc)); if (!pci) { dev_err(hose->parent, "Unable to map ATMU registers\n"); return; @@ -153,9 +153,9 @@ static void __init setup_pci_atmu(struct pci_controller *hose, } else { pr_debug("PCI IO resource start 0x%016llx, size 0x%016llx, " "phy base 0x%016llx.\n", - (u64)hose->io_resource.start, - (u64)hose->io_resource.end - (u64)hose->io_resource.start + 1, - (u64)hose->io_base_phys); + (u64)hose->io_resource.start, + (u64)resource_size(&hose->io_resource), + (u64)hose->io_base_phys); out_be32(&pci->pow[j].potar, (hose->io_resource.start >> 12)); out_be32(&pci->pow[j].potear, 0); out_be32(&pci->pow[j].powbar, (hose->io_base_phys >> 12)); diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 5b206a2fe17..95853386a66 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -1523,7 +1523,7 @@ int fsl_rio_setup(struct platform_device *dev) port->priv = priv; port->phys_efptr = 0x100; - priv->regs_win = ioremap(regs.start, regs.end - regs.start + 1); + priv->regs_win = ioremap(regs.start, resource_size(&regs)); rio_regs_win = priv->regs_win; /* Probe the master port phy type */ diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 7367d17364c..95da897f05a 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -736,7 +736,7 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) return NULL; } - ipic->regs = ioremap(res.start, res.end - res.start + 1); + ipic->regs = ioremap(res.start, resource_size(&res)); ipic->irqhost->host_data = ipic; diff --git a/arch/powerpc/sysdev/mmio_nvram.c b/arch/powerpc/sysdev/mmio_nvram.c index ddc877a3a23..69f5814ae6d 100644 --- a/arch/powerpc/sysdev/mmio_nvram.c +++ b/arch/powerpc/sysdev/mmio_nvram.c @@ -129,7 +129,7 @@ int __init mmio_nvram_init(void) goto out; } nvram_addr = r.start; - mmio_nvram_len = r.end - r.start + 1; + mmio_nvram_len = resource_size(&r); if ( (!mmio_nvram_len) || (!nvram_addr) ) { printk(KERN_WARNING "nvram: address or length is 0\n"); ret = -EIO; diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c index 20924f2246f..22e48e2d71f 100644 --- a/arch/powerpc/sysdev/mpc8xx_pic.c +++ b/arch/powerpc/sysdev/mpc8xx_pic.c @@ -166,7 +166,7 @@ int mpc8xx_pic_init(void) if (ret) goto out; - siu_reg = ioremap(res.start, res.end - res.start + 1); + siu_reg = ioremap(res.start, resource_size(&res)); if (siu_reg == NULL) { ret = -EINVAL; goto out; diff --git a/arch/powerpc/sysdev/mv64x60_udbg.c b/arch/powerpc/sysdev/mv64x60_udbg.c index 2792dc8b038..50a81387e9b 100644 --- a/arch/powerpc/sysdev/mv64x60_udbg.c +++ b/arch/powerpc/sysdev/mv64x60_udbg.c @@ -125,11 +125,11 @@ static void mv64x60_udbg_init(void) of_node_put(np); - mpsc_base = ioremap(r[0].start, r[0].end - r[0].start + 1); + mpsc_base = ioremap(r[0].start, resource_size(&r[0])); if (!mpsc_base) return; - mpsc_intr_cause = ioremap(r[1].start, r[1].end - r[1].start + 1); + mpsc_intr_cause = ioremap(r[1].start, resource_size(&r[1])); if (!mpsc_intr_cause) { iounmap(mpsc_base); return; diff --git a/arch/powerpc/sysdev/ppc4xx_pci.c b/arch/powerpc/sysdev/ppc4xx_pci.c index 156aa7d3625..deda60a7f99 100644 --- a/arch/powerpc/sysdev/ppc4xx_pci.c +++ b/arch/powerpc/sysdev/ppc4xx_pci.c @@ -265,7
+265,7 @@ static void __init ppc4xx_configure_pci_PMMs(struct pci_controller *hose, if (ppc4xx_setup_one_pci_PMM(hose, reg, res->start, res->start - hose->pci_mem_offset, - res->end + 1 - res->start, + resource_size(res), res->flags, j) == 0) { j++; @@ -290,7 +290,7 @@ static void __init ppc4xx_configure_pci_PTMs(struct pci_controller *hose, void __iomem *reg, const struct resource *res) { - resource_size_t size = res->end - res->start + 1; + resource_size_t size = resource_size(res); u32 sa; /* Calculate window size */ @@ -349,7 +349,7 @@ static void __init ppc4xx_probe_pci_bridge(struct device_node *np) bus_range = of_get_property(np, "bus-range", NULL); /* Map registers */ - reg = ioremap(rsrc_reg.start, rsrc_reg.end + 1 - rsrc_reg.start); + reg = ioremap(rsrc_reg.start, resource_size(&rsrc_reg)); if (reg == NULL) { printk(KERN_ERR "%s: Can't map registers !", np->full_name); goto fail; @@ -465,7 +465,7 @@ static void __init ppc4xx_configure_pcix_POMs(struct pci_controller *hose, if (ppc4xx_setup_one_pcix_POM(hose, reg, res->start, res->start - hose->pci_mem_offset, - res->end + 1 - res->start, + resource_size(res), res->flags, j) == 0) { j++; @@ -492,7 +492,7 @@ static void __init ppc4xx_configure_pcix_PIMs(struct pci_controller *hose, int big_pim, int enable_msi_hole) { - resource_size_t size = res->end - res->start + 1; + resource_size_t size = resource_size(res); u32 sa; /* RAM is always at 0 */ @@ -555,7 +555,7 @@ static void __init ppc4xx_probe_pcix_bridge(struct device_node *np) bus_range = of_get_property(np, "bus-range", NULL); /* Map registers */ - reg = ioremap(rsrc_reg.start, rsrc_reg.end + 1 - rsrc_reg.start); + reg = ioremap(rsrc_reg.start, resource_size(&rsrc_reg)); if (reg == NULL) { printk(KERN_ERR "%s: Can't map registers !", np->full_name); goto fail; @@ -1604,7 +1604,7 @@ static void __init ppc4xx_configure_pciex_POMs(struct ppc4xx_pciex_port *port, if (ppc4xx_setup_one_pciex_POM(port, hose, mbase, res->start, res->start - hose->pci_mem_offset, - res->end + 1 - res->start, + resource_size(res), res->flags, j) == 0) { j++; @@ -1639,7 +1639,7 @@ static void __init ppc4xx_configure_pciex_PIMs(struct ppc4xx_pciex_port *port, void __iomem *mbase, struct resource *res) { - resource_size_t size = res->end - res->start + 1; + resource_size_t size = resource_size(res); u64 sa; if (port->endpoint) { diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c index b2acda07220..18e75ca19fe 100644 --- a/arch/powerpc/sysdev/qe_lib/qe_ic.c +++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c @@ -347,7 +347,7 @@ void __init qe_ic_init(struct device_node *node, unsigned int flags, return; } - qe_ic->regs = ioremap(res.start, res.end - res.start + 1); + qe_ic->regs = ioremap(res.start, resource_size(&res)); qe_ic->irqhost->host_data = qe_ic; qe_ic->hc_irq = qe_ic_irq_chip; diff --git a/arch/powerpc/sysdev/qe_lib/qe_io.c b/arch/powerpc/sysdev/qe_lib/qe_io.c index 77e4934b88c..fd1a6c3b172 100644 --- a/arch/powerpc/sysdev/qe_lib/qe_io.c +++ b/arch/powerpc/sysdev/qe_lib/qe_io.c @@ -41,7 +41,7 @@ int par_io_init(struct device_node *np) ret = of_address_to_resource(np, 0, &res); if (ret) return ret; - par_io = ioremap(res.start, res.end - res.start + 1); + par_io = ioremap(res.start, resource_size(&res)); num_ports = of_get_property(np, "num-ports", NULL); if (num_ports) diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 1f15ad43614..039a7820ba7 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ 
b/arch/powerpc/sysdev/xics/icp-native.c @@ -247,7 +247,7 @@ static int __init icp_native_init_one_node(struct device_node *np, return -1; } - if (icp_native_map_one_cpu(*indx, r.start, r.end - r.start)) + if (icp_native_map_one_cpu(*indx, r.start, resource_size(&r))) return -1; (*indx)++; diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index 32c385ef101..0f62f467275 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -58,7 +58,7 @@ int register_trapped_io(struct trapped_io *tiop) for (k = 0; k < tiop->num_resources; k++) { res = tiop->resource + k; - len += roundup((res->end - res->start) + 1, PAGE_SIZE); + len += roundup(resource_size(res), PAGE_SIZE); flags |= res->flags; } @@ -85,7 +85,7 @@ int register_trapped_io(struct trapped_io *tiop) (unsigned long)(tiop->virt_base + len), res->flags & IORESOURCE_IO ? "io" : "mmio", (unsigned long)res->start); - len += roundup((res->end - res->start) + 1, PAGE_SIZE); + len += roundup(resource_size(res), PAGE_SIZE); } tiop->magic = IO_TRAPPED_MAGIC; @@ -128,7 +128,7 @@ void __iomem *match_trapped_io_handler(struct list_head *list, return tiop->virt_base + voffs; } - len = (res->end - res->start) + 1; + len = resource_size(res); voffs += roundup(len, PAGE_SIZE); } } @@ -173,7 +173,7 @@ static unsigned long lookup_address(struct trapped_io *tiop, for (k = 0; k < tiop->num_resources; k++) { res = tiop->resource + k; - len = roundup((res->end - res->start) + 1, PAGE_SIZE); + len = roundup(resource_size(res), PAGE_SIZE); if (address < (vaddr + len)) return res->start + (address - vaddr); vaddr += len; diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index e2a3af31ff9..c5a33f007f8 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -170,7 +170,7 @@ void __init reserve_crashkernel(void) if (crashk_res.end == crashk_res.start) goto disable; - crash_size = PAGE_ALIGN(crashk_res.end - crashk_res.start + 1); + crash_size = PAGE_ALIGN(resource_size(&crashk_res)); if (!crashk_res.start) { unsigned long max = memblock_end_of_DRAM() - memory_limit; crashk_res.start = __memblock_alloc_base(crash_size, PAGE_SIZE, max); diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index 1c9c80a1a86..6ffccd6e015 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -228,7 +228,7 @@ _sparc_ioremap(struct resource *res, u32 bus, u32 pa, int sz) } pa &= PAGE_MASK; - sparc_mapiorange(bus, pa, res->start, res->end - res->start + 1); + sparc_mapiorange(bus, pa, res->start, resource_size(res)); return (void __iomem *)(unsigned long)(res->start + offset); } @@ -240,7 +240,7 @@ static void _sparc_free_io(struct resource *res) { unsigned long plen; - plen = res->end - res->start + 1; + plen = resource_size(res); BUG_ON((plen & (PAGE_SIZE-1)) != 0); sparc_unmapiorange(res->start, plen); release_resource(res); @@ -331,9 +331,9 @@ static void sbus_free_coherent(struct device *dev, size_t n, void *p, } n = PAGE_ALIGN(n); - if ((res->end-res->start)+1 != n) { + if (resource_size(res) != n) { printk("sbus_free_consistent: region 0x%lx asked 0x%zx\n", - (long)((res->end-res->start)+1), n); + (long)resource_size(res), n); return; } @@ -504,9 +504,9 @@ static void pci32_free_coherent(struct device *dev, size_t n, void *p, } n = PAGE_ALIGN(n); - if ((res->end-res->start)+1 != n) { + if (resource_size(res) != n) { printk("pci_free_consistent: region 0x%lx asked 0x%lx\n", - (long)((res->end-res->start)+1), (long)n); + (long)resource_size(res), (long)n); 
return; } diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 713dc91020a..2d1453dd93d 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -820,11 +820,9 @@ static int __pci_mmap_make_offset_bus(struct pci_dev *pdev, struct vm_area_struc unsigned long space_size, user_offset, user_size; if (mmap_state == pci_mmap_io) { - space_size = (pbm->io_space.end - - pbm->io_space.start) + 1; + space_size = resource_size(&pbm->io_space); } else { - space_size = (pbm->mem_space.end - - pbm->mem_space.start) + 1; + space_size = resource_size(&pbm->mem_space); } /* Make sure the request is in range. */ diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 6cdc9ba55fe..5f85d8b34db 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -553,8 +553,7 @@ static void __init setup_bootmem_allocator(void) #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) - reserve_bootmem(crashk_res.start, - crashk_res.end - crashk_res.start + 1, 0); + reserve_bootmem(crashk_res.start, resource_size(&crashk_res), 0); #endif } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index e8c33a30200..726494b5834 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1553,7 +1553,7 @@ static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) continue; /* cover the whole region */ - npages = (r->end - r->start) >> PAGE_SHIFT; + npages = resource_size(r) >> PAGE_SHIFT; npages++; iommu_range_reserve(tbl, r->start, npages); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index ba0a4cce53b..63228035f9d 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -234,7 +234,7 @@ void __init probe_roms(void) /* check for extension rom (ignore length byte!) 
*/ rom = isa_bus_to_virt(extension_rom_resource.start); if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; + length = resource_size(&extension_rom_resource); if (romchecksum(rom, length)) { request_resource(&iomem_resource, &extension_rom_resource); upper = extension_rom_resource.start; diff --git a/drivers/char/bsr.c b/drivers/char/bsr.c index cf39bc08ce0..0c688232aab 100644 --- a/drivers/char/bsr.c +++ b/drivers/char/bsr.c @@ -212,7 +212,7 @@ static int bsr_add_node(struct device_node *bn) cur->bsr_minor = i + total_bsr_devs; cur->bsr_addr = res.start; - cur->bsr_len = res.end - res.start + 1; + cur->bsr_len = resource_size(&res); cur->bsr_bytes = bsr_bytes[i]; cur->bsr_stride = bsr_stride[i]; cur->bsr_dev = MKDEV(bsr_major, i + total_bsr_devs); diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c index 39ccdeada79..e90e1c74fd4 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c @@ -621,7 +621,7 @@ static int __devinit hwicap_setup(struct device *dev, int id, drvdata->mem_start = regs_res->start; drvdata->mem_end = regs_res->end; - drvdata->mem_size = regs_res->end - regs_res->start + 1; + drvdata->mem_size = resource_size(regs_res); if (!request_mem_region(drvdata->mem_start, drvdata->mem_size, DRIVER_NAME)) { diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 954e334e01b..06f9f27dbe7 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1304,8 +1304,7 @@ static int mv_xor_shared_probe(struct platform_device *pdev) if (!res) return -ENODEV; - msp->xor_base = devm_ioremap(&pdev->dev, res->start, - res->end - res->start + 1); + msp->xor_base = devm_ioremap(&pdev->dev, res->start, resource_size(res)); if (!msp->xor_base) return -EBUSY; @@ -1314,7 +1313,7 @@ static int mv_xor_shared_probe(struct platform_device *pdev) return -ENODEV; msp->xor_high_base = devm_ioremap(&pdev->dev, res->start, - res->end - res->start + 1); + resource_size(res)); if (!msp->xor_high_base) return -EBUSY; diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c index db1df59ae2b..9a6a274e692 100644 --- a/drivers/edac/cell_edac.c +++ b/drivers/edac/cell_edac.c @@ -140,7 +140,7 @@ static void __devinit cell_edac_init_csrows(struct mem_ctl_info *mci) if (of_node_to_nid(np) != priv->node) continue; csrow->first_page = r.start >> PAGE_SHIFT; - csrow->nr_pages = (r.end - r.start + 1) >> PAGE_SHIFT; + csrow->nr_pages = resource_size(&r) >> PAGE_SHIFT; csrow->last_page = csrow->first_page + csrow->nr_pages - 1; csrow->mtype = MEM_XDR; csrow->edac_mode = EDAC_SECDED; diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 38ab8e2cd7f..11e1a5dad96 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -538,15 +538,15 @@ static int __devinit mpc85xx_l2_err_probe(struct platform_device *op) /* we only need the error registers */ r.start += 0xe00; - if (!devm_request_mem_region(&op->dev, r.start, - r.end - r.start + 1, pdata->name)) { + if (!devm_request_mem_region(&op->dev, r.start, resource_size(&r), + pdata->name)) { printk(KERN_ERR "%s: Error while requesting mem region\n", __func__); res = -EBUSY; goto err; } - pdata->l2_vbase = devm_ioremap(&op->dev, r.start, r.end - r.start + 1); + pdata->l2_vbase = devm_ioremap(&op->dev, r.start, resource_size(&r)); if (!pdata->l2_vbase) { printk(KERN_ERR "%s: Unable to setup L2 err regs\n", __func__); res = -ENOMEM; @@ -987,15 +987,15 @@ static int __devinit 
mpc85xx_mc_err_probe(struct platform_device *op) goto err; } - if (!devm_request_mem_region(&op->dev, r.start, - r.end - r.start + 1, pdata->name)) { + if (!devm_request_mem_region(&op->dev, r.start, resource_size(&r), + pdata->name)) { printk(KERN_ERR "%s: Error while requesting mem region\n", __func__); res = -EBUSY; goto err; } - pdata->mc_vbase = devm_ioremap(&op->dev, r.start, r.end - r.start + 1); + pdata->mc_vbase = devm_ioremap(&op->dev, r.start, resource_size(&r)); if (!pdata->mc_vbase) { printk(KERN_ERR "%s: Unable to setup MC err regs\n", __func__); res = -ENOMEM; diff --git a/drivers/gpio/gpio-u300.c b/drivers/gpio/gpio-u300.c index d92790140fe..1a86fefd0f8 100644 --- a/drivers/gpio/gpio-u300.c +++ b/drivers/gpio/gpio-u300.c @@ -581,8 +581,8 @@ static int __init gpio_probe(struct platform_device *pdev) if (!memres) goto err_no_resource; - if (request_mem_region(memres->start, memres->end - memres->start, "GPIO Controller") - == NULL) { + if (!request_mem_region(memres->start, resource_size(memres), + "GPIO Controller")) { err = -ENODEV; goto err_no_ioregion; } @@ -640,7 +640,7 @@ static int __init gpio_probe(struct platform_device *pdev) free_irq(gpio_ports[i].irq, &gpio_ports[i]); iounmap(virtbase); err_no_ioremap: - release_mem_region(memres->start, memres->end - memres->start); + release_mem_region(memres->start, resource_size(memres)); err_no_ioregion: err_no_resource: clk_disable(clk); @@ -660,7 +660,7 @@ static int __exit gpio_remove(struct platform_device *pdev) for (i = 0 ; i < U300_GPIO_NUM_PORTS; i++) free_irq(gpio_ports[i].irq, &gpio_ports[i]); iounmap(virtbase); - release_mem_region(memres->start, memres->end - memres->start); + release_mem_region(memres->start, resource_size(memres)); clk_disable(clk); clk_put(clk); return 0; diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index 9e8f4e1b0cc..712c7904d03 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -342,7 +342,7 @@ static int __init palm_bk3710_probe(struct platform_device *pdev) return -ENODEV; } - mem_size = mem->end - mem->start + 1; + mem_size = resource_size(mem); if (request_mem_region(mem->start, mem_size, "palm_bk3710") == NULL) { printk(KERN_ERR "failed to request memory region\n"); return -EBUSY; diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index bed3e39aac9..71c23195497 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -551,10 +551,10 @@ static int __init tx4939ide_probe(struct platform_device *pdev) return -ENODEV; if (!devm_request_mem_region(&pdev->dev, res->start, - res->end - res->start + 1, "tx4938ide")) + resource_size(res), "tx4938ide")) return -EBUSY; mapbase = (unsigned long)devm_ioremap(&pdev->dev, res->start, - res->end - res->start + 1); + resource_size(res)); if (!mapbase) return -EBUSY; memset(&hw, 0, sizeof(hw)); diff --git a/drivers/input/serio/sa1111ps2.c b/drivers/input/serio/sa1111ps2.c index d55874e5d1c..44fc8b4bcd8 100644 --- a/drivers/input/serio/sa1111ps2.c +++ b/drivers/input/serio/sa1111ps2.c @@ -300,8 +300,7 @@ static int __devinit ps2_probe(struct sa1111_dev *dev) out: sa1111_disable_device(ps2if->dev); - release_mem_region(dev->res.start, - dev->res.end - dev->res.start + 1); + release_mem_region(dev->res.start, resource_size(&dev->res)); free: sa1111_set_drvdata(dev, NULL); kfree(ps2if); @@ -317,8 +316,7 @@ static int __devexit ps2_remove(struct sa1111_dev *dev) struct ps2if *ps2if = sa1111_get_drvdata(dev); serio_unregister_port(ps2if->io); - release_mem_region(dev->res.start, - 
dev->res.end - dev->res.start + 1); + release_mem_region(dev->res.start, resource_size(&dev->res)); sa1111_set_drvdata(dev, NULL); kfree(ps2if); diff --git a/drivers/media/video/davinci/vpif.c b/drivers/media/video/davinci/vpif.c index 9f3bfc1eb24..af9680273ff 100644 --- a/drivers/media/video/davinci/vpif.c +++ b/drivers/media/video/davinci/vpif.c @@ -422,7 +422,7 @@ static int __init vpif_probe(struct platform_device *pdev) if (!res) return -ENOENT; - res_len = res->end - res->start + 1; + res_len = resource_size(res); res = request_mem_region(res->start, res_len, res->name); if (!res) diff --git a/drivers/media/video/omap24xxcam.c b/drivers/media/video/omap24xxcam.c index f6626e87dbc..69b60ba5dd7 100644 --- a/drivers/media/video/omap24xxcam.c +++ b/drivers/media/video/omap24xxcam.c @@ -1768,14 +1768,13 @@ static int __devinit omap24xxcam_probe(struct platform_device *pdev) dev_err(cam->dev, "no mem resource?\n"); goto err; } - if (!request_mem_region(mem->start, (mem->end - mem->start) + 1, - pdev->name)) { + if (!request_mem_region(mem->start, resource_size(mem), pdev->name)) { dev_err(cam->dev, "cannot reserve camera register I/O region\n"); goto err; } cam->mmio_base_phys = mem->start; - cam->mmio_size = (mem->end - mem->start) + 1; + cam->mmio_size = resource_size(mem); /* map the region */ cam->mmio_base = (unsigned long) diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c index 090d2a3a654..a8c08f332da 100644 --- a/drivers/message/i2o/iop.c +++ b/drivers/message/i2o/iop.c @@ -681,11 +681,11 @@ static int i2o_iop_systab_set(struct i2o_controller *c) if (root && allocate_resource(root, res, sb->desired_mem_size, sb->desired_mem_size, sb->desired_mem_size, 1 << 20, /* Unspecified, so use 1Mb and play safe */ NULL, NULL) >= 0) { c->mem_alloc = 1; - sb->current_mem_size = 1 + res->end - res->start; + sb->current_mem_size = resource_size(res); sb->current_mem_base = res->start; osm_info("%s: allocated %llu bytes of PCI memory at " "0x%016llX.\n", c->name, - (unsigned long long)(1 + res->end - res->start), + (unsigned long long)resource_size(res), (unsigned long long)res->start); } } @@ -703,11 +703,11 @@ static int i2o_iop_systab_set(struct i2o_controller *c) if (root && allocate_resource(root, res, sb->desired_io_size, sb->desired_io_size, sb->desired_io_size, 1 << 20, /* Unspecified, so use 1Mb and play safe */ NULL, NULL) >= 0) { c->io_alloc = 1; - sb->current_io_size = 1 + res->end - res->start; + sb->current_io_size = resource_size(res); sb->current_mem_base = res->start; osm_info("%s: allocated %llu bytes of PCI I/O at " "0x%016llX.\n", c->name, - (unsigned long long)(1 + res->end - res->start), + (unsigned long long)resource_size(res), (unsigned long long)res->start); } } diff --git a/drivers/mfd/tc6387xb.c b/drivers/mfd/tc6387xb.c index ad715bf49ca..71bc835324d 100644 --- a/drivers/mfd/tc6387xb.c +++ b/drivers/mfd/tc6387xb.c @@ -177,7 +177,7 @@ static int __devinit tc6387xb_probe(struct platform_device *dev) if (ret) goto err_resource; - tc6387xb->scr = ioremap(rscr->start, rscr->end - rscr->start + 1); + tc6387xb->scr = ioremap(rscr->start, resource_size(rscr)); if (!tc6387xb->scr) { ret = -ENOMEM; goto err_ioremap; diff --git a/drivers/misc/atmel-ssc.c b/drivers/misc/atmel-ssc.c index 4afffe610f9..769a4e8e10d 100644 --- a/drivers/misc/atmel-ssc.c +++ b/drivers/misc/atmel-ssc.c @@ -95,7 +95,7 @@ static int __init ssc_probe(struct platform_device *pdev) } ssc->pdev = pdev; - ssc->regs = ioremap(regs->start, regs->end - regs->start + 1); + ssc->regs = 
ioremap(regs->start, resource_size(regs)); if (!ssc->regs) { dev_dbg(&pdev->dev, "ioremap failed\n"); retval = -EINVAL; diff --git a/drivers/misc/atmel_pwm.c b/drivers/misc/atmel_pwm.c index 0f3fb4f03bd..28f5aaa19d4 100644 --- a/drivers/misc/atmel_pwm.c +++ b/drivers/misc/atmel_pwm.c @@ -329,7 +329,7 @@ static int __init pwm_probe(struct platform_device *pdev) p->pdev = pdev; p->mask = *mp; p->irq = irq; - p->base = ioremap(r->start, r->end - r->start + 1); + p->base = ioremap(r->start, resource_size(r)); if (!p->base) goto fail; p->clk = clk_get(&pdev->dev, "pwm_clk"); diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 66dcddb9c20..2a069f908b2 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1595,7 +1595,7 @@ static int dw_mci_probe(struct platform_device *pdev) INIT_LIST_HEAD(&host->queue); ret = -ENOMEM; - host->regs = ioremap(regs->start, regs->end - regs->start + 1); + host->regs = ioremap(regs->start, resource_size(regs)); if (!host->regs) goto err_freehost; diff --git a/drivers/mtd/maps/bfin-async-flash.c b/drivers/mtd/maps/bfin-async-flash.c index d4297a97e10..67815eed2f0 100644 --- a/drivers/mtd/maps/bfin-async-flash.c +++ b/drivers/mtd/maps/bfin-async-flash.c @@ -142,7 +142,7 @@ static int __devinit bfin_flash_probe(struct platform_device *pdev) state->map.write = bfin_flash_write; state->map.copy_to = bfin_flash_copy_to; state->map.bankwidth = pdata->width; - state->map.size = memory->end - memory->start + 1; + state->map.size = resource_size(memory); state->map.virt = (void __iomem *)memory->start; state->map.phys = memory->start; state->map.map_priv_1 = (unsigned long)state; diff --git a/drivers/mtd/maps/ixp2000.c b/drivers/mtd/maps/ixp2000.c index c00b9175ba9..1594a802631 100644 --- a/drivers/mtd/maps/ixp2000.c +++ b/drivers/mtd/maps/ixp2000.c @@ -155,7 +155,7 @@ static int ixp2000_flash_probe(struct platform_device *dev) if (!plat) return -ENODEV; - window_size = dev->resource->end - dev->resource->start + 1; + window_size = resource_size(dev->resource); dev_info(&dev->dev, "Probe of IXP2000 flash(%d banks x %dMiB)\n", ixp_data->nr_banks, ((u32)window_size >> 20)); @@ -194,16 +194,17 @@ static int ixp2000_flash_probe(struct platform_device *dev) info->map.copy_to = ixp2000_flash_copy_to; info->res = request_mem_region(dev->resource->start, - dev->resource->end - dev->resource->start + 1, - dev_name(&dev->dev)); + resource_size(dev->resource), + dev_name(&dev->dev)); if (!info->res) { dev_err(&dev->dev, "Could not reserve memory region\n"); err = -ENOMEM; goto Error; } - info->map.map_priv_1 = (unsigned long) ioremap(dev->resource->start, - dev->resource->end - dev->resource->start + 1); + info->map.map_priv_1 = + (unsigned long)ioremap(dev->resource->start, + resource_size(dev->resource)); if (!info->map.map_priv_1) { dev_err(&dev->dev, "Failed to ioremap flash region\n"); err = -EIO; diff --git a/drivers/mtd/maps/pxa2xx-flash.c b/drivers/mtd/maps/pxa2xx-flash.c index f59d62f74d4..7ae137d4b99 100644 --- a/drivers/mtd/maps/pxa2xx-flash.c +++ b/drivers/mtd/maps/pxa2xx-flash.c @@ -70,7 +70,7 @@ static int __devinit pxa2xx_flash_probe(struct platform_device *pdev) info->map.name = (char *) flash->name; info->map.bankwidth = flash->width; info->map.phys = res->start; - info->map.size = res->end - res->start + 1; + info->map.size = resource_size(res); info->parts = flash->parts; info->nr_parts = flash->nr_parts; diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c index b300705d41c..d4ba1f218e9 100644 --- 
a/drivers/mtd/nand/atmel_nand.c +++ b/drivers/mtd/nand/atmel_nand.c @@ -513,7 +513,7 @@ static int __init atmel_nand_probe(struct platform_device *pdev) host->io_phys = (dma_addr_t)mem->start; - host->io_base = ioremap(mem->start, mem->end - mem->start + 1); + host->io_base = ioremap(mem->start, resource_size(mem)); if (host->io_base == NULL) { printk(KERN_ERR "atmel_nand: ioremap failed\n"); res = -EIO; @@ -547,7 +547,7 @@ static int __init atmel_nand_probe(struct platform_device *pdev) if (no_ecc) nand_chip->ecc.mode = NAND_ECC_NONE; if (hard_ecc && regs) { - host->ecc = ioremap(regs->start, regs->end - regs->start + 1); + host->ecc = ioremap(regs->start, resource_size(regs)); if (host->ecc == NULL) { printk(KERN_ERR "atmel_nand: ioremap failed\n"); res = -EIO; diff --git a/drivers/mtd/nand/bcm_umi_nand.c b/drivers/mtd/nand/bcm_umi_nand.c index 9ec280738a9..8c569e454dc 100644 --- a/drivers/mtd/nand/bcm_umi_nand.c +++ b/drivers/mtd/nand/bcm_umi_nand.c @@ -380,7 +380,7 @@ static int __devinit bcm_umi_nand_probe(struct platform_device *pdev) return -ENXIO; /* map physical address */ - bcm_umi_io_base = ioremap(r->start, r->end - r->start + 1); + bcm_umi_io_base = ioremap(r->start, resource_size(r)); if (!bcm_umi_io_base) { printk(KERN_ERR "ioremap to access BCM UMI NAND chip failed\n"); diff --git a/drivers/mtd/nand/mpc5121_nfc.c b/drivers/mtd/nand/mpc5121_nfc.c index 2f7c930872f..eb1fbac63eb 100644 --- a/drivers/mtd/nand/mpc5121_nfc.c +++ b/drivers/mtd/nand/mpc5121_nfc.c @@ -713,7 +713,7 @@ static int __devinit mpc5121_nfc_probe(struct platform_device *op) } regs_paddr = res.start; - regs_size = res.end - res.start + 1; + regs_size = resource_size(&res); if (!devm_request_mem_region(dev, regs_paddr, regs_size, DRV_NAME)) { dev_err(dev, "Error requesting memory region!\n"); diff --git a/drivers/net/bcm63xx_enet.c b/drivers/net/bcm63xx_enet.c index f1573d492e9..85045cde312 100644 --- a/drivers/net/bcm63xx_enet.c +++ b/drivers/net/bcm63xx_enet.c @@ -1646,7 +1646,7 @@ static int __devinit bcm_enet_probe(struct platform_device *pdev) if (ret) goto out; - iomem_size = res_mem->end - res_mem->start + 1; + iomem_size = resource_size(res_mem); if (!request_mem_region(res_mem->start, iomem_size, "bcm63xx_enet")) { ret = -EBUSY; goto out; @@ -1861,7 +1861,7 @@ static int __devexit bcm_enet_remove(struct platform_device *pdev) /* release device resources */ iounmap(priv->base); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(res->start, res->end - res->start + 1); + release_mem_region(res->start, resource_size(res)); /* disable hw block clocks */ if (priv->phy_clk) { @@ -1897,7 +1897,7 @@ static int __devinit bcm_enet_shared_probe(struct platform_device *pdev) if (!res) return -ENODEV; - iomem_size = res->end - res->start + 1; + iomem_size = resource_size(res); if (!request_mem_region(res->start, iomem_size, "bcm63xx_enet_dma")) return -EBUSY; @@ -1915,7 +1915,7 @@ static int __devexit bcm_enet_shared_remove(struct platform_device *pdev) iounmap(bcm_enet_shared_base); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(res->start, res->end - res->start + 1); + release_mem_region(res->start, resource_size(res)); return 0; } diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c index 60a49e5a2a5..f76e88e7428 100644 --- a/drivers/net/can/softing/softing_main.c +++ b/drivers/net/can/softing/softing_main.c @@ -799,7 +799,7 @@ static __devinit int softing_pdev_probe(struct platform_device *pdev) if (!pres) goto 
platform_resource_failed; card->dpram_phys = pres->start; - card->dpram_size = pres->end - pres->start + 1; + card->dpram_size = resource_size(pres); card->dpram = ioremap_nocache(card->dpram_phys, card->dpram_size); if (!card->dpram) { dev_alert(&card->pdev->dev, "dpram ioremap failed\n"); diff --git a/drivers/net/davinci_emac.c b/drivers/net/davinci_emac.c index dcc4a170b0f..c35ba5fba8f 100644 --- a/drivers/net/davinci_emac.c +++ b/drivers/net/davinci_emac.c @@ -1821,7 +1821,7 @@ static int __devinit davinci_emac_probe(struct platform_device *pdev) } priv->emac_base_phys = res->start + pdata->ctrl_reg_offset; - size = res->end - res->start + 1; + size = resource_size(res); if (!request_mem_region(res->start, size, ndev->name)) { dev_err(&pdev->dev, "failed request_mem_region() for regs\n"); rc = -ENXIO; @@ -1926,7 +1926,7 @@ no_irq_res: cpdma_ctlr_destroy(priv->dma); no_dma: res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(res->start, res->end - res->start + 1); + release_mem_region(res->start, resource_size(res)); iounmap(priv->remap_addr); probe_quit: @@ -1960,7 +1960,7 @@ static int __devexit davinci_emac_remove(struct platform_device *pdev) cpdma_chan_destroy(priv->rxchan); cpdma_ctlr_destroy(priv->dma); - release_mem_region(res->start, res->end - res->start + 1); + release_mem_region(res->start, resource_size(res)); unregister_netdev(ndev); iounmap(priv->remap_addr); diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c index a83dd312c3a..15e4a71fbf1 100644 --- a/drivers/net/ethoc.c +++ b/drivers/net/ethoc.c @@ -965,7 +965,7 @@ static int __devinit ethoc_probe(struct platform_device *pdev) priv = netdev_priv(netdev); priv->netdev = netdev; priv->dma_alloc = 0; - priv->io_region_size = mmio->end - mmio->start + 1; + priv->io_region_size = resource_size(mmio); priv->iobase = devm_ioremap_nocache(&pdev->dev, netdev->base_addr, resource_size(mmio)); diff --git a/drivers/net/fec_mpc52xx.c b/drivers/net/fec_mpc52xx.c index 9f81b1ac130..fe57eee8a67 100644 --- a/drivers/net/fec_mpc52xx.c +++ b/drivers/net/fec_mpc52xx.c @@ -867,10 +867,11 @@ static int __devinit mpc52xx_fec_probe(struct platform_device *op) "Error while parsing device node resource\n" ); goto err_netdev; } - if ((mem.end - mem.start + 1) < sizeof(struct mpc52xx_fec)) { + if (resource_size(&mem) < sizeof(struct mpc52xx_fec)) { printk(KERN_ERR DRIVER_NAME - " - invalid resource size (%lx < %x), check mpc52xx_devices.c\n", - (unsigned long)(mem.end - mem.start + 1), sizeof(struct mpc52xx_fec)); + " - invalid resource size (%lx < %x), check mpc52xx_devices.c\n", + (unsigned long)resource_size(&mem), + sizeof(struct mpc52xx_fec)); rv = -EINVAL; goto err_netdev; } diff --git a/drivers/net/fs_enet/mii-bitbang.c b/drivers/net/fs_enet/mii-bitbang.c index ad297544071..b09270b5d0a 100644 --- a/drivers/net/fs_enet/mii-bitbang.c +++ b/drivers/net/fs_enet/mii-bitbang.c @@ -120,7 +120,7 @@ static int __devinit fs_mii_bitbang_init(struct mii_bus *bus, if (ret) return ret; - if (res.end - res.start < 13) + if (resource_size(&res) <= 13) return -ENODEV; /* This should really encode the pin number as well, but all @@ -139,7 +139,7 @@ static int __devinit fs_mii_bitbang_init(struct mii_bus *bus, return -ENODEV; mdc_pin = *data; - bitbang->dir = ioremap(res.start, res.end - res.start + 1); + bitbang->dir = ioremap(res.start, resource_size(&res)); if (!bitbang->dir) return -ENOMEM; diff --git a/drivers/net/fs_enet/mii-fec.c b/drivers/net/fs_enet/mii-fec.c index 6a2e150e75b..e0e9d6c35d8 100644 --- 
a/drivers/net/fs_enet/mii-fec.c +++ b/drivers/net/fs_enet/mii-fec.c @@ -136,7 +136,7 @@ static int __devinit fs_enet_mdio_probe(struct platform_device *ofdev) snprintf(new_bus->id, MII_BUS_ID_SIZE, "%x", res.start); - fec->fecp = ioremap(res.start, res.end - res.start + 1); + fec->fecp = ioremap(res.start, resource_size(&res)); if (!fec->fecp) goto out_fec; diff --git a/drivers/net/gianfar_ptp.c b/drivers/net/gianfar_ptp.c index d8e175382d1..1c97861596f 100644 --- a/drivers/net/gianfar_ptp.c +++ b/drivers/net/gianfar_ptp.c @@ -491,7 +491,7 @@ static int gianfar_ptp_probe(struct platform_device *dev) spin_lock_init(&etsects->lock); etsects->regs = ioremap(etsects->rsrc->start, - 1 + etsects->rsrc->end - etsects->rsrc->start); + resource_size(etsects->rsrc)); if (!etsects->regs) { pr_err("ioremap ptp registers failed\n"); goto no_ioremap; diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c index 079450fe5e9..725399ea069 100644 --- a/drivers/net/ibm_newemac/core.c +++ b/drivers/net/ibm_newemac/core.c @@ -2770,7 +2770,7 @@ static int __devinit emac_probe(struct platform_device *ofdev) } // TODO : request_mem_region dev->emacp = ioremap(dev->rsrc_regs.start, - dev->rsrc_regs.end - dev->rsrc_regs.start + 1); + resource_size(&dev->rsrc_regs)); if (dev->emacp == NULL) { printk(KERN_ERR "%s: Can't map device registers!\n", np->full_name); diff --git a/drivers/net/macb.c b/drivers/net/macb.c index 6c6a02869df..27125cdd7e0 100644 --- a/drivers/net/macb.c +++ b/drivers/net/macb.c @@ -1169,7 +1169,7 @@ static int __init macb_probe(struct platform_device *pdev) clk_enable(bp->hclk); #endif - bp->regs = ioremap(regs->start, regs->end - regs->start + 1); + bp->regs = ioremap(regs->start, resource_size(regs)); if (!bp->regs) { dev_err(&pdev->dev, "failed to map registers, aborting.\n"); err = -ENOMEM; diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index a5d9b1c310b..b7564825738 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -2593,7 +2593,7 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev) if (msp == NULL) goto out; - msp->base = ioremap(res->start, res->end - res->start + 1); + msp->base = ioremap(res->start, resource_size(res)); if (msp->base == NULL) goto out_free; diff --git a/drivers/net/pxa168_eth.c b/drivers/net/pxa168_eth.c index 89f7540d90f..df1292eb9c2 100644 --- a/drivers/net/pxa168_eth.c +++ b/drivers/net/pxa168_eth.c @@ -1502,7 +1502,7 @@ static int pxa168_eth_probe(struct platform_device *pdev) err = -ENODEV; goto err_netdev; } - pep->base = ioremap(res->start, res->end - res->start + 1); + pep->base = ioremap(res->start, resource_size(res)); if (pep->base == NULL) { err = -ENOMEM; goto err_netdev; diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c index 68d50429ddf..ea65f7ec360 100644 --- a/drivers/net/sb1250-mac.c +++ b/drivers/net/sb1250-mac.c @@ -2597,7 +2597,7 @@ static int __devinit sbmac_probe(struct platform_device *pldev) res = platform_get_resource(pldev, IORESOURCE_MEM, 0); BUG_ON(!res); - sbm_base = ioremap_nocache(res->start, res->end - res->start + 1); + sbm_base = ioremap_nocache(res->start, resource_size(res)); if (!sbm_base) { printk(KERN_ERR "%s: unable to map device registers\n", dev_name(&pldev->dev)); diff --git a/drivers/parport/parport_ax88796.c b/drivers/parport/parport_ax88796.c index 2c5ac2bf5c5..844f6137970 100644 --- a/drivers/parport/parport_ax88796.c +++ b/drivers/parport/parport_ax88796.c @@ -293,7 +293,7 @@ static int parport_ax88796_probe(struct 
platform_device *pdev) goto exit_mem; } - size = (res->end - res->start) + 1; + size = resource_size(res); spacing = size / 3; dd->io = request_mem_region(res->start, size, pdev->name); diff --git a/drivers/pci/hotplug/shpchp_sysfs.c b/drivers/pci/hotplug/shpchp_sysfs.c index 071b7dc0094..efa30da1ae8 100644 --- a/drivers/pci/hotplug/shpchp_sysfs.c +++ b/drivers/pci/hotplug/shpchp_sysfs.c @@ -50,29 +50,26 @@ static ssize_t show_ctrl (struct device *dev, struct device_attribute *attr, cha pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_MEM) && !(res->flags & IORESOURCE_PREFETCH)) { - out += sprintf(out, "start = %8.8llx, " - "length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)(res->end - res->start)); + out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } out += sprintf(out, "Free resources: prefetchable memory\n"); pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_MEM) && (res->flags & IORESOURCE_PREFETCH)) { - out += sprintf(out, "start = %8.8llx, " - "length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)(res->end - res->start)); + out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } out += sprintf(out, "Free resources: IO\n"); pci_bus_for_each_resource(bus, res, index) { if (res && (res->flags & IORESOURCE_IO)) { - out += sprintf(out, "start = %8.8llx, " - "length = %8.8llx\n", - (unsigned long long)res->start, - (unsigned long long)(res->end - res->start)); + out += sprintf(out, "start = %8.8llx, length = %8.8llx\n", + (unsigned long long)res->start, + (unsigned long long)resource_size(res)); } } out += sprintf(out, "Free resources: bus numbers\n"); diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c index fb33fa42d24..4902206f53d 100644 --- a/drivers/pcmcia/at91_cf.c +++ b/drivers/pcmcia/at91_cf.c @@ -283,8 +283,7 @@ static int __init at91_cf_probe(struct platform_device *pdev) } /* reserve chip-select regions */ - if (!request_mem_region(io->start, io->end + 1 - io->start, - driver_name)) { + if (!request_mem_region(io->start, resource_size(io), driver_name)) { status = -ENXIO; goto fail1; } @@ -308,7 +307,7 @@ static int __init at91_cf_probe(struct platform_device *pdev) return 0; fail2: - release_mem_region(io->start, io->end + 1 - io->start); + release_mem_region(io->start, resource_size(io)); fail1: if (cf->socket.io_offset) iounmap((void __iomem *) cf->socket.io_offset); @@ -339,7 +338,7 @@ static int __exit at91_cf_remove(struct platform_device *pdev) struct resource *io = cf->socket.io[0].res; pcmcia_unregister_socket(&cf->socket); - release_mem_region(io->start, io->end + 1 - io->start); + release_mem_region(io->start, resource_size(io)); iounmap((void __iomem *) cf->socket.io_offset); if (board->irq_pin) { free_irq(board->irq_pin, cf); diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index 6defd4a8168..06ad3e5e7d3 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -209,9 +209,9 @@ static int __devinit electra_cf_probe(struct platform_device *ofdev) cf->ofdev = ofdev; cf->mem_phys = mem.start; - cf->mem_size = PAGE_ALIGN(mem.end - mem.start); + cf->mem_size = PAGE_ALIGN(resource_size(&mem)); cf->mem_base = ioremap(cf->mem_phys, cf->mem_size); - cf->io_size = PAGE_ALIGN(io.end - io.start); + cf->io_size = 
PAGE_ALIGN(resource_size(&io)); area = __get_vm_area(cf->io_size, 0, PHB_IO_BASE, PHB_IO_END); if (area == NULL) diff --git a/drivers/pcmcia/rsrc_iodyn.c b/drivers/pcmcia/rsrc_iodyn.c index 523eb691c30..f53c237bda2 100644 --- a/drivers/pcmcia/rsrc_iodyn.c +++ b/drivers/pcmcia/rsrc_iodyn.c @@ -135,7 +135,7 @@ static int iodyn_find_io(struct pcmcia_socket *s, unsigned int attr, try = res->end + 1; if ((*base == 0) || (*base == try)) { if (adjust_resource(s->io[i].res, res->start, - res->end - res->start + num + 1)) + resource_size(res) + num)) continue; *base = try; s->io[i].InUse += num; @@ -147,8 +147,8 @@ static int iodyn_find_io(struct pcmcia_socket *s, unsigned int attr, try = res->start - num; if ((*base == 0) || (*base == try)) { if (adjust_resource(s->io[i].res, - res->start - num, - res->end - res->start + num + 1)) + res->start - num, + resource_size(res) + num)) continue; *base = try; s->io[i].InUse += num; diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c index b187555d438..9da9656242a 100644 --- a/drivers/pcmcia/rsrc_nonstatic.c +++ b/drivers/pcmcia/rsrc_nonstatic.c @@ -770,7 +770,7 @@ static int nonstatic_find_io(struct pcmcia_socket *s, unsigned int attr, res->end + num); if (!ret) { ret = adjust_resource(s->io[i].res, res->start, - res->end - res->start + num + 1); + resource_size(res) + num); if (ret) continue; *base = try; @@ -788,8 +788,8 @@ static int nonstatic_find_io(struct pcmcia_socket *s, unsigned int attr, res->end); if (!ret) { ret = adjust_resource(s->io[i].res, - res->start - num, - res->end - res->start + num + 1); + res->start - num, + resource_size(res) + num); if (ret) continue; *base = try; diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index 100e4d9372f..1a6937d9118 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -1018,7 +1018,7 @@ static void pnpacpi_encode_io(struct pnp_dev *dev, io->minimum = p->start; io->maximum = p->end; io->alignment = 0; /* Correct? */ - io->address_length = p->end - p->start + 1; + io->address_length = resource_size(p); } else { io->minimum = 0; io->address_length = 0; @@ -1036,7 +1036,7 @@ static void pnpacpi_encode_fixed_io(struct pnp_dev *dev, if (pnp_resource_enabled(p)) { fixed_io->address = p->start; - fixed_io->address_length = p->end - p->start + 1; + fixed_io->address_length = resource_size(p); } else { fixed_io->address = 0; fixed_io->address_length = 0; @@ -1059,7 +1059,7 @@ static void pnpacpi_encode_mem24(struct pnp_dev *dev, memory24->minimum = p->start; memory24->maximum = p->end; memory24->alignment = 0; - memory24->address_length = p->end - p->start + 1; + memory24->address_length = resource_size(p); } else { memory24->minimum = 0; memory24->address_length = 0; @@ -1083,7 +1083,7 @@ static void pnpacpi_encode_mem32(struct pnp_dev *dev, memory32->minimum = p->start; memory32->maximum = p->end; memory32->alignment = 0; - memory32->address_length = p->end - p->start + 1; + memory32->address_length = resource_size(p); } else { memory32->minimum = 0; memory32->alignment = 0; @@ -1106,7 +1106,7 @@ static void pnpacpi_encode_fixed_mem32(struct pnp_dev *dev, p->flags & IORESOURCE_MEM_WRITEABLE ? 
ACPI_READ_WRITE_MEMORY : ACPI_READ_ONLY_MEMORY; fixed_memory32->address = p->start; - fixed_memory32->address_length = p->end - p->start + 1; + fixed_memory32->address_length = resource_size(p); } else { fixed_memory32->address = 0; fixed_memory32->address_length = 0; diff --git a/drivers/pnp/pnpbios/rsparser.c b/drivers/pnp/pnpbios/rsparser.c index cb1f47bfee9..cca2f9f9f3e 100644 --- a/drivers/pnp/pnpbios/rsparser.c +++ b/drivers/pnp/pnpbios/rsparser.c @@ -505,7 +505,7 @@ static void pnpbios_encode_mem(struct pnp_dev *dev, unsigned char *p, if (pnp_resource_enabled(res)) { base = res->start; - len = res->end - res->start + 1; + len = resource_size(res); } else { base = 0; len = 0; @@ -529,7 +529,7 @@ static void pnpbios_encode_mem32(struct pnp_dev *dev, unsigned char *p, if (pnp_resource_enabled(res)) { base = res->start; - len = res->end - res->start + 1; + len = resource_size(res); } else { base = 0; len = 0; @@ -559,7 +559,7 @@ static void pnpbios_encode_fixed_mem32(struct pnp_dev *dev, unsigned char *p, if (pnp_resource_enabled(res)) { base = res->start; - len = res->end - res->start + 1; + len = resource_size(res); } else { base = 0; len = 0; @@ -617,7 +617,7 @@ static void pnpbios_encode_port(struct pnp_dev *dev, unsigned char *p, if (pnp_resource_enabled(res)) { base = res->start; - len = res->end - res->start + 1; + len = resource_size(res); } else { base = 0; len = 0; @@ -636,11 +636,11 @@ static void pnpbios_encode_fixed_port(struct pnp_dev *dev, unsigned char *p, struct resource *res) { unsigned long base = res->start; - unsigned long len = res->end - res->start + 1; + unsigned long len = resource_size(res); if (pnp_resource_enabled(res)) { base = res->start; - len = res->end - res->start + 1; + len = resource_size(res); } else { base = 0; len = 0; diff --git a/drivers/rtc/rtc-at32ap700x.c b/drivers/rtc/rtc-at32ap700x.c index e725d51e773..8dd08305aae 100644 --- a/drivers/rtc/rtc-at32ap700x.c +++ b/drivers/rtc/rtc-at32ap700x.c @@ -223,7 +223,7 @@ static int __init at32_rtc_probe(struct platform_device *pdev) } rtc->irq = irq; - rtc->regs = ioremap(regs->start, regs->end - regs->start + 1); + rtc->regs = ioremap(regs->start, resource_size(regs)); if (!rtc->regs) { ret = -ENOMEM; dev_dbg(&pdev->dev, "could not map I/O memory\n"); diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 911e75cdc12..05beb6c1ca7 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -606,7 +606,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) * (needing ioremap etc), not i/o space resources like this ... 
*/ ports = request_region(ports->start, - ports->end + 1 - ports->start, + resource_size(ports), driver_name); if (!ports) { dev_dbg(dev, "i/o registers already in use\n"); @@ -750,7 +750,7 @@ cleanup1: cmos_rtc.dev = NULL; rtc_device_unregister(cmos_rtc.rtc); cleanup0: - release_region(ports->start, ports->end + 1 - ports->start); + release_region(ports->start, resource_size(ports)); return retval; } @@ -779,7 +779,7 @@ static void __exit cmos_do_remove(struct device *dev) cmos->rtc = NULL; ports = cmos->iomem; - release_region(ports->start, ports->end + 1 - ports->start); + release_region(ports->start, resource_size(ports)); cmos->iomem = NULL; cmos->dev = NULL; diff --git a/drivers/rtc/rtc-ds1286.c b/drivers/rtc/rtc-ds1286.c index 47e681df31e..68e6caf2549 100644 --- a/drivers/rtc/rtc-ds1286.c +++ b/drivers/rtc/rtc-ds1286.c @@ -343,7 +343,7 @@ static int __devinit ds1286_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - priv->size = res->end - res->start + 1; + priv->size = resource_size(res); if (!request_mem_region(res->start, priv->size, pdev->name)) { ret = -EBUSY; goto out; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index fbabc773dde..568ad30617e 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -490,7 +490,7 @@ ds1511_rtc_probe(struct platform_device *pdev) pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->size = res->end - res->start + 1; + pdata->size = resource_size(res); if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, pdev->name)) return -EBUSY; diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 042630c90dd..d84a448dd75 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -173,7 +173,7 @@ static int __devinit ds1742_rtc_probe(struct platform_device *pdev) pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->size = res->end - res->start + 1; + pdata->size = resource_size(res); if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, pdev->name)) return -EBUSY; diff --git a/drivers/rtc/rtc-m48t35.c b/drivers/rtc/rtc-m48t35.c index 7410875e583..8e2a24e33ed 100644 --- a/drivers/rtc/rtc-m48t35.c +++ b/drivers/rtc/rtc-m48t35.c @@ -154,7 +154,7 @@ static int __devinit m48t35_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - priv->size = res->end - res->start + 1; + priv->size = resource_size(res); /* * kludge: remove the #ifndef after ioc3 resource * conflicts are resolved diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 3978f4caf72..28365388fb6 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -433,7 +433,7 @@ static int __devinit m48t59_rtc_probe(struct platform_device *pdev) if (!m48t59->ioaddr) { /* ioaddr not mapped externally */ - m48t59->ioaddr = ioremap(res->start, res->end - res->start + 1); + m48t59->ioaddr = ioremap(res->start, resource_size(res)); if (!m48t59->ioaddr) goto out; } diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index 0cec5650d56..d33544802a2 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -332,9 +332,8 @@ vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq) if (!iomem) return -ENODEV; - iomem = request_mem_region(iomem->start, - iomem->end + 1 - iomem->start, - driver_name); + iomem = request_mem_region(iomem->start, resource_size(iomem), + driver_name); if (!iomem) { dev_dbg(dev, "i/o mem already in use.\n"); return -EBUSY; 
diff --git a/drivers/rtc/rtc-puv3.c b/drivers/rtc/rtc-puv3.c index 46f14b82f3a..b3eba3cddd4 100644 --- a/drivers/rtc/rtc-puv3.c +++ b/drivers/rtc/rtc-puv3.c @@ -267,9 +267,8 @@ static int puv3_rtc_probe(struct platform_device *pdev) return -ENOENT; } - puv3_rtc_mem = request_mem_region(res->start, - res->end-res->start+1, - pdev->name); + puv3_rtc_mem = request_mem_region(res->start, resource_size(res), + pdev->name); if (puv3_rtc_mem == NULL) { dev_err(&pdev->dev, "failed to reserve memory region\n"); diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 16512ecae31..2a65e85e0f5 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -455,8 +455,7 @@ static int __devinit s3c_rtc_probe(struct platform_device *pdev) return -ENOENT; } - s3c_rtc_mem = request_mem_region(res->start, - res->end-res->start+1, + s3c_rtc_mem = request_mem_region(res->start, resource_size(res), pdev->name); if (s3c_rtc_mem == NULL) { @@ -465,7 +464,7 @@ static int __devinit s3c_rtc_probe(struct platform_device *pdev) goto err_nores; } - s3c_rtc_base = ioremap(res->start, res->end - res->start + 1); + s3c_rtc_base = ioremap(res->start, resource_size(res)); if (s3c_rtc_base == NULL) { dev_err(&pdev->dev, "failed ioremap()\n"); ret = -EINVAL; diff --git a/drivers/staging/generic_serial/ser_a2232.c b/drivers/staging/generic_serial/ser_a2232.c index 3f47c2ead8e..0c08e1c2558 100644 --- a/drivers/staging/generic_serial/ser_a2232.c +++ b/drivers/staging/generic_serial/ser_a2232.c @@ -746,7 +746,8 @@ static int __init a2232board_init(void) zd_a2232[nr_a2232] = z; boardaddr = ZTWO_VADDR( z->resource.start ); - printk("Board is located at address 0x%x, size is 0x%x.\n", boardaddr, (unsigned int) ((z->resource.end+1) - (z->resource.start))); + printk("Board is located at address 0x%x, size is 0x%x\n", + boardaddr, (unsigned int)resource_size(&z->resource)); mem = (volatile struct a2232memory *) boardaddr; diff --git a/drivers/staging/gma500/psb_gtt.c b/drivers/staging/gma500/psb_gtt.c index 74c5a6569d0..280f9d44546 100644 --- a/drivers/staging/gma500/psb_gtt.c +++ b/drivers/staging/gma500/psb_gtt.c @@ -80,7 +80,7 @@ static int psb_gtt_insert(struct drm_device *dev, struct gtt_range *r) { struct drm_psb_private *dev_priv = dev->dev_private; u32 *gtt_slot, pte; - int numpages = (r->resource.end + 1 - r->resource.start) >> PAGE_SHIFT; + int numpages = resource_size(&r->resource) >> PAGE_SHIFT; struct page **pages; int i; @@ -121,7 +121,7 @@ static void psb_gtt_remove(struct drm_device *dev, struct gtt_range *r) { struct drm_psb_private *dev_priv = dev->dev_private; u32 *gtt_slot, pte; - int numpages = (r->resource.end + 1 - r->resource.start) >> PAGE_SHIFT; + int numpages = resource_size(&r->resource) >> PAGE_SHIFT; int i; WARN_ON(r->stolen); @@ -149,7 +149,7 @@ static int psb_gtt_attach_pages(struct gtt_range *gt) struct address_space *mapping; int i; struct page *p; - int pages = (gt->resource.end + 1 - gt->resource.start) >> PAGE_SHIFT; + int pages = resource_size(&gt->resource) >> PAGE_SHIFT; WARN_ON(gt->pages); @@ -191,7 +191,7 @@ err: static void psb_gtt_detach_pages(struct gtt_range *gt) { int i; - int pages = (gt->resource.end + 1 - gt->resource.start) >> PAGE_SHIFT; + int pages = resource_size(&gt->resource) >> PAGE_SHIFT; for (i = 0; i < pages; i++) { /* FIXME: do we need to force dirty */ diff --git a/drivers/tty/serial/bfin_5xx.c b/drivers/tty/serial/bfin_5xx.c index 9b1ff2b6bb3..ff6979181ac 100644 --- a/drivers/tty/serial/bfin_5xx.c +++ b/drivers/tty/serial/bfin_5xx.c @@ -1304,8 +1304,7 @@ static int
bfin_serial_probe(struct platform_device *pdev) goto out_error_free_peripherals; } - uart->port.membase = ioremap(res->start, - res->end - res->start); + uart->port.membase = ioremap(res->start, resource_size(res)); if (!uart->port.membase) { dev_err(&pdev->dev, "Cannot map uart IO\n"); ret = -ENXIO; @@ -1483,7 +1482,7 @@ static int bfin_earlyprintk_probe(struct platform_device *pdev) } bfin_earlyprintk_port.port.membase = ioremap(res->start, - res->end - res->start); + resource_size(res)); if (!bfin_earlyprintk_port.port.membase) { dev_err(&pdev->dev, "Cannot map uart IO\n"); ret = -ENXIO; diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index a54473123e0..22fe801cce3 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -954,7 +954,7 @@ static void imx_release_port(struct uart_port *port) struct resource *mmres; mmres = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(mmres->start, mmres->end - mmres->start + 1); + release_mem_region(mmres->start, resource_size(mmres)); } /* @@ -970,8 +970,7 @@ static int imx_request_port(struct uart_port *port) if (!mmres) return -ENODEV; - ret = request_mem_region(mmres->start, mmres->end - mmres->start + 1, - "imx-uart"); + ret = request_mem_region(mmres->start, resource_size(mmres), "imx-uart"); return ret ? 0 : -EBUSY; } diff --git a/drivers/tty/serial/m32r_sio.c b/drivers/tty/serial/m32r_sio.c index 84db7321cce..8e07517f8ac 100644 --- a/drivers/tty/serial/m32r_sio.c +++ b/drivers/tty/serial/m32r_sio.c @@ -892,7 +892,7 @@ static int m32r_sio_request_port(struct uart_port *port) * If we have a mapbase, then request that as well. */ if (ret == 0 && up->port.flags & UPF_IOREMAP) { - int size = res->end - res->start + 1; + int size = resource_size(res); up->port.membase = ioremap(up->port.mapbase, size); if (!up->port.membase) diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index 47cadf47414..c37df8d0fa2 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -1241,8 +1241,8 @@ static int serial_omap_probe(struct platform_device *pdev) return -ENODEV; } - if (!request_mem_region(mem->start, (mem->end - mem->start) + 1, - pdev->dev.driver->name)) { + if (!request_mem_region(mem->start, resource_size(mem), + pdev->dev.driver->name)) { dev_err(&pdev->dev, "memory region already claimed\n"); return -EBUSY; } @@ -1308,7 +1308,7 @@ err: dev_err(&pdev->dev, "[UART%d]: failure [%s]: %d\n", pdev->id, __func__, ret); do_release_region: - release_mem_region(mem->start, (mem->end - mem->start) + 1); + release_mem_region(mem->start, resource_size(mem)); return ret; } diff --git a/drivers/tty/serial/pxa.c b/drivers/tty/serial/pxa.c index 4302e6e3768..531931c1b25 100644 --- a/drivers/tty/serial/pxa.c +++ b/drivers/tty/serial/pxa.c @@ -803,7 +803,7 @@ static int serial_pxa_probe(struct platform_device *dev) break; } - sport->port.membase = ioremap(mmres->start, mmres->end - mmres->start + 1); + sport->port.membase = ioremap(mmres->start, resource_size(mmres)); if (!sport->port.membase) { ret = -ENOMEM; goto err_clk; diff --git a/drivers/tty/serial/sunsu.c b/drivers/tty/serial/sunsu.c index 92aa54550e8..ad0f8f5f6ea 100644 --- a/drivers/tty/serial/sunsu.c +++ b/drivers/tty/serial/sunsu.c @@ -1435,7 +1435,7 @@ static int __devinit su_probe(struct platform_device *op) rp = &op->resource[0]; up->port.mapbase = rp->start; - up->reg_size = (rp->end - rp->start) + 1; + up->reg_size = resource_size(rp); up->port.membase = of_ioremap(rp, 0, up->reg_size, "su"); 
if (!up->port.membase) { if (type != SU_PORT_PORT) diff --git a/drivers/tty/serial/vt8500_serial.c b/drivers/tty/serial/vt8500_serial.c index 37fc4e3d487..026cb9ea5cd 100644 --- a/drivers/tty/serial/vt8500_serial.c +++ b/drivers/tty/serial/vt8500_serial.c @@ -573,8 +573,7 @@ static int __init vt8500_serial_probe(struct platform_device *pdev) snprintf(vt8500_port->name, sizeof(vt8500_port->name), "VT8500 UART%d", pdev->id); - vt8500_port->uart.membase = ioremap(mmres->start, - mmres->end - mmres->start + 1); + vt8500_port->uart.membase = ioremap(mmres->start, resource_size(mmres)); if (!vt8500_port->uart.membase) { ret = -ENOMEM; goto err; diff --git a/drivers/uio/uio_pdrv.c b/drivers/uio/uio_pdrv.c index 7d3e469b990..bdc3db94612 100644 --- a/drivers/uio/uio_pdrv.c +++ b/drivers/uio/uio_pdrv.c @@ -58,7 +58,7 @@ static int uio_pdrv_probe(struct platform_device *pdev) uiomem->memtype = UIO_MEM_PHYS; uiomem->addr = r->start; - uiomem->size = r->end - r->start + 1; + uiomem->size = resource_size(r); ++uiomem; } diff --git a/drivers/uio/uio_pdrv_genirq.c b/drivers/uio/uio_pdrv_genirq.c index 0f424af7f10..31e799d9efe 100644 --- a/drivers/uio/uio_pdrv_genirq.c +++ b/drivers/uio/uio_pdrv_genirq.c @@ -137,7 +137,7 @@ static int uio_pdrv_genirq_probe(struct platform_device *pdev) uiomem->memtype = UIO_MEM_PHYS; uiomem->addr = r->start; - uiomem->size = r->end - r->start + 1; + uiomem->size = resource_size(r); ++uiomem; } diff --git a/drivers/usb/gadget/atmel_usba_udc.c b/drivers/usb/gadget/atmel_usba_udc.c index db1a659702b..f045c8968a6 100644 --- a/drivers/usb/gadget/atmel_usba_udc.c +++ b/drivers/usb/gadget/atmel_usba_udc.c @@ -272,7 +272,7 @@ static void usba_init_debugfs(struct usba_udc *udc) regs_resource = platform_get_resource(udc->pdev, IORESOURCE_MEM, CTRL_IOMEM_ID); - regs->d_inode->i_size = regs_resource->end - regs_resource->start + 1; + regs->d_inode->i_size = resource_size(regs_resource); udc->debugfs_regs = regs; usba_ep_init_debugfs(udc, to_usba_ep(udc->gadget.ep0)); diff --git a/drivers/usb/gadget/fsl_udc_core.c b/drivers/usb/gadget/fsl_udc_core.c index 2cd9a60c7f3..9c8e56fd0ff 100644 --- a/drivers/usb/gadget/fsl_udc_core.c +++ b/drivers/usb/gadget/fsl_udc_core.c @@ -2445,7 +2445,7 @@ static int __init fsl_udc_probe(struct platform_device *pdev) } if (pdata->operating_mode == FSL_USB2_DR_DEVICE) { - if (!request_mem_region(res->start, res->end - res->start + 1, + if (!request_mem_region(res->start, resource_size(res), driver_name)) { ERR("request mem region for %s failed\n", pdev->name); ret = -EBUSY; @@ -2593,7 +2593,7 @@ err_iounmap_noclk: iounmap(dr_regs); err_release_mem_region: if (pdata->operating_mode == FSL_USB2_DR_DEVICE) - release_mem_region(res->start, res->end - res->start + 1); + release_mem_region(res->start, resource_size(res)); err_kfree: kfree(udc_controller); udc_controller = NULL; @@ -2628,7 +2628,7 @@ static int __exit fsl_udc_remove(struct platform_device *pdev) free_irq(udc_controller->irq, udc_controller); iounmap(dr_regs); if (pdata->operating_mode == FSL_USB2_DR_DEVICE) - release_mem_region(res->start, res->end - res->start + 1); + release_mem_region(res->start, resource_size(res)); device_unregister(&udc_controller->gadget.dev); /* free udc --wait for the release() finished */ diff --git a/drivers/usb/host/ehci-ath79.c b/drivers/usb/host/ehci-ath79.c index 98cc8a13169..eab3d7059fb 100644 --- a/drivers/usb/host/ehci-ath79.c +++ b/drivers/usb/host/ehci-ath79.c @@ -146,7 +146,7 @@ static int ehci_ath79_probe(struct platform_device *pdev) return -ENOMEM; 
hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { dev_dbg(&pdev->dev, "controller already in use\n"); diff --git a/drivers/usb/host/ehci-cns3xxx.c b/drivers/usb/host/ehci-cns3xxx.c index d41745c6f0c..6536abdea6e 100644 --- a/drivers/usb/host/ehci-cns3xxx.c +++ b/drivers/usb/host/ehci-cns3xxx.c @@ -107,7 +107,7 @@ static int cns3xxx_ehci_probe(struct platform_device *pdev) } hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, driver->description)) { diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c index f380bf97e5a..34a3140d1e5 100644 --- a/drivers/usb/host/ehci-fsl.c +++ b/drivers/usb/host/ehci-fsl.c @@ -100,7 +100,7 @@ static int usb_hcd_fsl_probe(const struct hc_driver *driver, goto err2; } hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, driver->description)) { dev_dbg(&pdev->dev, "controller already in use\n"); diff --git a/drivers/usb/host/ehci-grlib.c b/drivers/usb/host/ehci-grlib.c index 93b230dc51a..fdfd8c5b639 100644 --- a/drivers/usb/host/ehci-grlib.c +++ b/drivers/usb/host/ehci-grlib.c @@ -130,7 +130,7 @@ static int __devinit ehci_hcd_grlib_probe(struct platform_device *op) return -ENOMEM; hcd->rsrc_start = res.start; - hcd->rsrc_len = res.end - res.start + 1; + hcd->rsrc_len = resource_size(&res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { printk(KERN_ERR "%s: request_mem_region failed\n", __FILE__); diff --git a/drivers/usb/host/ehci-ixp4xx.c b/drivers/usb/host/ehci-ixp4xx.c index 50e600d26e2..c4460f3d009 100644 --- a/drivers/usb/host/ehci-ixp4xx.c +++ b/drivers/usb/host/ehci-ixp4xx.c @@ -100,7 +100,7 @@ static int ixp4xx_ehci_probe(struct platform_device *pdev) goto fail_request_resource; } hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, driver->description)) { diff --git a/drivers/usb/host/ehci-octeon.c b/drivers/usb/host/ehci-octeon.c index ff55757ba7d..c3ba3ed5f3a 100644 --- a/drivers/usb/host/ehci-octeon.c +++ b/drivers/usb/host/ehci-octeon.c @@ -124,7 +124,7 @@ static int ehci_octeon_drv_probe(struct platform_device *pdev) return -ENOMEM; hcd->rsrc_start = res_mem->start; - hcd->rsrc_len = res_mem->end - res_mem->start + 1; + hcd->rsrc_len = resource_size(res_mem); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, OCTEON_EHCI_HCD_NAME)) { diff --git a/drivers/usb/host/ehci-pmcmsp.c b/drivers/usb/host/ehci-pmcmsp.c index cd69099cda1..e8d54de44ac 100644 --- a/drivers/usb/host/ehci-pmcmsp.c +++ b/drivers/usb/host/ehci-pmcmsp.c @@ -124,7 +124,7 @@ static int usb_hcd_msp_map_regs(struct mspusb_device *dev) res = platform_get_resource(pdev, IORESOURCE_MEM, 1); if (res == NULL) return -ENOMEM; - res_len = res->end - res->start + 1; + res_len = resource_size(res); if (!request_mem_region(res->start, res_len, "mab regs")) return -EBUSY; @@ -140,7 +140,7 @@ static int usb_hcd_msp_map_regs(struct mspusb_device *dev) retval = -ENOMEM; goto err2; } - res_len = res->end - res->start + 1; + res_len = resource_size(res); if (!request_mem_region(res->start, res_len, "usbid regs")) { retval = -EBUSY; goto err2; @@ -154,13 +154,13 @@ static int 
usb_hcd_msp_map_regs(struct mspusb_device *dev) return 0; err3: res = platform_get_resource(pdev, IORESOURCE_MEM, 2); - res_len = res->end - res->start + 1; + res_len = resource_size(res); release_mem_region(res->start, res_len); err2: iounmap(dev->mab_regs); err1: res = platform_get_resource(pdev, IORESOURCE_MEM, 1); - res_len = res->end - res->start + 1; + res_len = resource_size(res); release_mem_region(res->start, res_len); dev_err(&pdev->dev, "Failed to map non-EHCI regs.\n"); return retval; @@ -194,7 +194,7 @@ int usb_hcd_msp_probe(const struct hc_driver *driver, goto err1; } hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, dev->name)) { retval = -EBUSY; goto err1; diff --git a/drivers/usb/host/ehci-ppc-of.c b/drivers/usb/host/ehci-ppc-of.c index 8552db6c29c..41d11fe1425 100644 --- a/drivers/usb/host/ehci-ppc-of.c +++ b/drivers/usb/host/ehci-ppc-of.c @@ -130,7 +130,7 @@ static int __devinit ehci_hcd_ppc_of_probe(struct platform_device *op) return -ENOMEM; hcd->rsrc_start = res.start; - hcd->rsrc_len = res.end - res.start + 1; + hcd->rsrc_len = resource_size(&res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { printk(KERN_ERR "%s: request_mem_region failed\n", __FILE__); diff --git a/drivers/usb/host/ehci-w90x900.c b/drivers/usb/host/ehci-w90x900.c index 52a027aaa37..d661cf7de14 100644 --- a/drivers/usb/host/ehci-w90x900.c +++ b/drivers/usb/host/ehci-w90x900.c @@ -41,7 +41,7 @@ static int __devinit usb_w90x900_probe(const struct hc_driver *driver, } hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { retval = -EBUSY; diff --git a/drivers/usb/host/ehci-xilinx-of.c b/drivers/usb/host/ehci-xilinx-of.c index a64d6d66d76..32793ce3d9e 100644 --- a/drivers/usb/host/ehci-xilinx-of.c +++ b/drivers/usb/host/ehci-xilinx-of.c @@ -174,7 +174,7 @@ static int __devinit ehci_hcd_xilinx_of_probe(struct platform_device *op) return -ENOMEM; hcd->rsrc_start = res.start; - hcd->rsrc_len = res.end - res.start + 1; + hcd->rsrc_len = resource_size(&res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { printk(KERN_ERR "%s: request_mem_region failed\n", __FILE__); diff --git a/drivers/usb/host/fhci-hcd.c b/drivers/usb/host/fhci-hcd.c index 19223c7449e..572ea53b022 100644 --- a/drivers/usb/host/fhci-hcd.c +++ b/drivers/usb/host/fhci-hcd.c @@ -605,7 +605,7 @@ static int __devinit of_fhci_probe(struct platform_device *ofdev) goto err_regs; } - hcd->regs = ioremap(usb_regs.start, usb_regs.end - usb_regs.start + 1); + hcd->regs = ioremap(usb_regs.start, resource_size(&usb_regs)); if (!hcd->regs) { dev_err(dev, "could not ioremap regs\n"); ret = -ENOMEM; diff --git a/drivers/usb/host/ohci-ath79.c b/drivers/usb/host/ohci-ath79.c index ffea3e7cb0a..c620c50f677 100644 --- a/drivers/usb/host/ohci-ath79.c +++ b/drivers/usb/host/ohci-ath79.c @@ -93,8 +93,8 @@ static int ohci_ath79_probe(struct platform_device *pdev) ret = -ENODEV; goto err_put_hcd; } - hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_start = res->start; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { dev_dbg(&pdev->dev, "controller already in use\n"); diff --git a/drivers/usb/host/ohci-cns3xxx.c b/drivers/usb/host/ohci-cns3xxx.c index f05ef87e934..5a00a1e1c6c 100644 --- 
a/drivers/usb/host/ohci-cns3xxx.c +++ b/drivers/usb/host/ohci-cns3xxx.c @@ -100,7 +100,7 @@ static int cns3xxx_ohci_probe(struct platform_device *pdev) goto err1; } hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, driver->description)) { diff --git a/drivers/usb/host/ohci-da8xx.c b/drivers/usb/host/ohci-da8xx.c index d22fb4d577b..6aca2c4453f 100644 --- a/drivers/usb/host/ohci-da8xx.c +++ b/drivers/usb/host/ohci-da8xx.c @@ -322,7 +322,7 @@ static int usb_hcd_da8xx_probe(const struct hc_driver *driver, goto err2; } hcd->rsrc_start = mem->start; - hcd->rsrc_len = mem->end - mem->start + 1; + hcd->rsrc_len = resource_size(mem); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { dev_dbg(&pdev->dev, "request_mem_region failed\n"); diff --git a/drivers/usb/host/ohci-octeon.c b/drivers/usb/host/ohci-octeon.c index e4ddfaf8870..d8b45647d1d 100644 --- a/drivers/usb/host/ohci-octeon.c +++ b/drivers/usb/host/ohci-octeon.c @@ -135,7 +135,7 @@ static int ohci_octeon_drv_probe(struct platform_device *pdev) return -ENOMEM; hcd->rsrc_start = res_mem->start; - hcd->rsrc_len = res_mem->end - res_mem->start + 1; + hcd->rsrc_len = resource_size(res_mem); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, OCTEON_OHCI_HCD_NAME)) { diff --git a/drivers/usb/host/ohci-ppc-of.c b/drivers/usb/host/ohci-ppc-of.c index 1ca1821320f..0c12f4e14dc 100644 --- a/drivers/usb/host/ohci-ppc-of.c +++ b/drivers/usb/host/ohci-ppc-of.c @@ -110,7 +110,7 @@ static int __devinit ohci_hcd_ppc_of_probe(struct platform_device *op) return -ENOMEM; hcd->rsrc_start = res.start; - hcd->rsrc_len = res.end - res.start + 1; + hcd->rsrc_len = resource_size(&res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { printk(KERN_ERR "%s: request_mem_region failed\n", __FILE__); diff --git a/drivers/usb/host/ohci-ppc-soc.c b/drivers/usb/host/ohci-ppc-soc.c index 89e670e38c1..c0f595c4448 100644 --- a/drivers/usb/host/ohci-ppc-soc.c +++ b/drivers/usb/host/ohci-ppc-soc.c @@ -56,7 +56,7 @@ static int usb_hcd_ppc_soc_probe(const struct hc_driver *driver, if (!hcd) return -ENOMEM; hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { pr_debug("%s: request_mem_region failed\n", __FILE__); diff --git a/drivers/usb/host/ohci-sa1111.c b/drivers/usb/host/ohci-sa1111.c index d8eb3bdafab..4204d9720d2 100644 --- a/drivers/usb/host/ohci-sa1111.c +++ b/drivers/usb/host/ohci-sa1111.c @@ -131,7 +131,7 @@ int usb_hcd_sa1111_probe (const struct hc_driver *driver, if (!hcd) return -ENOMEM; hcd->rsrc_start = dev->res.start; - hcd->rsrc_len = dev->res.end - dev->res.start + 1; + hcd->rsrc_len = resource_size(&dev->res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { dbg("request_mem_region failed"); diff --git a/drivers/usb/host/ohci-sm501.c b/drivers/usb/host/ohci-sm501.c index 041d30f30c1..78918ca0da2 100644 --- a/drivers/usb/host/ohci-sm501.c +++ b/drivers/usb/host/ohci-sm501.c @@ -103,8 +103,7 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev) goto err0; } - if (!request_mem_region(mem->start, mem->end - mem->start + 1, - pdev->name)) { + if (!request_mem_region(mem->start, resource_size(mem), pdev->name)) { dev_err(dev, "request_mem_region failed\n"); retval = -EBUSY; goto err0; @@ -126,7 +125,7 @@ static int ohci_hcd_sm501_drv_probe(struct 
platform_device *pdev) if (!dma_declare_coherent_memory(dev, mem->start, mem->start - mem->parent->start, - (mem->end - mem->start) + 1, + resource_size(mem), DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE)) { dev_err(dev, "cannot declare coherent memory\n"); @@ -149,7 +148,7 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev) } hcd->rsrc_start = res->start; - hcd->rsrc_len = res->end - res->start + 1; + hcd->rsrc_len = resource_size(res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, pdev->name)) { dev_err(dev, "request_mem_region failed\n"); @@ -185,7 +184,7 @@ err3: err2: dma_release_declared_memory(dev); err1: - release_mem_region(mem->start, mem->end - mem->start + 1); + release_mem_region(mem->start, resource_size(mem)); err0: return retval; } @@ -201,7 +200,7 @@ static int ohci_hcd_sm501_drv_remove(struct platform_device *pdev) dma_release_declared_memory(&pdev->dev); mem = platform_get_resource(pdev, IORESOURCE_MEM, 1); if (mem) - release_mem_region(mem->start, mem->end - mem->start + 1); + release_mem_region(mem->start, resource_size(mem)); /* mask interrupts and disable power */ diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c index 3558491dd87..57ad1271fc9 100644 --- a/drivers/usb/host/ohci-tmio.c +++ b/drivers/usb/host/ohci-tmio.c @@ -208,13 +208,13 @@ static int __devinit ohci_hcd_tmio_drv_probe(struct platform_device *dev) } hcd->rsrc_start = regs->start; - hcd->rsrc_len = regs->end - regs->start + 1; + hcd->rsrc_len = resource_size(regs); tmio = hcd_to_tmio(hcd); spin_lock_init(&tmio->lock); - tmio->ccr = ioremap(config->start, config->end - config->start + 1); + tmio->ccr = ioremap(config->start, resource_size(config)); if (!tmio->ccr) { ret = -ENOMEM; goto err_ioremap_ccr; @@ -228,7 +228,7 @@ static int __devinit ohci_hcd_tmio_drv_probe(struct platform_device *dev) if (!dma_declare_coherent_memory(&dev->dev, sram->start, sram->start, - sram->end - sram->start + 1, + resource_size(sram), DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE)) { ret = -EBUSY; goto err_dma_declare; diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c index 5fbe997dc6d..dcd889803f0 100644 --- a/drivers/usb/host/oxu210hp-hcd.c +++ b/drivers/usb/host/oxu210hp-hcd.c @@ -3828,7 +3828,7 @@ static int oxu_drv_probe(struct platform_device *pdev) return -ENODEV; } memstart = res->start; - memlen = res->end - res->start + 1; + memlen = resource_size(res); dev_dbg(&pdev->dev, "MEM resource %lx-%lx\n", memstart, memlen); if (!request_mem_region(memstart, memlen, oxu_hc_driver.description)) { diff --git a/drivers/usb/host/uhci-grlib.c b/drivers/usb/host/uhci-grlib.c index d01c1e22768..f7a62138e3e 100644 --- a/drivers/usb/host/uhci-grlib.c +++ b/drivers/usb/host/uhci-grlib.c @@ -111,7 +111,7 @@ static int __devinit uhci_hcd_grlib_probe(struct platform_device *op) return -ENOMEM; hcd->rsrc_start = res.start; - hcd->rsrc_len = res.end - res.start + 1; + hcd->rsrc_len = resource_size(&res); if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len, hcd_name)) { printk(KERN_ERR "%s: request_mem_region failed\n", __FILE__); diff --git a/drivers/usb/host/whci/init.c b/drivers/usb/host/whci/init.c index f7582e8e216..d3e13b640d4 100644 --- a/drivers/usb/host/whci/init.c +++ b/drivers/usb/host/whci/init.c @@ -178,7 +178,7 @@ void whc_clean_up(struct whc *whc) if (whc->qset_pool) dma_pool_destroy(whc->qset_pool); - len = whc->umc->resource.end - whc->umc->resource.start + 1; + len = resource_size(&whc->umc->resource); if (whc->base) iounmap(whc->base); if 
(whc->base_phys) diff --git a/drivers/uwb/whc-rc.c b/drivers/uwb/whc-rc.c index 70a004aa19d..3ae3c702500 100644 --- a/drivers/uwb/whc-rc.c +++ b/drivers/uwb/whc-rc.c @@ -222,7 +222,7 @@ int whcrc_setup_rc_umc(struct whcrc *whcrc) struct umc_dev *umc_dev = whcrc->umc_dev; whcrc->area = umc_dev->resource.start; - whcrc->rc_len = umc_dev->resource.end - umc_dev->resource.start + 1; + whcrc->rc_len = resource_size(&umc_dev->resource); result = -EBUSY; if (request_mem_region(whcrc->area, whcrc->rc_len, KBUILD_MODNAME) == NULL) { dev_err(dev, "can't request URC region (%zu bytes @ 0x%lx): %d\n", diff --git a/drivers/video/atmel_lcdfb.c b/drivers/video/atmel_lcdfb.c index 4484c721f0f..817ab60f753 100644 --- a/drivers/video/atmel_lcdfb.c +++ b/drivers/video/atmel_lcdfb.c @@ -906,7 +906,7 @@ static int __init atmel_lcdfb_probe(struct platform_device *pdev) if (map) { /* use a pre-allocated memory buffer */ info->fix.smem_start = map->start; - info->fix.smem_len = map->end - map->start + 1; + info->fix.smem_len = resource_size(map); if (!request_mem_region(info->fix.smem_start, info->fix.smem_len, pdev->name)) { ret = -EBUSY; @@ -932,7 +932,7 @@ static int __init atmel_lcdfb_probe(struct platform_device *pdev) /* LCDC registers */ info->fix.mmio_start = regs->start; - info->fix.mmio_len = regs->end - regs->start + 1; + info->fix.mmio_len = resource_size(regs); if (!request_mem_region(info->fix.mmio_start, info->fix.mmio_len, pdev->name)) { diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index ebb893c49e9..ad41f508b42 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -3460,9 +3460,10 @@ static int __devinit atyfb_setup_generic(struct pci_dev *pdev, raddr = addr + 0x7ff000UL; rrp = &pdev->resource[2]; - if ((rrp->flags & IORESOURCE_MEM) && request_mem_region(rrp->start, rrp->end - rrp->start + 1, "atyfb")) { + if ((rrp->flags & IORESOURCE_MEM) && + request_mem_region(rrp->start, resource_size(rrp), "atyfb")) { par->aux_start = rrp->start; - par->aux_size = rrp->end - rrp->start + 1; + par->aux_size = resource_size(rrp); raddr = rrp->start; PRINTKI("using auxiliary register aperture\n"); } @@ -3552,7 +3553,7 @@ static int __devinit atyfb_pci_probe(struct pci_dev *pdev, /* Reserve space */ res_start = rp->start; - res_size = rp->end - rp->start + 1; + res_size = resource_size(rp); if (!request_mem_region(res_start, res_size, "atyfb")) return -EBUSY; diff --git a/drivers/video/au1100fb.c b/drivers/video/au1100fb.c index 34b2fc472fe..01a8fde67f2 100644 --- a/drivers/video/au1100fb.c +++ b/drivers/video/au1100fb.c @@ -486,7 +486,7 @@ static int __devinit au1100fb_drv_probe(struct platform_device *dev) } au1100fb_fix.mmio_start = regs_res->start; - au1100fb_fix.mmio_len = regs_res->end - regs_res->start + 1; + au1100fb_fix.mmio_len = resource_size(regs_res); if (!request_mem_region(au1100fb_fix.mmio_start, au1100fb_fix.mmio_len, DRIVER_NAME)) { diff --git a/drivers/video/cobalt_lcdfb.c b/drivers/video/cobalt_lcdfb.c index 42fe155aba0..e02764319ff 100644 --- a/drivers/video/cobalt_lcdfb.c +++ b/drivers/video/cobalt_lcdfb.c @@ -303,7 +303,7 @@ static int __devinit cobalt_lcdfb_probe(struct platform_device *dev) return -EBUSY; } - info->screen_size = res->end - res->start + 1; + info->screen_size = resource_size(res); info->screen_base = ioremap(res->start, info->screen_size); info->fbops = &cobalt_lcd_fbops; info->fix = cobalt_lcdfb_fix; diff --git a/drivers/video/controlfb.c b/drivers/video/controlfb.c index c225dcce89e..9075bea5587 100644 --- 
a/drivers/video/controlfb.c +++ b/drivers/video/controlfb.c @@ -709,11 +709,11 @@ static int __init control_of_init(struct device_node *dp) /* Map in frame buffer and registers */ p->fb_orig_base = fb_res.start; - p->fb_orig_size = fb_res.end - fb_res.start + 1; + p->fb_orig_size = resource_size(&fb_res); /* use the big-endian aperture (??) */ p->frame_buffer_phys = fb_res.start + 0x800000; p->control_regs_phys = reg_res.start; - p->control_regs_size = reg_res.end - reg_res.start + 1; + p->control_regs_size = resource_size(&reg_res); if (!p->fb_orig_base || !request_mem_region(p->fb_orig_base,p->fb_orig_size,"controlfb")) { diff --git a/drivers/video/mb862xx/mb862xxfbdrv.c b/drivers/video/mb862xx/mb862xxfbdrv.c index f70bd63b018..ee1de3e26de 100644 --- a/drivers/video/mb862xx/mb862xxfbdrv.c +++ b/drivers/video/mb862xx/mb862xxfbdrv.c @@ -697,7 +697,7 @@ static int __devinit of_platform_mb862xx_probe(struct platform_device *ofdev) goto fbrel; } - res_size = 1 + res.end - res.start; + res_size = resource_size(&res); par->res = request_mem_region(res.start, res_size, DRV_NAME); if (par->res == NULL) { dev_err(dev, "Cannot claim framebuffer/mmio\n"); @@ -787,7 +787,7 @@ static int __devexit of_platform_mb862xx_remove(struct platform_device *ofdev) { struct fb_info *fbi = dev_get_drvdata(&ofdev->dev); struct mb862xxfb_par *par = fbi->par; - resource_size_t res_size = 1 + par->res->end - par->res->start; + resource_size_t res_size = resource_size(par->res); unsigned long reg; dev_dbg(fbi->dev, "%s release\n", fbi->fix.id); diff --git a/drivers/video/msm/mdp.c b/drivers/video/msm/mdp.c index c3636d55a3c..243d16f09b8 100644 --- a/drivers/video/msm/mdp.c +++ b/drivers/video/msm/mdp.c @@ -406,8 +406,7 @@ int mdp_probe(struct platform_device *pdev) goto error_get_irq; } - mdp->base = ioremap(resource->start, - resource->end - resource->start); + mdp->base = ioremap(resource->start, resource_size(resource)); if (mdp->base == 0) { printk(KERN_ERR "msmfb: cannot allocate mdp regs!\n"); ret = -ENOMEM; diff --git a/drivers/video/msm/msm_fb.c b/drivers/video/msm/msm_fb.c index ec351309e60..c6e3b4fcdd6 100644 --- a/drivers/video/msm/msm_fb.c +++ b/drivers/video/msm/msm_fb.c @@ -525,10 +525,9 @@ static int setup_fbmem(struct msmfb_info *msmfb, struct platform_device *pdev) return -ENOMEM; } fb->fix.smem_start = resource->start; - fb->fix.smem_len = resource->end - resource->start; - fbram = ioremap(resource->start, - resource->end - resource->start); - if (fbram == 0) { + fb->fix.smem_len = resource_size(resource); + fbram = ioremap(resource->start, resource_size(resource)); + if (fbram == NULL) { printk(KERN_ERR "msmfb: cannot allocate fbram!\n"); return -ENOMEM; } diff --git a/drivers/video/nuc900fb.c b/drivers/video/nuc900fb.c index f838d9e277f..0fff59782e4 100644 --- a/drivers/video/nuc900fb.c +++ b/drivers/video/nuc900fb.c @@ -551,7 +551,7 @@ static int __devinit nuc900fb_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - size = (res->end - res->start) + 1; + size = resource_size(res); fbi->mem = request_mem_region(res->start, size, pdev->name); if (fbi->mem == NULL) { dev_err(&pdev->dev, "failed to alloc memory region\n"); diff --git a/drivers/video/platinumfb.c b/drivers/video/platinumfb.c index ef532d9d3c9..f27ae16ead2 100644 --- a/drivers/video/platinumfb.c +++ b/drivers/video/platinumfb.c @@ -567,7 +567,7 @@ static int __devinit platinumfb_probe(struct platform_device* odev) * northbridge and that can fail.
Only request framebuffer */ if (!request_mem_region(pinfo->rsrc_fb.start, - pinfo->rsrc_fb.end - pinfo->rsrc_fb.start + 1, + resource_size(&pinfo->rsrc_fb), "platinumfb framebuffer")) { printk(KERN_ERR "platinumfb: Can't request framebuffer !\n"); framebuffer_release(info); @@ -658,8 +658,7 @@ static int __devexit platinumfb_remove(struct platform_device* odev) iounmap(pinfo->cmap_regs); release_mem_region(pinfo->rsrc_fb.start, - pinfo->rsrc_fb.end - - pinfo->rsrc_fb.start + 1); + resource_size(&pinfo->rsrc_fb)); release_mem_region(pinfo->cmap_regs_phys, 0x1000); diff --git a/drivers/video/pxa168fb.c b/drivers/video/pxa168fb.c index bb95ec56d25..18ead6f0184 100644 --- a/drivers/video/pxa168fb.c +++ b/drivers/video/pxa168fb.c @@ -662,7 +662,7 @@ static int __devinit pxa168fb_probe(struct platform_device *pdev) info->fix.ypanstep = 0; info->fix.ywrapstep = 0; info->fix.mmio_start = res->start; - info->fix.mmio_len = res->end - res->start + 1; + info->fix.mmio_len = resource_size(res); info->fix.accel = FB_ACCEL_NONE; info->fbops = &pxa168fb_ops; info->pseudo_palette = fbi->pseudo_palette; diff --git a/include/linux/dio.h b/include/linux/dio.h index b2dd31ca171..2cc0fd00463 100644 --- a/include/linux/dio.h +++ b/include/linux/dio.h @@ -254,7 +254,7 @@ static inline struct dio_driver *dio_dev_driver(const struct dio_dev *d) #define dio_resource_start(d) ((d)->resource.start) #define dio_resource_end(d) ((d)->resource.end) -#define dio_resource_len(d) ((d)->resource.end-(d)->resource.start+1) +#define dio_resource_len(d) (resource_size(&(d)->resource)) #define dio_resource_flags(d) ((d)->resource.flags) #define dio_request_device(d, name) \ diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 1bc1338b817..195aafc6cd0 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -50,7 +50,7 @@ static inline resource_size_t pnp_resource_len(struct resource *res) { if (res->start == 0 && res->end == 0) return 0; - return res->end - res->start + 1; + return resource_size(res); } diff --git a/include/linux/zorro.h b/include/linux/zorro.h index 7bf9db525e9..dff42025649 100644 --- a/include/linux/zorro.h +++ b/include/linux/zorro.h @@ -187,7 +187,7 @@ extern struct zorro_dev *zorro_find_device(zorro_id id, #define zorro_resource_start(z) ((z)->resource.start) #define zorro_resource_end(z) ((z)->resource.end) -#define zorro_resource_len(z) ((z)->resource.end-(z)->resource.start+1) +#define zorro_resource_len(z) (resource_size(&(z)->resource)) #define zorro_resource_flags(z) ((z)->resource.flags) #define zorro_request_device(z, name) \ diff --git a/kernel/kexec.c b/kernel/kexec.c index 8d814cbc810..296fbc84d65 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void) size_t size = 0; mutex_lock(&kexec_mutex); if (crashk_res.end != crashk_res.start) - size = crashk_res.end - crashk_res.start + 1; + size = resource_size(&crashk_res); mutex_unlock(&kexec_mutex); return size; } diff --git a/sound/aoa/soundbus/i2sbus/core.c b/sound/aoa/soundbus/i2sbus/core.c index 3ff8cc5f487..01065833588 100644 --- a/sound/aoa/soundbus/i2sbus/core.c +++ b/sound/aoa/soundbus/i2sbus/core.c @@ -262,8 +262,7 @@ static int i2sbus_add_dev(struct macio_dev *macio, */ dev->allocated_resource[i] = request_mem_region(dev->resources[i].start, - dev->resources[i].end - - dev->resources[i].start + 1, + resource_size(&dev->resources[i]), dev->rnames[i]); if (!dev->allocated_resource[i]) { printk(KERN_ERR "i2sbus: failed to claim resource %d!\n", i); @@ -272,19 +271,19 @@ static int 
i2sbus_add_dev(struct macio_dev *macio, } r = &dev->resources[aoa_resource_i2smmio]; - rlen = r->end - r->start + 1; + rlen = resource_size(r); if (rlen < sizeof(struct i2s_interface_regs)) goto err; dev->intfregs = ioremap(r->start, rlen); r = &dev->resources[aoa_resource_txdbdma]; - rlen = r->end - r->start + 1; + rlen = resource_size(r); if (rlen < sizeof(struct dbdma_regs)) goto err; dev->out.dbdma = ioremap(r->start, rlen); r = &dev->resources[aoa_resource_rxdbdma]; - rlen = r->end - r->start + 1; + rlen = resource_size(r); if (rlen < sizeof(struct dbdma_regs)) goto err; dev->in.dbdma = ioremap(r->start, rlen); diff --git a/sound/atmel/abdac.c b/sound/atmel/abdac.c index 6e240918189..30468b31cad 100644 --- a/sound/atmel/abdac.c +++ b/sound/atmel/abdac.c @@ -448,7 +448,7 @@ static int __devinit atmel_abdac_probe(struct platform_device *pdev) goto out_free_card; } - dac->regs = ioremap(regs->start, regs->end - regs->start + 1); + dac->regs = ioremap(regs->start, resource_size(regs)); if (!dac->regs) { dev_dbg(&pdev->dev, "could not remap register memory\n"); goto out_free_card; diff --git a/sound/atmel/ac97c.c b/sound/atmel/ac97c.c index b310702c646..41b901bde5c 100644 --- a/sound/atmel/ac97c.c +++ b/sound/atmel/ac97c.c @@ -971,7 +971,7 @@ static int __devinit atmel_ac97c_probe(struct platform_device *pdev) chip->card = card; chip->pclk = pclk; chip->pdev = pdev; - chip->regs = ioremap(regs->start, regs->end - regs->start + 1); + chip->regs = ioremap(regs->start, resource_size(regs)); if (!chip->regs) { dev_dbg(&pdev->dev, "could not remap register memory\n"); diff --git a/sound/ppc/pmac.c b/sound/ppc/pmac.c index 3ecbd67f88c..ab96cde7417 100644 --- a/sound/ppc/pmac.c +++ b/sound/ppc/pmac.c @@ -881,8 +881,7 @@ static int snd_pmac_free(struct snd_pmac *chip) for (i = 0; i < 3; i++) { if (chip->requested & (1 << i)) release_mem_region(chip->rsrc[i].start, - chip->rsrc[i].end - - chip->rsrc[i].start + 1); + resource_size(&chip->rsrc[i])); } } @@ -1228,8 +1227,7 @@ int __devinit snd_pmac_new(struct snd_card *card, struct snd_pmac **chip_return) goto __error; } if (request_mem_region(chip->rsrc[i].start, - chip->rsrc[i].end - - chip->rsrc[i].start + 1, + resource_size(&chip->rsrc[i]), rnames[i]) == NULL) { printk(KERN_ERR "snd: can't request rsrc " " %d (%s: %pR)\n", @@ -1254,8 +1252,7 @@ int __devinit snd_pmac_new(struct snd_card *card, struct snd_pmac **chip_return) goto __error; } if (request_mem_region(chip->rsrc[i].start, - chip->rsrc[i].end - - chip->rsrc[i].start + 1, + resource_size(&chip->rsrc[i]), rnames[i]) == NULL) { printk(KERN_ERR "snd: can't request rsrc " " %d (%s: %pR)\n", diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c index 313e0ccedd5..6a882aa5553 100644 --- a/sound/soc/fsl/fsl_ssi.c +++ b/sound/soc/fsl/fsl_ssi.c @@ -678,7 +678,7 @@ static int __devinit fsl_ssi_probe(struct platform_device *pdev) kfree(ssi_private); return ret; } - ssi_private->ssi = ioremap(res.start, 1 + res.end - res.start); + ssi_private->ssi = ioremap(res.start, resource_size(&res)); ssi_private->ssi_phys = res.start; ssi_private->irq = irq_of_parse_and_map(np, 0); diff --git a/sound/soc/fsl/mpc5200_dma.c b/sound/soc/fsl/mpc5200_dma.c index fff695ccdd3..86023142a4c 100644 --- a/sound/soc/fsl/mpc5200_dma.c +++ b/sound/soc/fsl/mpc5200_dma.c @@ -384,7 +384,7 @@ static int mpc5200_hpcd_probe(struct of_device *op) dev_err(&op->dev, "Missing reg property\n"); return -ENODEV; } - regs = ioremap(res.start, 1 + res.end - res.start); + regs = ioremap(res.start, resource_size(&res)); if (!regs) { 
dev_err(&op->dev, "Could not map registers\n"); return -ENODEV; -- cgit v1.2.3-70-g09d2 From bdd4e85dc36cdbcfc1608a5b2a17c80a9db8986a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 01:13:27 +0200 Subject: sched: Isolate preempt counting in its own config option Create a new CONFIG_PREEMPT_COUNT that handles the inc/dec of the preempt count offset independently, so that the offset can be updated by preempt_disable() and preempt_enable() even without CONFIG_PREEMPT being set. This prepares CONFIG_DEBUG_SPINLOCK_SLEEP to work with !CONFIG_PREEMPT, where it currently doesn't detect code that sleeps inside explicit preemption disabled sections. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/bit_spinlock.h | 2 +- include/linux/hardirq.h | 4 ++-- include/linux/pagemap.h | 4 ++-- include/linux/preempt.h | 26 +++++++++++++++++--------- include/linux/rcupdate.h | 12 ++++++------ include/linux/sched.h | 2 +- kernel/Kconfig.preempt | 3 +++ kernel/sched.c | 2 +- 8 files changed, 33 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index b4326bfa684..564d997e216 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -88,7 +88,7 @@ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) { #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) return test_bit(bitnum, addr); -#elif defined CONFIG_PREEMPT +#elif defined CONFIG_PREEMPT_COUNT return preempt_count(); #else return 1; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index ba362171e8a..f743883f769 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -93,7 +93,7 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) -#if defined(CONFIG_PREEMPT) +#if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_CHECK_OFFSET 1 #else # define PREEMPT_CHECK_OFFSET 0 @@ -115,7 +115,7 @@ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 716875e5352..8e38d4c140f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -134,7 +134,7 @@ static inline int page_cache_get_speculative(struct page *page) VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT +# ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif /* @@ -172,7 +172,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT +# ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif VM_BUG_ON(page_count(page) == 0); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 2e681d9555b..58969b2a8a8 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -27,6 +27,21 @@ asmlinkage void preempt_schedule(void); +#define preempt_check_resched() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + preempt_schedule(); \ +} while (0) + +#else /* !CONFIG_PREEMPT */ + +#define preempt_check_resched() do { } while (0) + +#endif /* CONFIG_PREEMPT */ + + +#ifdef CONFIG_PREEMPT_COUNT + #define preempt_disable() \ do { \ inc_preempt_count(); \ @@ -39,12 +54,6 @@ do
{ \ dec_preempt_count(); \ } while (0) -#define preempt_check_resched() \ -do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ - preempt_schedule(); \ -} while (0) - #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -80,18 +89,17 @@ do { \ preempt_check_resched(); \ } while (0) -#else +#else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) -#define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() do { } while (0) #define preempt_enable_no_resched_notrace() do { } while (0) #define preempt_enable_notrace() do { } while (0) -#endif +#endif /* CONFIG_PREEMPT_COUNT */ #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 99f9aa7c280..8f4f881a0ad 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -239,7 +239,7 @@ extern int rcu_read_lock_bh_held(void); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. */ -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { int lockdep_opinion = 0; @@ -250,12 +250,12 @@ static inline int rcu_read_lock_sched_held(void) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { return 1; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -276,17 +276,17 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { return preempt_count() != 0 || irqs_disabled(); } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { return 1; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 483c1ed5bc4..4ecd5cbe7e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2502,7 +2502,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf987b95b35..24e7cb0ba26 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" + select PREEMPT_COUNT help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -52,3 +53,5 @@ config PREEMPT endchoice +config PREEMPT_COUNT + bool \ No newline at end of file diff --git a/kernel/sched.c b/kernel/sched.c index 01d9536aaa8..90ad7cf2b29 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2843,7 +2843,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. 
*/ task_thread_info(p)->preempt_count = 1; #endif -- cgit v1.2.3-70-g09d2 From e7e2ee89a9dbf48d70a922d5625cd7320a27cbff Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 10 May 2011 13:27:21 -0700 Subject: tracing: Schedule a delayed work to call wakeup() When syscall tracing is used by concurrent processes, the wakeup() that is called in the event commit function causes contention on the spin lock of the waitqueue. I enabled the sys_enter_getuid and sys_exit_getuid tracepoints, and by running getuid_microbench from autotest in parallel I found that the contention causes an exponential latency increase in the tracing path. The autotest binary getuid_microbench calls getuid() in a tight loop for the given number of iterations and measures the average time required to complete a single syscall invocation. The patch makes an event commit schedule a delayed work that wakes up the trace wait_queue 2 ms later. This removes the delay caused by contention on the spin lock in wakeup() and amortizes the wakeup() calls over the 2 ms period. In the following example, the script enables the sys_enter_getuid and sys_exit_getuid tracepoints and runs the getuid_microbench in parallel with the given number of processes. The output clearly shows the latency increase caused by contention. $ ~/getuid.sh 1 1000000 calls in 0.720974253 s (720.974253 ns/call) $ ~/getuid.sh 2 1000000 calls in 1.166457554 s (1166.457554 ns/call) 1000000 calls in 1.168933765 s (1168.933765 ns/call) $ ~/getuid.sh 3 1000000 calls in 1.783827516 s (1783.827516 ns/call) 1000000 calls in 1.795553270 s (1795.553270 ns/call) 1000000 calls in 1.796493376 s (1796.493376 ns/call) $ ~/getuid.sh 4 1000000 calls in 4.483041796 s (4483.041796 ns/call) 1000000 calls in 4.484165388 s (4484.165388 ns/call) 1000000 calls in 4.484850762 s (4484.850762 ns/call) 1000000 calls in 4.485643576 s (4485.643576 ns/call) $ ~/getuid.sh 5 1000000 calls in 6.497521653 s (6497.521653 ns/call) 1000000 calls in 6.502000236 s (6502.000236 ns/call) 1000000 calls in 6.501709115 s (6501.709115 ns/call) 1000000 calls in 6.502124100 s (6502.124100 ns/call) 1000000 calls in 6.502936358 s (6502.936358 ns/call) After the patch, the latencies scale better.
1000000 calls in 0.728720455 s (728.720455 ns/call) 1000000 calls in 0.842782857 s (842.782857 ns/call) 1000000 calls in 0.883803135 s (883.803135 ns/call) 1000000 calls in 0.902077764 s (902.077764 ns/call) 1000000 calls in 0.902838202 s (902.838202 ns/call) 1000000 calls in 0.908896885 s (908.896885 ns/call) 1000000 calls in 0.932523515 s (932.523515 ns/call) 1000000 calls in 0.958009672 s (958.009672 ns/call) 1000000 calls in 0.986188020 s (986.188020 ns/call) 1000000 calls in 0.989771102 s (989.771102 ns/call) 1000000 calls in 0.933518391 s (933.518391 ns/call) 1000000 calls in 0.958897947 s (958.897947 ns/call) 1000000 calls in 1.031038897 s (1031.038897 ns/call) 1000000 calls in 1.089516025 s (1089.516025 ns/call) 1000000 calls in 1.141998347 s (1141.998347 ns/call) Signed-off-by: Vaibhav Nagarnaik Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Michael Rubin Cc: David Sharp Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1305059241-7629-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee9c921d7f2..71777c8fe36 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); +static void wakeup_work_handler(struct work_struct *work) +{ + wake_up(&trace_wait); +} + +static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); + /** * trace_wake_up - wake up tasks waiting for trace input * - * Simply wakes up any task that is blocked on the trace_wait - * queue. These is used with trace_poll for tasks polling the trace. + * Schedules a delayed work to wake up any task that is blocked on the + * trace_wait queue. These is used with trace_poll for tasks polling the + * trace. */ void trace_wake_up(void) { - int cpu; + const unsigned long delay = msecs_to_jiffies(2); if (trace_flags & TRACE_ITER_BLOCK) return; - /* - * The runqueue_is_locked() can fail, but this is the best we - * have for now: - */ - cpu = get_cpu(); - if (!runqueue_is_locked(cpu)) - wake_up(&trace_wait); - put_cpu(); + schedule_delayed_work(&wakeup_work, delay); } static int __init set_buf_size(char *str) -- cgit v1.2.3-70-g09d2 From 7ea5906405a1f3fc1c0033dfd7e02f2cfd1de5e5 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 3 May 2011 17:56:42 -0700 Subject: tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. 
The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 36 +++++++++++++++++++----------------- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 7 +++---- 4 files changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index ab38ac80b0f..b891de96000 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -169,7 +169,7 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, size_t ring_buffer_page_len(void *page); -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b0c7aa40794..725153d6cf7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -997,13 +997,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct buffer_page *bpage, *tmp; - unsigned long addr; LIST_HEAD(pages); unsigned i; WARN_ON(!nr_pages); for (i = 0; i < nr_pages; i++) { + struct page *page; + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) @@ -1013,10 +1014,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, &pages); - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + GFP_KERNEL, 0); + if (!page) goto free_pages; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); } @@ -1045,7 +1047,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; - unsigned long addr; + struct page *page; int ret; cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), @@ -1067,10 +1069,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) goto fail_free_reader; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -1314,7 +1316,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) unsigned nr_pages, rm_pages, new_pages; struct buffer_page *bpage, *tmp; unsigned long buffer_size; - unsigned long addr; LIST_HEAD(pages); int i, cpu; @@ -1375,16 +1376,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { + struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if 
(!bpage) goto free_pages; list_add(&bpage->list, &pages); - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) goto free_pages; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); } } @@ -3730,16 +3732,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or NULL on error. */ -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { struct buffer_data_page *bpage; - unsigned long addr; + struct page *page; - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) return NULL; - bpage = (void *)addr; + bpage = page_address(page); rb_init_page(bpage); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 302f8a61463..a5457d577b9 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -106,7 +106,7 @@ static enum event_status read_page(int cpu) int inc; int i; - bpage = ring_buffer_alloc_read_page(buffer); + bpage = ring_buffer_alloc_read_page(buffer, cpu); if (!bpage) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 71777c8fe36..61fda6b6f1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,7 +3697,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return 0; if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); if (!info->spare) return -ENOMEM; @@ -3854,7 +3854,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer); + ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); if (!ref->page) { kfree(ref); break; } @@ -3863,8 +3863,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, r = ring_buffer_read_page(ref->buffer, &ref->page, len, info->cpu, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, - ref->page); + ring_buffer_free_read_page(ref->buffer, ref->page); kfree(ref); break; } -- cgit v1.2.3-70-g09d2 From 4f271a2a60c748599b30bb4dafff30d770439b96 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Mon, 13 Jun 2011 17:51:57 -0700 Subject: tracing: Add a proc file to stop tracing and free buffer The proc file entry buffer_size_kb is used to set the size of the tracing buffer. The memory to expand the buffer size is kernel memory. Consider a use case where tracing is handled by a user space utility, which acts as a gatekeeper for tracing requests. In an OOM condition, tracing is considered a low-priority task, and if the utility gets killed, the ring buffer memory cannot be released back to the kernel. This patch adds a proc file called "free_buffer" whose purpose is to stop tracing and free up the ring buffer when it is closed. The user space process can then set the desired size in the buffer_size_kb file and open an fd to the "free_buffer" file. Under an OOM condition, if the process gets killed, the kernel closes the file descriptor. The release handler stops the tracing and releases the kernel memory automatically.
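As a rough sketch of the intended usage (illustrative only; it assumes debugfs is mounted at /sys/kernel/debug), the gatekeeper ties the buffer's lifetime to an open file descriptor:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical gatekeeper: path assumes the usual debugfs mount */
	int fd = open("/sys/kernel/debug/tracing/free_buffer", O_WRONLY);

	if (fd < 0)
		return 1;

	/* ... tracing requests are serviced while fd stays open ... */

	/*
	 * Whether the process exits normally or is killed by the OOM
	 * killer, the kernel closes fd, and the release handler stops
	 * tracing and frees the ring buffer.
	 */
	return close(fd);
}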
Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Michael Rubin Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Link: http://lkml.kernel.org/r/1308012717-11148-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 108 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 61fda6b6f1a..9c557ae6a21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2768,7 +2768,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } -static int tracing_resize_ring_buffer(unsigned long size) +static int __tracing_resize_ring_buffer(unsigned long size) { int ret; @@ -2820,6 +2820,41 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } +static ssize_t tracing_resize_ring_buffer(unsigned long size) +{ + int cpu, ret = size; + + mutex_lock(&trace_types_lock); + + tracing_stop(); + + /* disable all cpu buffers */ + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_inc(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_inc(&max_tr.data[cpu]->disabled); + } + + if (size != global_trace.entries) + ret = __tracing_resize_ring_buffer(size); + + if (ret < 0) + ret = -ENOMEM; + + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_dec(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_dec(&max_tr.data[cpu]->disabled); + } + + tracing_start(); + mutex_unlock(&trace_types_lock); + + return ret; +} + /** * tracing_update_buffers - used by tracing facility to expand ring buffers @@ -2837,7 +2872,7 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size); mutex_unlock(&trace_types_lock); return ret; @@ -2861,7 +2896,7 @@ static int tracing_set_tracer(const char *buf) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) goto out; ret = 0; @@ -3436,7 +3471,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret, cpu; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -3454,48 +3489,43 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (!val) return -EINVAL; - mutex_lock(&trace_types_lock); - - tracing_stop(); - - /* disable all cpu buffers */ - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_inc(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_inc(&max_tr.data[cpu]->disabled); - } - /* value is in KB */ val <<= 10; - if (val != global_trace.entries) { - ret = tracing_resize_ring_buffer(val); - if (ret < 0) { - cnt = ret; - goto out; - } - } + ret = tracing_resize_ring_buffer(val); + if (ret < 0) + return ret; *ppos += cnt; - /* If check pages failed, return ENOMEM */ - if (tracing_disabled) - cnt = -ENOMEM; - out: - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_dec(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_dec(&max_tr.data[cpu]->disabled); - } + return cnt; +} - tracing_start(); - mutex_unlock(&trace_types_lock); +static ssize_t +tracing_free_buffer_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* + * There is no need to read what the user has written, this 
function + * is just to make sure that there is no error when "echo" is used + */ + + *ppos += cnt; return cnt; } +static int +tracing_free_buffer_release(struct inode *inode, struct file *filp) +{ + /* disable tracing */ + tracing_off(); + /* resize the ring buffer to 0 */ + tracing_resize_ring_buffer(0); + + return 0; +} + static int mark_printk(const char *fmt, ...) { int ret; @@ -3641,6 +3671,11 @@ static const struct file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_free_buffer_fops = { + .write = tracing_free_buffer_write, + .release = tracing_free_buffer_release, +}; + static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, @@ -4365,6 +4400,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, + &global_trace, &tracing_free_buffer_fops); + trace_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); -- cgit v1.2.3-70-g09d2 From cf30cf67d6c7592c670ec946d89fc15ee0deb0eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jun 2011 22:44:07 -0400 Subject: tracing: Add disable_on_free option Add a trace option to disable tracing on free. When this option is set, a write into the free_buffer file will not only shrink the ring buffer down to zero, but it will also disable tracing. Cc: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++-- kernel/trace/trace.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9c557ae6a21..42fdf3adff3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -425,6 +425,7 @@ static const char *trace_options[] = { "graph-time", "record-cmd", "overwrite", + "disable_on_free", NULL }; @@ -3518,8 +3519,9 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { - /* disable tracing */ - tracing_off(); + /* disable tracing ? */ + if (trace_flags & TRACE_ITER_STOP_ON_FREE) + tracing_off(); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(0); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f8591f61..742f545ae18 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -609,6 +609,7 @@ enum trace_iterator_flags { TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, + TRACE_ITER_STOP_ON_FREE = 0x400000, }; /* -- cgit v1.2.3-70-g09d2 From bd38c0e6f98326132a691d73b2056b426423c638 Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Tue, 31 May 2011 20:51:55 +0100 Subject: ftrace: Fixed an include coding style issue Removed an include that was already declared, and fixed the brace style of struct initializers.
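For reference, a minimal sketch of the style being enforced (foo_ops and bar_func are made-up names, not from the patch):

struct foo_ops {
	void (*func)(void);
};

static void bar_func(void) { }

/* preferred: the initializer's opening brace stays on the declaration line */
static struct foo_ops bar_ops = {
	.func = bar_func,
};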
Cc: Frederic Weisbecker Signed-off-by: Paul McQuade Link: http://lkml.kernel.org/r/4DE59711.3090900@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1ee417fcbfa..e1538071660 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,7 +32,6 @@ #include -#include #include #include "trace_output.h" @@ -82,8 +81,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = -{ +static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, }; @@ -785,8 +783,7 @@ static void unregister_ftrace_profiler(void) unregister_ftrace_graph(); } #else -static struct ftrace_ops ftrace_profile_ops __read_mostly = -{ +static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, }; -- cgit v1.2.3-70-g09d2 From 84c15027a7f2fbd7f1180d7cbb60e41abbbaedd2 Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Tue, 31 May 2011 20:51:55 +0100 Subject: async: Fixed an include coding style issue Added two includes that were needed and removed one that was redundant. Added KERN_DEBUG to printk() functions. Acked-by: Arjan van de Ven Signed-off-by: Paul McQuade Link: http://lkml.kernel.org/r/4DE596B4.7030904@gmail.com Signed-off-by: Steven Rostedt --- kernel/async.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index cd9dbb913c7..d5fe7af0de2 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel. */ #include +#include +#include #include #include #include #include #include -#include static async_cookie_t next_cookie = 1; @@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 2) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, + printk(KERN_DEBUG "calling %lli_%pF @ %i\n", + (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } @@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - printk("initcall %lli_%pF returned 0 after %lld usecs\n", + printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); @@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, ktime_t starttime, delta, endtime; if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk("async_waiting @ %i\n", task_pid_nr(current)); + printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } @@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, endtime = ktime_get(); delta = ktime_sub(endtime, starttime); - printk("async_continuing @ %i after %lli usec\n", + printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", task_pid_nr(current), (long long)ktime_to_ns(delta) >> 10); } -- cgit v1.2.3-70-g09d2 From 321e68b095addc473ca52ced9acfa59be88f76e6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:47 +0200 Subject: tracing, function_graph: Remove dependency of abstime and duration fields on latency The display of the absolute time and duration fields is based on the latency field.
This was added during the irqsoff/wakeup tracers graph support changes. It's causing confusion about which fields will be displayed for the function_graph tracer itself. So I'm removing this dependency, and adding absolute time and duration fields to the preemptirqsoff, preemptoff, irqsoff and wakeup tracers. With the following commands: # echo function_graph > ./current_tracer # cat trace This is what it looked like before: # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 0) 0.068 us | } /* page_add_file_rmap */ 0) | _raw_spin_unlock() { ... This is what it looks like now: # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 0) 0.068 us | } /* add_preempt_count */ 0) 0.993 us | } /* vfsmount_lock_local_lock */ ... For the preemptirqsoff, preemptoff, irqsoff and wakeup tracers, this is what it looked like before: SNIP # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> lock-depth # |||| / # CPU TASK/PID ||||| DURATION FUNCTION CALLS # | | | ||||| | | | | | | 1) -0 | d..1 0.000 us | acpi_idle_enter_simple(); ... This is what it looks like now: SNIP # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 19.847735 | 1) -0 | d..1 0.000 us | acpi_idle_enter_simple(); ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-2-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 19 +++---------------- kernel/trace/trace_irqsoff.c | 4 +++- kernel/trace/trace_sched_wakeup.c | 4 +++- 3 files changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 962cdb24ed8..352652eb9aa 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1207,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -__print_graph_function_flags(struct trace_iterator *iter, u32 flags) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1270,18 +1270,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return __print_graph_function_flags(iter, tracer_flags.val); -} - -enum print_line_t print_graph_function_flags(struct trace_iterator *iter, - u32 flags) -{ - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - return __print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, tracer_flags.val); } static enum print_line_t @@ -1364,9 +1353,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) return; print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; + } __print_graph_headers_flags(s, flags); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c77424be284..667aa8cc0cf 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ - TRACE_GRAPH_PRINT_PROC) + TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME |
\ + TRACE_GRAPH_PRINT_DURATION) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f029dd4fd2c..e4a70c0c71b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter) graph_trace_close(iter); } -#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { -- cgit v1.2.3-70-g09d2 From ffeb80fc30acbf6bd51cb47a1815f621a9d017dc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:48 +0200 Subject: tracing, function_graph: Merge overhead and duration display functions The functions print_graph_overhead() and print_graph_duration() display data for one field - DURATION. I merged them into a single function, print_graph_duration(), and added a way to display the empty parts of the field. This way the print_graph_irq() function can use this column to display the IRQ signs if needed, and the DURATION field details stay inside the print_graph_duration() function. Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-3-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 148 +++++++++++++++++------------------ 1 file changed, 74 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 352652eb9aa..44b955fd87b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = { static struct trace_array *graph_array; +/* + * DURATION column is being also used to display IRQ signs, + * following values are used by print_graph_irq and others + * to fill in space into DURATION column. + */ +enum { + DURATION_FILL_FULL = -1, + DURATION_FILL_START = -2, + DURATION_FILL_END = -3, +}; + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags); /* Add a function return address to the trace stack on thread info.*/ int @@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter, return next; } -/* Signal a overhead of time execution to the output */ -static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s, - u32 flags) -{ - /* If duration disappear, we don't need anything */ - if (!(flags & TRACE_GRAPH_PRINT_DURATION)) - return 1; - - /* Non nested entry or return */ - if (duration == -1) - return trace_seq_printf(s, " "); - - if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - return trace_seq_printf(s, "!
"); - - /* Duration exceeded 10 msecs */ - if (duration > 10000ULL) - return trace_seq_printf(s, "+ "); - } - - return trace_seq_printf(s, " "); -} - static int print_graph_abs_time(u64 t, struct trace_seq *s) { unsigned long usecs_rem; @@ -650,9 +638,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + ret = print_graph_duration(DURATION_FILL_START, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; if (type == TRACE_GRAPH_ENT) ret = trace_seq_printf(s, "==========>"); @@ -662,9 +650,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Don't close the duration column if haven't one */ - if (flags & TRACE_GRAPH_PRINT_DURATION) - trace_seq_printf(s, " |"); + ret = print_graph_duration(DURATION_FILL_END, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; + ret = trace_seq_printf(s, "\n"); if (!ret) @@ -716,9 +705,48 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) } static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags) { - int ret; + int ret = -1; + + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) + return TRACE_TYPE_HANDLED; + + /* No real adata, just filling the column with spaces */ + switch (duration) { + case DURATION_FILL_FULL: + ret = trace_seq_printf(s, " | "); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + case DURATION_FILL_START: + ret = trace_seq_printf(s, " "); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + case DURATION_FILL_END: + ret = trace_seq_printf(s, " |"); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + } + + /* Signal a overhead of time execution to the output */ + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + ret = trace_seq_printf(s, "! "); + /* Duration exceeded 10 msecs */ + else if (duration > 10000ULL) + ret = trace_seq_printf(s, "+ "); + } + + /* + * The -1 means we either did not exceed the duration tresholds + * or we dont want to print out the overhead. Either way we need + * to fill out the space. 
+ */ + if (ret == -1) + ret = trace_seq_printf(s, " "); + + /* Catching here any failure happenned above */ + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; ret = trace_print_graph_duration(duration, s); if (ret != TRACE_TYPE_HANDLED) @@ -767,18 +795,11 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->enter_funcs[call->depth] = 0; } - /* Overhead */ - ret = print_graph_overhead(duration, s, flags); - if (!ret) + /* Overhead and duration */ + ret = print_graph_duration(duration, s, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Duration */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -815,17 +836,10 @@ print_graph_entry_nested(struct trace_iterator *iter, cpu_data->enter_funcs[call->depth] = call->func; } - /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* No time */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -1078,18 +1092,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; - /* Overhead */ - ret = print_graph_overhead(duration, s, flags); - if (!ret) + /* Overhead and duration */ + ret = print_graph_duration(duration, s, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Duration */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -1146,17 +1153,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; - /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* No time */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; /* Indentation */ if (depth > 0) -- cgit v1.2.3-70-g09d2 From f56e7f8efb4ec200364f690a9902713410e24d47 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:49 +0200 Subject: tracing, function: Fix trace header to follow context-info option The header display of function tracer does not follow the context-info option, so field names are displayed even if this option is off. Added check for TRACE_ITER_CONTEXT_INFO trace_flags. With following commands: # echo function > ./current_tracer # echo 0 > options/context-info # cat trace This is what it looked like before: # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | add_preempt_count <-schedule rcu_note_context_switch <-schedule ... This is what it looks like now: # tracer: function # _raw_spin_unlock_irqrestore <-hrtimer_try_to_cancel ... 
Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-4-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42fdf3adff3..cf22b4bf989 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2053,6 +2053,9 @@ void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) -- cgit v1.2.3-70-g09d2 From 199abfab40389963b397c2982222e68ea782b2cf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:50 +0200 Subject: tracing, function_graph: Remove lock-depth from latency trace The lock_depth was removed in commit e6e1e25 tracing: Remove lock_depth from event entry Removing the lock_depth info from function_graph latency header. With following commands: # echo function_graph > ./current_tracer # echo 1 > options/latency-format # cat trace This is what it looked like before: # tracer: function_graph # # function_graph latency trace v1.1.5 on 3.0.0-rc1-tip+ # -------------------------------------------------------------------- # latency: 0 us, #59756/311298, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> lock-depth # |||| / # CPU||||| DURATION FUNCTION CALLS # | ||||| | | | | | | 0) .... 0.068 us | } /* __rcu_read_unlock */ ... This is what it looks like now: # tracer: function_graph # # function_graph latency trace v1.1.5 on 3.0.0-rc1-tip+ # -------------------------------------------------------------------- # latency: 0 us, #59747/1744610, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # CPU|||| DURATION FUNCTION CALLS # | |||| | | | | | | 0) ..s. 1.641 us | } /* __rcu_process_callbacks */ ... 
Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-5-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 44b955fd87b..1da5b97d228 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1298,8 +1298,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); - seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); - seq_printf(s, "#%.*s|||| / \n", size, spaces); + seq_printf(s, "#%.*s||| / \n", size, spaces); } static void __print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1318,7 +1317,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " TASK/PID "); if (lat) - seq_printf(s, "|||||"); + seq_printf(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); @@ -1332,7 +1331,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " | | "); if (lat) - seq_printf(s, "|||||"); + seq_printf(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); -- cgit v1.2.3-70-g09d2 From 749230b06a753a22f6ed96e5dd60815d6ab12865 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:51 +0200 Subject: tracing, function_graph: Add context-info support for function_graph tracer The function_graph tracer does not follow global context-info option. Adding TRACE_ITER_CONTEXT_INFO trace_flags check to enable it. With following commands: # echo function_graph > ./current_tracer # echo 0 > options/context-info # cat trace This is what it looked like before: # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 1) 0.079 us | } /* __vma_link_rb */ 1) 0.056 us | copy_page_range(); 1) | security_vm_enough_memory() { ... This is what it looks like now: # tracer: function_graph # } /* update_ts_time_stats */ timekeeping_max_deferment(); ... 
Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-6-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 53 +++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 1da5b97d228..e8d6bb55d71 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -613,28 +613,30 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; - /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } - /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } - /* Proc */ - if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } } /* No overhead */ @@ -710,8 +712,9 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, { int ret = -1; - if (!(flags & TRACE_GRAPH_PRINT_DURATION)) - return TRACE_TYPE_HANDLED; + if (!(flags & TRACE_GRAPH_PRINT_DURATION) || + !(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return TRACE_TYPE_HANDLED; /* No real adata, just filling the column with spaces */ switch (duration) { @@ -879,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return 0; + /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); @@ -1346,6 +1352,9 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) { struct trace_iterator *iter = s->private; + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + if (trace_flags & TRACE_ITER_LATENCY_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) -- cgit v1.2.3-70-g09d2 From 22fe9b54d859e53bfbbbdc1a0a77a82bc453927c Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Tue, 7 Jun 2011 21:58:27 +0200 Subject: tracing: Convert to kstrtoul_from_user This patch replaces the code for getting an unsigned long from a userspace buffer by a simple call to kstrtoul_from_user. This makes it easier to read and less error prone.
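Schematically, each converted write handler collapses from the open-coded copy-and-parse sequence into a single call (the handler below is a made-up example, not one of the converted functions):

#include <linux/fs.h>
#include <linux/kernel.h>

static ssize_t
example_write(struct file *filp, const char __user *ubuf,
	      size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/*
	 * Copies at most cnt bytes from user space, NUL-terminates them
	 * and parses the result as a base-10 unsigned long, replacing the
	 * copy_from_user()/strict_strtoul() pair shown in the diffs below.
	 */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	/* ... act on val ... */
	*ppos += cnt;
	return cnt;
}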
Signed-off-by: Peter Huewe Link: http://lkml.kernel.org/r/1307476707-14762-1-git-send-email-peterhuewe@gmx.de Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 13 ++------- kernel/trace/ring_buffer.c | 13 ++------- kernel/trace/trace.c | 65 +++++++-------------------------------------- kernel/trace/trace_events.c | 26 +++--------------- kernel/trace/trace_stack.c | 13 ++------- 5 files changed, 20 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e1538071660..458018a1ac9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -803,19 +803,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; /* big enough to hold a number */ int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; val = !!val; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 725153d6cf7..f00ede314eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3980,20 +3980,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *p = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cf22b4bf989..c977018e87c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2706,20 +2706,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; val = !!val; @@ -3006,20 +2997,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *ptr = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; *ptr = val * 1000; @@ -3474,19 +3456,10 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; /* must have at least 1 entry */ @@ -4139,19 +4112,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, { struct trace_option_dentry *topt = filp->private_data; unsigned long val; - char buf[64]; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - 
ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val != 0 && val != 1) @@ -4199,20 +4163,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { long index = (long)filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val != 0 && val != 1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec399f2a..4d7e1498ae9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -486,20 +486,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; ret = tracing_update_buffers(); @@ -571,19 +562,10 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, { const char *system = filp->private_data; unsigned long val; - char buf[64]; ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; ret = tracing_update_buffers(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b0b53b8e4c2..77575b386d9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, { long *ptr = filp->private_data; unsigned long val, flags; - char buf[64]; int ret; int cpu; - if (count >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, count, 10, &val); + if (ret) return ret; local_irq_save(flags); -- cgit v1.2.3-70-g09d2 From d7ec4bfed6c97405c6417970ba06c439e08ab8e3 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 7 Jun 2011 17:01:42 -0700 Subject: ring-buffer: Set __GFP_NORETRY flag for ring buffer allocating process The tracing ring buffer is allocated from kernel memory. While allocating a large chunk of memory, OOM might happen which destabilizes the system. Thus random processes might get killed during the allocation. This patch adds __GFP_NORETRY flag to the ring buffer allocation calls to make it fail more gracefully if the system will not be able to complete the allocation request. 
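A minimal sketch of the resulting allocation pattern, mirroring the calls changed in the diff below:

	/*
	 * Sketch: with __GFP_NORETRY the page allocator gives up instead of
	 * looping or invoking the OOM killer, so the caller can unwind.
	 */
	struct page *page = alloc_pages_node(cpu_to_node(cpu),
					     GFP_KERNEL | __GFP_NORETRY, 0);
	if (!page)
		goto free_pages;	/* fail the request, keep the system stable */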
Acked-by: David Rientjes Signed-off-by: Vaibhav Nagarnaik Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1307491302-9236-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f00ede314eb..731201bf4ac 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1004,9 +1004,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { struct page *page; - + /* + * __GFP_NORETRY flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is + * not destabilized. + */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; @@ -1015,7 +1020,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, &pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - GFP_KERNEL, 0); + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -1377,13 +1382,20 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { struct page *page; + /* + * __GFP_NORETRY flag makes sure that the allocation + * fails gracefully without invoking oom-killer and + * the system is not destabilized. + */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, &pages); - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -3737,7 +3749,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) struct buffer_data_page *bpage; struct page *page; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) return NULL; -- cgit v1.2.3-70-g09d2 From c624d33f61cd05241e85b906311f0b712fdb0f32 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:27 +0900 Subject: stack_trace: Add weak save_stack_trace_regs() Add a weak symbol for save_stack_trace_regs(), just as is done for save_stack_trace_tsk(), since it is not yet implemented on any architecture except x86.
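A __weak definition is only a link-time default; an architecture that provides a strong definition overrides it automatically. A hedged sketch of what an arch-side implementation would look like (the unwind helper is hypothetical):

	/* arch/<arch>/kernel/stacktrace.c: a strong symbol wins over the __weak stub */
	void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
	{
		/* walk the stack starting from the register snapshot in @regs */
		arch_unwind_from_regs(regs, trace);	/* hypothetical helper */
	}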
Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: yrl.pp-manager.tt@hitachi.com Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070927.17777.37895.stgit@fedora15 Signed-off-by: Steven Rostedt --- kernel/stacktrace.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index eb212f8f8bc..d20c6983aad 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces) EXPORT_SYMBOL_GPL(print_stack_trace); /* - * Architectures that do not implement save_stack_trace_tsk get this - * weak alias and a once-per-bootup warning (whenever this facility - * is utilized - for example by procfs): + * Architectures that do not implement save_stack_trace_tsk or + * save_stack_trace_regs get this weak alias and a once-per-bootup warning + * (whenever this facility is utilized - for example by procfs): */ __weak void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); } + +__weak void +save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +{ + WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); +} -- cgit v1.2.3-70-g09d2 From 1fd8df2c3970c9e7e4e262354154ee39e58bdd7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:34 +0900 Subject: tracing/kprobes: Fix kprobe-tracer to support stack trace Fix to support kernel stack trace correctly on kprobe-tracer. Since the execution path of kprobe-based dynamic events is different from other tracepoint-based events, normal ftrace_trace_stack() doesn't work correctly. To fix that, this introduces ftrace_trace_stack_regs() which traces stack via pt_regs instead of current stack register. e.g. # echo p schedule+4 > /sys/kernel/debug/tracing/kprobe_events # echo 1 > /sys/kernel/debug/tracing/options/stacktrace # echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable # head -n 20 /sys/kernel/debug/tracing/trace bash-2968 [000] 10297.050245: p_schedule_4: (schedule+0x4/0x4ca) bash-2968 [000] 10297.050247: => schedule_timeout => n_tty_read => tty_read => vfs_read => sys_read => system_call_fastpath kworker/0:1-2940 [000] 10297.050265: p_schedule_4: (schedule+0x4/0x4ca) kworker/0:1-2940 [000] 10297.050266: => worker_thread => kthread => kernel_thread_helper sshd-1132 [000] 10297.050365: p_schedule_4: (schedule+0x4/0x4ca) sshd-1132 [000] 10297.050365: => sysret_careful Note: Even with this fix, the first entry will be skipped if the probe is put on the function entry area before the frame pointer is set up (usually, that is 4 bytes (push %bp; mov %sp %bp) on x86), because stack unwinder depends on the frame pointer. 
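The core of the change is a small dispatch in the stack-trace path; a sketch of the hunk added to __ftrace_trace_stack() below:

	/* prefer the register snapshot captured at the probe point, if given */
	if (regs)
		save_stack_trace_regs(regs, &trace);
	else
		save_stack_trace(&trace);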
Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: yrl.pp-manager.tt@hitachi.com Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070934.17777.17116.stgit@fedora15 Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ++++ kernel/trace/trace.c | 34 +++++++++++++++++++++++++++++----- kernel/trace/trace.h | 9 +++++++++ kernel/trace/trace_kprobe.c | 6 ++++-- 4 files changed, 46 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 59d3ef100eb..b1e69eefc20 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -129,6 +129,10 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c977018e87c..d9c16123f6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs) +{ + ring_buffer_unlock_commit(buffer, event); + + ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); + ftrace_trace_userstack(buffer, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); + void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { @@ -1238,7 +1250,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) + int skip, int pc, struct pt_regs *regs) { struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; @@ -1257,24 +1269,36 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; trace.entries = entry->caller; - save_stack_trace(&trace); + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); } +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(buffer, flags, skip, pc, regs); +} + void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, flags, skip, pc); + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); } void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc); + __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); } /** @@ -1290,7 +1314,7 @@ void trace_dump_stack(void) local_save_flags(flags); /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); + 
__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 742f545ae18..a3e2db70807 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -389,6 +389,9 @@ void update_max_tr_single(struct trace_array *tr, void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc); +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); @@ -400,6 +403,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, { } +static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, + unsigned long flags, int skip, + int pc, struct pt_regs *regs) +{ +} + static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f925c45f0af..7053ef3d73d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1397,7 +1397,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_nowake_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Kretprobe handler */ @@ -1429,7 +1430,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_nowake_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Event entry printers */ -- cgit v1.2.3-70-g09d2 From 73ddff2bee159ffb580bd24faf625cd5e628f5ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Jun 2011 11:20:14 +0200 Subject: job control: introduce JOBCTL_TRAP_STOP and use it for group stop trap do_signal_stop() implemented both normal group stop and trap for group stop while ptraced. This approach has been enough but scheduled changes require trap mechanism which can be used in more generic manner and using group stop trap for generic trap site simplifies both userland visible interface and implementation. This patch adds a new jobctl flag - JOBCTL_TRAP_STOP. When set, it triggers a trap site, which behaves like group stop trap, in get_signal_to_deliver() after checking for pending signals. While ptraced, do_signal_stop() doesn't stop itself. It initiates group stop if requested and schedules JOBCTL_TRAP_STOP and returns. The caller - get_signal_to_deliver() - is responsible for checking whether TRAP_STOP is pending afterwards and handling it. ptrace_attach() is updated to use JOBCTL_TRAP_STOP instead of JOBCTL_STOP_PENDING and __ptrace_unlink() to clear all pending trap bits and TRAPPING so that TRAP_STOP and future trap bits don't linger after detach. While at it, add proper function comment to do_signal_stop() and make it return bool. -v2: __ptrace_unlink() updated to clear JOBCTL_TRAP_MASK and TRAPPING instead of JOBCTL_PENDING_MASK. This avoids accidentally clearing JOBCTL_STOP_CONSUME. Spotted by Oleg. 
-v3: do_signal_stop() updated to return %false without dropping siglock while ptraced and TRAP_STOP check moved inside for(;;) loop after group stop participation. This avoids unnecessary relocking and also will help avoiding unnecessary traps by consuming group stop before handling pending traps. -v4: Jobctl trap handling moved into a separate function - do_jobctl_trap(). Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- include/linux/sched.h | 6 +++- kernel/ptrace.c | 12 +++++-- kernel/signal.c | 90 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 75 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5157bd9eee3..8bd84b83a35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1810,17 +1810,21 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ +#define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -#define JOBCTL_PENDING_MASK JOBCTL_STOP_PENDING +#define JOBCTL_TRAP_MASK JOBCTL_TRAP_STOP +#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask); +extern void task_clear_jobctl_trapping(struct task_struct *task); extern void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 7f05f3a1267..45a8a4c5d8b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -82,6 +82,13 @@ void __ptrace_unlink(struct task_struct *child) spin_lock(&child->sighand->siglock); + /* + * Clear all pending traps and TRAPPING. TRAPPING should be + * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. + */ + task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK); + task_clear_jobctl_trapping(child); + /* * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and * @child isn't dead. @@ -246,7 +253,7 @@ static int ptrace_attach(struct task_struct *task) spin_lock(&task->sighand->siglock); /* - * If the task is already STOPPED, set JOBCTL_STOP_PENDING and + * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING * will be cleared if the child completes the transition or any * event which clears the group stop states happens. We'll wait @@ -263,8 +270,7 @@ static int ptrace_attach(struct task_struct *task) * in and out of STOPPED are protected by siglock. */ if (task_is_stopped(task) && - task_set_jobctl_pending(task, - JOBCTL_STOP_PENDING | JOBCTL_TRAPPING)) + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) signal_wake_up(task, 1); spin_unlock(&task->sighand->siglock); diff --git a/kernel/signal.c b/kernel/signal.c index c99b8b5c0be..b5f55ca1f43 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -266,7 +266,7 @@ bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) * CONTEXT: * Must be called with @task->sighand->siglock held. 
*/ -static void task_clear_jobctl_trapping(struct task_struct *task) +void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; @@ -1790,13 +1790,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) /* * If @why is CLD_STOPPED, we're trapping to participate in a group * stop. Do the bookkeeping. Note that if SIGCONT was delievered - * while siglock was released for the arch hook, PENDING could be - * clear now. We act as if SIGCONT is received after TASK_TRACED - * is entered - ignore it. + * across siglock relocks since INTERRUPT was scheduled, PENDING + * could be clear now. We act as if SIGCONT is received after + * TASK_TRACED is entered - ignore it. */ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) gstop_done = task_participate_group_stop(current); + /* any trap clears pending STOP trap */ + task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); + /* entering a trap, clear TRAPPING */ task_clear_jobctl_trapping(current); @@ -1888,13 +1891,30 @@ void ptrace_notify(int exit_code) spin_unlock_irq(¤t->sighand->siglock); } -/* - * This performs the stopping for SIGSTOP and other stop signals. - * We have to stop all threads in the thread group. - * Returns non-zero if we've actually stopped and released the siglock. - * Returns zero if we didn't stop and still hold the siglock. +/** + * do_signal_stop - handle group stop for SIGSTOP and other stop signals + * @signr: signr causing group stop if initiating + * + * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr + * and participate in it. If already set, participate in the existing + * group stop. If participated in a group stop (and thus slept), %true is + * returned with siglock released. + * + * If ptraced, this function doesn't handle stop itself. Instead, + * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock + * untouched. The caller must ensure that INTERRUPT trap handling takes + * places afterwards. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, which is released + * on %true return. + * + * RETURNS: + * %false if group stop is already cancelled or ptrace trap is scheduled. + * %true if participated in group stop. */ -static int do_signal_stop(int signr) +static bool do_signal_stop(int signr) + __releases(¤t->sighand->siglock) { struct signal_struct *sig = current->signal; @@ -1907,7 +1927,7 @@ static int do_signal_stop(int signr) if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) - return 0; + return false; /* * There is no group stop already in progress. We must * initiate one now. @@ -1951,7 +1971,7 @@ static int do_signal_stop(int signr) } } } -retry: + if (likely(!task_ptrace(current))) { int notify = 0; @@ -1983,27 +2003,33 @@ retry: /* Now we don't run again until woken by SIGCONT or SIGKILL */ schedule(); - - spin_lock_irq(¤t->sighand->siglock); + return true; } else { - ptrace_stop(current->jobctl & JOBCTL_STOP_SIGMASK, - CLD_STOPPED, 0, NULL); - current->exit_code = 0; - } - - /* - * JOBCTL_STOP_PENDING could be set if another group stop has - * started since being woken up or ptrace wants us to transit - * between TASK_STOPPED and TRACED. Retry group stop. - */ - if (current->jobctl & JOBCTL_STOP_PENDING) { - WARN_ON_ONCE(!(current->jobctl & JOBCTL_STOP_SIGMASK)); - goto retry; + /* + * While ptraced, group stop is handled by STOP trap. + * Schedule it and let the caller deal with it. 
+ */ + task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); + return false; } +} - spin_unlock_irq(¤t->sighand->siglock); +/** + * do_jobctl_trap - take care of ptrace jobctl traps + * + * It is currently used only to trap for group stop while ptraced. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, which may be + * released and re-acquired before returning with intervening sleep. + */ +static void do_jobctl_trap(void) +{ + int signr = current->jobctl & JOBCTL_STOP_SIGMASK; - return 1; + WARN_ON_ONCE(!signr); + ptrace_stop(signr, CLD_STOPPED, 0, NULL); + current->exit_code = 0; } static int ptrace_signal(int signr, siginfo_t *info, @@ -2110,6 +2136,12 @@ relock: do_signal_stop(0)) goto relock; + if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { + do_jobctl_trap(); + spin_unlock_irq(&sighand->siglock); + goto relock; + } + signr = dequeue_signal(current, ¤t->blocked, info); if (!signr) -- cgit v1.2.3-70-g09d2 From 3544d72a0e10d0aa1c1bd59ed77a53a59cdc12f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Jun 2011 11:20:15 +0200 Subject: ptrace: implement PTRACE_SEIZE PTRACE_ATTACH implicitly issues SIGSTOP on attach which has side effects on tracee signal and job control states. This patch implements a new ptrace request PTRACE_SEIZE which attaches a tracee without trapping it or affecting its signal and job control states. The usage is the same with PTRACE_ATTACH but it takes PTRACE_SEIZE_* flags in @data. Currently, the only defined flag is PTRACE_SEIZE_DEVEL which is a temporary flag to enable PTRACE_SEIZE. PTRACE_SEIZE will change ptrace behaviors outside of attach itself. The changes will be implemented gradually and the DEVEL flag is to prevent programs which expect full SEIZE behavior from using it before all the behavior modifications are complete while allowing unit testing. The flag will be removed once SEIZE behaviors are completely implemented. * PTRACE_SEIZE, unlike ATTACH, doesn't force tracee to trap. After attaching tracee continues to run unless a trap condition occurs. * PTRACE_SEIZE doesn't affect signal or group stop state. * If PTRACE_SEIZE'd, group stop uses PTRACE_EVENT_STOP trap which uses exit_code of (signr | PTRACE_EVENT_STOP << 8) where signr is one of the stopping signals if group stop is in effect or SIGTRAP otherwise, and returns usual trap siginfo on PTRACE_GETSIGINFO instead of NULL. Seizing sets PT_SEIZED in ->ptrace of the tracee. This flag will be used to determine whether new SEIZE behaviors should be enabled. Test program follows. #define PTRACE_SEIZE 0x4206 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts100ms = { .tv_nsec = 100000000 }; static const struct timespec ts1s = { .tv_sec = 1 }; static const struct timespec ts3s = { .tv_sec = 3 }; int main(int argc, char **argv) { pid_t tracee; tracee = fork(); if (tracee == 0) { nanosleep(&ts100ms, NULL); while (1) { printf("tracee: alive\n"); nanosleep(&ts1s, NULL); } } if (argc > 1) kill(tracee, SIGSTOP); nanosleep(&ts100ms, NULL); ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); if (argc > 1) { waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, NULL); } nanosleep(&ts3s, NULL); printf("tracer: exiting\n"); return 0; } When the above program is called w/o argument, tracee is seized while running and remains running. When tracer exits, tracee continues to run and print out messages. 
# ./test-seize-simple tracee: alive tracee: alive tracee: alive tracer: exiting tracee: alive tracee: alive When called with an argument, tracee is seized from stopped state and continued, and returns to stopped state when tracer exits. # ./test-seize tracee: alive tracee: alive tracee: alive tracer: exiting # ps -el|grep test-seize 1 T 0 4720 1 0 80 0 - 941 signal ttyS0 00:00:00 test-seize -v2: SEIZE doesn't schedule TRAP_STOP and leaves tracee running as Jan suggested. -v3: PTRACE_EVENT_STOP traps now report group stop state by signr. If group stop is in effect the stop signal number is returned as part of exit_code; otherwise, SIGTRAP. This was suggested by Denys and Oleg. Signed-off-by: Tejun Heo Cc: Jan Kratochvil Cc: Denys Vlasenko Cc: Oleg Nesterov --- include/linux/ptrace.h | 7 +++++++ kernel/ptrace.c | 35 +++++++++++++++++++++++++++++------ kernel/signal.c | 39 ++++++++++++++++++++++++++++++--------- 3 files changed, 66 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index e93ef1a54fc..67ad3f15232 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -47,6 +47,11 @@ #define PTRACE_GETREGSET 0x4204 #define PTRACE_SETREGSET 0x4205 +#define PTRACE_SEIZE 0x4206 + +/* flags in @data for PTRACE_SEIZE */ +#define PTRACE_SEIZE_DEVEL 0x80000000 /* temp flag for development */ + /* options set using PTRACE_SETOPTIONS */ #define PTRACE_O_TRACESYSGOOD 0x00000001 #define PTRACE_O_TRACEFORK 0x00000002 @@ -65,6 +70,7 @@ #define PTRACE_EVENT_EXEC 4 #define PTRACE_EVENT_VFORK_DONE 5 #define PTRACE_EVENT_EXIT 6 +#define PTRACE_EVENT_STOP 7 #include @@ -77,6 +83,7 @@ * flags. When the a task is stopped the ptracer owns task->ptrace. */ +#define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ #define PT_TRACESYSGOOD 0x00000004 diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 45a8a4c5d8b..dcf9f974198 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -209,10 +209,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } -static int ptrace_attach(struct task_struct *task) +static int ptrace_attach(struct task_struct *task, long request, + unsigned long flags) { + bool seize = (request == PTRACE_SEIZE); int retval; + /* + * SEIZE will enable new ptrace behaviors which will be implemented + * gradually. SEIZE_DEVEL is used to prevent applications + * expecting full SEIZE behaviors trapping on kernel commits which + * are still in the process of implementing them. + * + * Only test programs for new ptrace behaviors being implemented + * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. + * + * Once SEIZE behaviors are completely implemented, this flag and + * the following test will be removed. 
+ */ + retval = -EIO; + if (seize && !(flags & PTRACE_SEIZE_DEVEL)) + goto out; + audit_ptrace(task); retval = -EPERM; @@ -244,11 +262,16 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; + if (seize) + task->ptrace |= PT_SEIZED; if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); - send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); + + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); spin_lock(&task->sighand->siglock); @@ -785,8 +808,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out; } - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(child, request, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -927,8 +950,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, goto out; } - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(child, request, data); /* * Some architectures need to do book-keeping after * a ptrace attach. diff --git a/kernel/signal.c b/kernel/signal.c index b5f55ca1f43..589292f3853 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1873,21 +1873,26 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) recalc_sigpending_tsk(current); } -void ptrace_notify(int exit_code) +static void ptrace_do_notify(int signr, int exit_code, int why) { siginfo_t info; - BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - memset(&info, 0, sizeof info); - info.si_signo = SIGTRAP; + info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); info.si_uid = current_uid(); /* Let the debugger run. */ + ptrace_stop(exit_code, why, 1, &info); +} + +void ptrace_notify(int exit_code) +{ + BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); + spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); + ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); spin_unlock_irq(¤t->sighand->siglock); } @@ -2017,7 +2022,13 @@ static bool do_signal_stop(int signr) /** * do_jobctl_trap - take care of ptrace jobctl traps * - * It is currently used only to trap for group stop while ptraced. + * When PT_SEIZED, it's used for both group stop and explicit + * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with + * accompanying siginfo. If stopped, lower eight bits of exit_code contain + * the stop signal; otherwise, %SIGTRAP. + * + * When !PT_SEIZED, it's used only for group stop trap with stop signal + * number as exit_code and no siginfo. 
* * CONTEXT: * Must be called with @current->sighand->siglock held, which may be @@ -2025,11 +2036,21 @@ static bool do_signal_stop(int signr) */ static void do_jobctl_trap(void) { + struct signal_struct *signal = current->signal; int signr = current->jobctl & JOBCTL_STOP_SIGMASK; - WARN_ON_ONCE(!signr); - ptrace_stop(signr, CLD_STOPPED, 0, NULL); - current->exit_code = 0; + if (current->ptrace & PT_SEIZED) { + if (!signal->group_stop_count && + !(signal->flags & SIGNAL_STOP_STOPPED)) + signr = SIGTRAP; + WARN_ON_ONCE(!signr); + ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), + CLD_STOPPED); + } else { + WARN_ON_ONCE(!signr); + ptrace_stop(signr, CLD_STOPPED, 0, NULL); + current->exit_code = 0; + } } static int ptrace_signal(int signr, siginfo_t *info, -- cgit v1.2.3-70-g09d2 From fca26f260c528ee51a2e451b5b200aeb528f3e09 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Jun 2011 11:20:16 +0200 Subject: ptrace: implement PTRACE_INTERRUPT Currently, there's no way to trap a running ptracee short of sending a signal which has various side effects. This patch implements PTRACE_INTERRUPT which traps ptracee without any signal or job control related side effect. The implementation is almost trivial. It uses the group stop trap - SIGTRAP | PTRACE_EVENT_STOP << 8. A new trap flag JOBCTL_TRAP_INTERRUPT is added, which is set on PTRACE_INTERRUPT and cleared when any trap happens. As INTERRUPT should be useable regardless of the current state of tracee, task_is_traced() test in ptrace_check_attach() is skipped for INTERRUPT. PTRACE_INTERRUPT is available iff tracee is attached with PTRACE_SEIZE. Test program follows. #define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts100ms = { .tv_nsec = 100000000 }; static const struct timespec ts1s = { .tv_sec = 1 }; static const struct timespec ts3s = { .tv_sec = 3 }; int main(int argc, char **argv) { pid_t tracee; tracee = fork(); if (tracee == 0) { nanosleep(&ts100ms, NULL); while (1) { printf("tracee: alive pid=%d\n", getpid()); nanosleep(&ts1s, NULL); } } if (argc > 1) kill(tracee, SIGSTOP); nanosleep(&ts100ms, NULL); ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); if (argc > 1) { waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, NULL); } nanosleep(&ts3s, NULL); printf("tracer: INTERRUPT and DETACH\n"); ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL); waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_DETACH, tracee, NULL, NULL); nanosleep(&ts3s, NULL); printf("tracer: exiting\n"); kill(tracee, SIGKILL); return 0; } When called without argument, tracee is seized from running state, interrupted and then detached back to running state. # ./test-interrupt tracee: alive pid=4546 tracee: alive pid=4546 tracee: alive pid=4546 tracer: INTERRUPT and DETACH tracee: alive pid=4546 tracee: alive pid=4546 tracee: alive pid=4546 tracer: exiting When called with argument, tracee is seized from stopped state, continued, interrupted and then detached back to stopped state. # ./test-interrupt 1 tracee: alive pid=4548 tracee: alive pid=4548 tracee: alive pid=4548 tracer: INTERRUPT and DETACH tracer: exiting Before PTRACE_INTERRUPT, once the tracee was running, there was no way to trap tracee and do PTRACE_DETACH without causing side effect. -v2: Updated to use task_set_jobctl_pending() so that it doesn't end up scheduling TRAP_STOP if child is dying which may make the child unkillable. Spotted by Oleg. 
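Condensed from the test program above, the minimal tracer-side sequence is (hedged sketch, error handling omitted):

	/* trap a running, SEIZE'd tracee without any signal side effect */
	ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
	waitid(P_PID, tracee, NULL, WSTOPPED);	/* tracee is now in STOP trap */
	/* ... inspect tracee state ... */
	ptrace(PTRACE_DETACH, tracee, NULL, NULL);	/* or PTRACE_CONT */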
Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- include/linux/ptrace.h | 1 + kernel/ptrace.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 67ad3f15232..ad754d1e0b1 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -48,6 +48,7 @@ #define PTRACE_SETREGSET 0x4205 #define PTRACE_SEIZE 0x4206 +#define PTRACE_INTERRUPT 0x4207 /* flags in @data for PTRACE_SEIZE */ #define PTRACE_SEIZE_DEVEL 0x80000000 /* temp flag for development */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dcf9f974198..6852c0f4a91 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -658,10 +658,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) { + bool seized = child->ptrace & PT_SEIZED; int ret = -EIO; siginfo_t siginfo; void __user *datavp = (void __user *) data; unsigned long __user *datalp = datavp; + unsigned long flags; switch (request) { case PTRACE_PEEKTEXT: @@ -694,6 +696,27 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_INTERRUPT: + /* + * Stop tracee without any side-effect on signal or job + * control. At least one trap is guaranteed to happen + * after this request. If @child is already trapped, the + * current trap is not disturbed and another trap will + * happen after the current trap is ended with PTRACE_CONT. + * + * The actual trap might not be PTRACE_EVENT_STOP trap but + * the pending condition is cleared regardless. + */ + if (unlikely(!seized || !lock_task_sighand(child, &flags))) + break; + + if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) + signal_wake_up(child, 0); + + unlock_task_sighand(child, &flags); + ret = 0; + break; + case PTRACE_DETACH: /* detach a process that was attached. */ ret = ptrace_detach(child, data); break; @@ -819,7 +842,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); + ret = ptrace_check_attach(child, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); if (ret < 0) goto out_put_task_struct; @@ -961,7 +985,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, goto out_put_task_struct; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); + ret = ptrace_check_attach(child, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); if (!ret) ret = compat_arch_ptrace(child, request, addr, data); -- cgit v1.2.3-70-g09d2 From fb1d910c178ba0c5bc32d3e5a9e82e05b7aad3cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Jun 2011 11:20:17 +0200 Subject: ptrace: implement TRAP_NOTIFY and use it for group stop events Currently there's no way for ptracer to find out whether group stop finished other than polling with INTERRUPT - GETSIGINFO - CONT sequence. This patch implements group stop notification for ptracer using STOP traps. When group stop state of a seized tracee changes, JOBCTL_TRAP_NOTIFY is set, which schedules a STOP trap which is sticky - it isn't cleared by other traps and at least one STOP trap will happen eventually. STOP trap is synchronization point for event notification and the tracer can determine the current group stop state by looking at the signal number portion of exit code (si_status from waitid(2) or si_code from PTRACE_GETSIGINFO). 
Notifications are generated both on start and end of group stops but, because group stop participation always happens before STOP trap, this doesn't cause an extra trap while tracee is participating in group stop. The symmetry will be useful later. Note that this notification works iff tracee is not trapped. Currently there is no way to be notified of group stop state changes while tracee is trapped. This will be addressed by a later patch. An example program follows. #define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts1s = { .tv_sec = 1 }; int main(int argc, char **argv) { pid_t tracee, tracer; int i; tracee = fork(); if (!tracee) while (1) pause(); tracer = fork(); if (!tracer) { siginfo_t si; ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL); repeat: waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si); if (!si.si_code) { printf("tracer: SIG %d\n", si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, (void *)(unsigned long)si.si_signo); goto repeat; } printf("tracer: stopped=%d signo=%d\n", si.si_signo != SIGTRAP, si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, NULL); goto repeat; } for (i = 0; i < 3; i++) { nanosleep(&ts1s, NULL); printf("mother: SIGSTOP\n"); kill(tracee, SIGSTOP); nanosleep(&ts1s, NULL); printf("mother: SIGCONT\n"); kill(tracee, SIGCONT); } nanosleep(&ts1s, NULL); kill(tracer, SIGKILL); kill(tracee, SIGKILL); return 0; } In the above program, tracer keeps tracee running and gets notification of each group stop state changes. # ./test-notify tracer: stopped=0 signo=5 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- include/linux/sched.h | 4 +++- kernel/signal.c | 38 +++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8bd84b83a35..1854def284f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1811,15 +1811,17 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ +#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) +#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -#define JOBCTL_TRAP_MASK JOBCTL_TRAP_STOP +#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, diff --git a/kernel/signal.c b/kernel/signal.c index 589292f3853..06177e2b391 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -817,6 +817,30 @@ static int 
check_kill_permission(int sig, struct siginfo *info, return security_task_kill(t, info, sig, 0); } +/** + * ptrace_trap_notify - schedule trap to notify ptracer + * @t: tracee wanting to notify tracer + * + * This function schedules sticky ptrace trap which is cleared on the next + * TRAP_STOP to notify ptracer of an event. @t must have been seized by + * ptracer. + * + * If @t is running, STOP trap will be taken. If already trapped, STOP + * trap will be eventually taken without returning to userland after the + * existing traps are finished by PTRACE_CONT. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static void ptrace_trap_notify(struct task_struct *t) +{ + WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); + assert_spin_locked(&t->sighand->siglock); + + task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); + signal_wake_up(t, 0); +} + /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation @@ -855,7 +879,10 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) do { task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - wake_up_state(t, __TASK_STOPPED); + if (likely(!(t->ptrace & PT_SEIZED))) + wake_up_state(t, __TASK_STOPPED); + else + ptrace_trap_notify(t); } while_each_thread(p, t); /* @@ -1797,8 +1824,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) gstop_done = task_participate_group_stop(current); - /* any trap clears pending STOP trap */ + /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); + if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) + task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); /* entering a trap, clear TRAPPING */ task_clear_jobctl_trapping(current); @@ -1972,7 +2001,10 @@ static bool do_signal_stop(int signr) if (!task_is_stopped(t) && task_set_jobctl_pending(t, signr | gstop)) { sig->group_stop_count++; - signal_wake_up(t, 0); + if (likely(!(t->ptrace & PT_SEIZED))) + signal_wake_up(t, 0); + else + ptrace_trap_notify(t); } } } -- cgit v1.2.3-70-g09d2 From 544b2c91a9f14f9565af1972203438b7f49afd48 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Jun 2011 11:20:18 +0200 Subject: ptrace: implement PTRACE_LISTEN The previous patch implemented async notification for ptrace but it only worked while the tracee is running. This patch introduces PTRACE_LISTEN, which was suggested by Oleg Nesterov. It's allowed iff tracee is in STOP trap and puts tracee into quasi-running state - tracee never really runs but wait(2) and ptrace(2) consider it to be running. While ptracer is listening, tracee is allowed to re-enter STOP to notify an async event. Listening state is cleared on the first notification. Ptracer can also clear it by issuing INTERRUPT - tracee will re-trap into STOP with listening state cleared. This allows ptracer to monitor group stop state without running tracee - use INTERRUPT to put tracee into STOP trap, issue LISTEN and then wait(2) to wait for the next group stop event. When it happens, PTRACE_GETSIGINFO provides information to determine the current state. Test program follows.
#define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 #define PTRACE_LISTEN 0x4208 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts1s = { .tv_sec = 1 }; int main(int argc, char **argv) { pid_t tracee, tracer; int i; tracee = fork(); if (!tracee) while (1) pause(); tracer = fork(); if (!tracer) { siginfo_t si; ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL); repeat: waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si); if (!si.si_code) { printf("tracer: SIG %d\n", si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, (void *)(unsigned long)si.si_signo); goto repeat; } printf("tracer: stopped=%d signo=%d\n", si.si_signo != SIGTRAP, si.si_signo); if (si.si_signo != SIGTRAP) ptrace(PTRACE_LISTEN, tracee, NULL, NULL); else ptrace(PTRACE_CONT, tracee, NULL, NULL); goto repeat; } for (i = 0; i < 3; i++) { nanosleep(&ts1s, NULL); printf("mother: SIGSTOP\n"); kill(tracee, SIGSTOP); nanosleep(&ts1s, NULL); printf("mother: SIGCONT\n"); kill(tracee, SIGCONT); } nanosleep(&ts1s, NULL); kill(tracer, SIGKILL); kill(tracee, SIGKILL); return 0; } This is identical to the program to test TRAP_NOTIFY except that tracee is PTRACE_LISTEN'd instead of PTRACE_CONT'd when group stopped. This allows ptracer to monitor when group stop ends without running tracee. # ./test-listen tracer: stopped=0 signo=5 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 -v2: Moved JOBCTL_LISTENING check in wait_task_stopped() into task_stopped_code() as suggested by Oleg. 
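Condensed from the program above, the tracer's monitoring loop looks roughly like this (hedged sketch):

	/* observe group stop transitions without ever resuming the tracee */
	siginfo_t si;
	waitid(P_PID, tracee, NULL, WSTOPPED);
	ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si);
	if (si.si_signo != SIGTRAP)
		/* group-stopped: LISTEN re-traps on the next state change */
		ptrace(PTRACE_LISTEN, tracee, NULL, NULL);
	else
		ptrace(PTRACE_CONT, tracee, NULL, NULL);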
Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- include/linux/ptrace.h | 1 + include/linux/sched.h | 2 ++ kernel/exit.c | 3 ++- kernel/ptrace.c | 42 +++++++++++++++++++++++++++++++++++++++--- kernel/signal.c | 13 +++++++++---- 5 files changed, 53 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index ad754d1e0b1..4f224f16952 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -49,6 +49,7 @@ #define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 +#define PTRACE_LISTEN 0x4208 /* flags in @data for PTRACE_SEIZE */ #define PTRACE_SEIZE_DEVEL 0x80000000 /* temp flag for development */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1854def284f..87f7ca7ed6f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1813,6 +1813,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ +#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) @@ -1820,6 +1821,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) +#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/kernel/exit.c b/kernel/exit.c index 20a40647152..289f59d686b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1368,7 +1368,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p)) + if (task_is_stopped_or_traced(p) && + !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6852c0f4a91..e18966c1c0d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -146,7 +146,8 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) */ spin_lock_irq(&child->sighand->siglock); WARN_ON_ONCE(task_is_stopped(child)); - if (task_is_traced(child) || ignore_state) + if (ignore_state || (task_is_traced(child) && + !(child->jobctl & JOBCTL_LISTENING))) ret = 0; spin_unlock_irq(&child->sighand->siglock); } @@ -660,7 +661,7 @@ int ptrace_request(struct task_struct *child, long request, { bool seized = child->ptrace & PT_SEIZED; int ret = -EIO; - siginfo_t siginfo; + siginfo_t siginfo, *si; void __user *datavp = (void __user *) data; unsigned long __user *datalp = datavp; unsigned long flags; @@ -710,8 +711,43 @@ int ptrace_request(struct task_struct *child, long request, if (unlikely(!seized || !lock_task_sighand(child, &flags))) break; + /* + * INTERRUPT doesn't disturb existing trap sans one + * exception. If ptracer issued LISTEN for the current + * STOP, this INTERRUPT should clear LISTEN and re-trap + * tracee into STOP. 
+ */ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) - signal_wake_up(child, 0); + signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + + unlock_task_sighand(child, &flags); + ret = 0; + break; + + case PTRACE_LISTEN: + /* + * Listen for events. Tracee must be in STOP. It's not + * resumed per-se but is not considered to be in TRACED by + * wait(2) or ptrace(2). If an async event (e.g. group + * stop state change) happens, tracee will enter STOP trap + * again. Alternatively, ptracer can issue INTERRUPT to + * finish listening and re-trap tracee into STOP. + */ + if (unlikely(!seized || !lock_task_sighand(child, &flags))) + break; + + si = child->last_siginfo; + if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) + break; + + child->jobctl |= JOBCTL_LISTENING; + + /* + * If NOTIFY is set, it means event happened between start + * of this trap and now. Trigger re-trap immediately. + */ + if (child->jobctl & JOBCTL_TRAP_NOTIFY) + signal_wake_up(child, true); unlock_task_sighand(child, &flags); ret = 0; diff --git a/kernel/signal.c b/kernel/signal.c index 06177e2b391..97e575a3387 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -825,9 +825,11 @@ static int check_kill_permission(int sig, struct siginfo *info, * TRAP_STOP to notify ptracer of an event. @t must have been seized by * ptracer. * - * If @t is running, STOP trap will be taken. If already trapped, STOP - * trap will be eventually taken without returning to userland after the - * existing traps are finished by PTRACE_CONT. + * If @t is running, STOP trap will be taken. If trapped for STOP and + * ptracer is listening for events, tracee is woken up so that it can + * re-trap for the new event. If trapped otherwise, STOP trap will be + * eventually taken without returning to userland after the existing traps + * are finished by PTRACE_CONT. * * CONTEXT: * Must be called with @task->sighand->siglock held. @@ -838,7 +840,7 @@ static void ptrace_trap_notify(struct task_struct *t) assert_spin_locked(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); - signal_wake_up(t, 0); + signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* @@ -1894,6 +1896,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) spin_lock_irq(¤t->sighand->siglock); current->last_siginfo = NULL; + /* LISTENING can be set only during STOP traps, clear it */ + current->jobctl &= ~JOBCTL_LISTENING; + /* * Queued signals ignored us while we were stopped for tracing. * So check for any that we should take before resuming user mode. -- cgit v1.2.3-70-g09d2 From cb5de2f8d0306be38f9b377b8a5c56acca7dbc3d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 1 Jun 2011 18:18:09 -0700 Subject: time: Catch invalid timespec sleep values in __timekeeping_inject_sleeptime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arve suggested making sure we catch possible negative sleep time intervals that could be passed into timekeeping_inject_sleeptime. 
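For reference, timespec_valid() from include/linux/time.h of that era (paraphrased, not an exact quote) is what makes a negative interval detectable:

	/* paraphrased sketch: non-negative seconds, nanoseconds within a second */
	static inline bool timespec_valid(const struct timespec *ts)
	{
		if (ts->tv_sec < 0)
			return false;
		if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
			return false;
		return true;
	}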
CC: Arve Hjønnevåg CC: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 342408cf68d..9d09777a213 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -604,6 +604,12 @@ static struct timespec timekeeping_suspend_time; */ static void __timekeeping_inject_sleeptime(struct timespec *delta) { + if (!timespec_valid(delta)) { + printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); + return; + } + xtime = timespec_add(xtime, *delta); wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); total_sleep_time = timespec_add(total_sleep_time, *delta); -- cgit v1.2.3-70-g09d2 From cb33217b1b2523895eb328a0b13fb3b1c4000969 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 31 May 2011 22:53:23 -0700 Subject: time: Avoid accumulating time drift in suspend/resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because the read_persistent_clock interface is usually backed by only a second-granular interface, each time we read from the persistent clock for suspend/resume, we introduce a half second (on average) of error. In order to avoid this error accumulating as the system is suspended over and over, this patch measures the time delta between the persistent clock and the system CLOCK_REALTIME. If the delta is less than 2 seconds from the last suspend, we compensate by using the previous time delta (keeping it close). If it is larger than 2 seconds, we assume the clock was set or has been changed, so we do no correction and update the delta. Note: If NTP is running, this could seem to "fight" with the NTP corrected time, whereas if the system time was off by 1 second, and NTP slewed the value in, a suspend/resume cycle could undo this correction by trying to restore the previous offset from the persistent clock. However, without this patch, since each read could cause almost a full second worth of error, it's possible to get almost 2 seconds of error just from the suspend/resume cycle alone, so this is about equal to any offset added by the compensation. Further, on systems that suspend/resume frequently, this should keep time closer than NTP could compensate for if the errors were allowed to accumulate. Credits to Arve Hjønnevåg for suggesting this solution. CC: Arve Hjønnevåg CC: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9d09777a213..fdc6b887b20 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -692,12 +692,34 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { unsigned long flags; + struct timespec delta, delta_delta; + static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); timekeeping_suspended = 1; + + /* + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant.
+ */ + delta = timespec_sub(xtime, timekeeping_suspend_time); + delta_delta = timespec_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occurred and set old_delta to the current delta. + */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec_add(timekeeping_suspend_time, delta_delta); + } write_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); -- cgit v1.2.3-70-g09d2 From 4f2a8d3cf5e0486fd547633fa86c5d130ae98cad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jun 2011 11:20:09 +0200 Subject: printk: Fix console_sem vs logbuf_lock unlock race Fix up the fallout from commit 0b5e1c5255 ("printk: Release console_sem after logbuf_lock"). The reason for unlocking the console_sem under the logbuf_lock is that a concurrent printk() might fill up the buffer but fail to acquire the console sem, resulting in a missed write to the console until a subsequent console_sem acquire/release cycle. Signed-off-by: Peter Zijlstra Cc: efault@gmx.de Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1308734409.1022.14.camel@twins Signed-off-by: Ingo Molnar --- kernel/printk.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 751e7b84e9e..37dff3429ad 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1244,7 +1244,7 @@ void console_unlock(void) { unsigned long flags; unsigned _con_start, _log_end; - unsigned wake_klogd = 0; + unsigned wake_klogd = 0, retry = 0; if (console_suspended) { up(&console_sem); @@ -1253,6 +1253,7 @@ void console_unlock(void) console_may_schedule = 0; +again: for ( ; ; ) { spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; @@ -1273,8 +1274,23 @@ void console_unlock(void) if (unlikely(exclusive_console)) exclusive_console = NULL; - spin_unlock_irqrestore(&logbuf_lock, flags); + spin_unlock(&logbuf_lock); + up(&console_sem); + + /* + * Someone could have filled up the buffer again, so re-check if there's + * something to flush. In case we cannot trylock the console_sem again, + * there's a new owner and the console_unlock() from them will do the + * flush, no worries. + */ + spin_lock(&logbuf_lock); + if (con_start != log_end) + retry = 1; + spin_unlock_irqrestore(&logbuf_lock, flags); + if (retry && console_trylock()) + goto again; + if (wake_klogd) wake_up_klogd(); } -- cgit v1.2.3-70-g09d2 From dd4e5d3ac4a76b868daf30e35bd572def96c30ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2011 17:17:27 +0200 Subject: lockdep: Fix trace_[soft,hard]irqs_[on,off]() recursion Commit: 1efc5da3cf56: [PATCH] order of lockdep off/on in vprintk() should be changed explains the reason for having raw_local_irq_*() and lockdep_off() in printk(). Instead of working around the broken recursion detection of interrupt state tracking, fix it.
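Reduced to a standalone sketch, the fix has this shape: instead of callers such as printk() disabling lockdep around their own irq-state changes, each tracing hook sets a per-task recursion flag and bails out on nested entry. The names below are stand-ins (the kernel uses current->lockdep_recursion), not the real implementation:

/* Stand-in for current->lockdep_recursion; one flag per task. */
static __thread int lockdep_recursion;

static void mark_hardirqs_on(unsigned long ip)
{
	/* ... the real state bookkeeping, which may itself hit traced paths ... */
	(void)ip;
}

void trace_hardirqs_on_hook(unsigned long ip)
{
	if (lockdep_recursion)
		return;			/* nested entry: ignore rather than recurse */

	lockdep_recursion = 1;		/* close the door behind us */
	mark_hardirqs_on(ip);
	lockdep_recursion = 0;
}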
Signed-off-by: Peter Zijlstra Cc: efault@gmx.de Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110621153806.185242734@chello.nl Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 63437d065ac..81968a065b4 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2478,15 +2478,10 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(unsigned long ip) { struct task_struct *curr = current; - time_hardirqs_on(CALLER_ADDR0, ip); - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; @@ -2502,8 +2497,6 @@ void trace_hardirqs_on_caller(unsigned long ip) /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; /* @@ -2525,6 +2518,21 @@ void trace_hardirqs_on_caller(unsigned long ip) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_on_events); } + +void trace_hardirqs_on_caller(unsigned long ip) +{ + time_hardirqs_on(CALLER_ADDR0, ip); + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + current->lockdep_recursion = 1; + __trace_hardirqs_on_caller(ip); + current->lockdep_recursion = 0; +} EXPORT_SYMBOL(trace_hardirqs_on_caller); void trace_hardirqs_on(void) @@ -2574,7 +2582,7 @@ void trace_softirqs_on(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks)) + if (unlikely(!debug_locks || current->lockdep_recursion)) return; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) @@ -2585,6 +2593,7 @@ void trace_softirqs_on(unsigned long ip) return; } + current->lockdep_recursion = 1; /* * We'll do an OFF -> ON transition: */ @@ -2599,6 +2608,7 @@ void trace_softirqs_on(unsigned long ip) */ if (curr->hardirqs_enabled) mark_held_locks(curr, SOFTIRQ); + current->lockdep_recursion = 0; } /* @@ -2608,7 +2618,7 @@ void trace_softirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks)) + if (unlikely(!debug_locks || current->lockdep_recursion)) return; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) -- cgit v1.2.3-70-g09d2 From d21142ece414ce1088cfcae760689aa60d6fee80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Jun 2011 16:50:34 +0200 Subject: ptrace: kill task_ptrace() task_ptrace(task) simply dereferences task->ptrace and isn't even used consistently, only adding confusion. Kill it and directly access ->ptrace instead. This doesn't introduce any behavior change.
Signed-off-by: Tejun Heo Signed-off-by: Oleg Nesterov --- include/linux/ptrace.h | 11 ----------- include/linux/tracehook.h | 16 ++++++++-------- kernel/exit.c | 8 ++++---- kernel/signal.c | 14 +++++++------- mm/oom_kill.c | 3 +-- 5 files changed, 20 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 4f224f16952..3ff20b32259 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -145,17 +145,6 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, unsigned long data); -/** - * task_ptrace - return %PT_* flags that apply to a task - * @task: pointer to &task_struct in question - * - * Returns the %PT_* flags that apply to @task. - */ -static inline int task_ptrace(struct task_struct *task) -{ - return task->ptrace; -} - /** * ptrace_event - possibly stop for a ptrace event notification * @mask: %PT_* bit to check in @current->ptrace diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 15745cdd32c..a3e838784f4 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -63,7 +63,7 @@ struct linux_binprm; */ static inline int tracehook_expect_breakpoints(struct task_struct *task) { - return (task_ptrace(task) & PT_PTRACED) != 0; + return (task->ptrace & PT_PTRACED) != 0; } /* @@ -71,7 +71,7 @@ static inline int tracehook_expect_breakpoints(struct task_struct *task) */ static inline void ptrace_report_syscall(struct pt_regs *regs) { - int ptrace = task_ptrace(current); + int ptrace = current->ptrace; if (!(ptrace & PT_PTRACED)) return; @@ -155,7 +155,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) static inline int tracehook_unsafe_exec(struct task_struct *task) { int unsafe = 0; - int ptrace = task_ptrace(task); + int ptrace = task->ptrace; if (ptrace & PT_PTRACED) { if (ptrace & PT_PTRACE_CAP) unsafe |= LSM_UNSAFE_PTRACE_CAP; @@ -178,7 +178,7 @@ static inline int tracehook_unsafe_exec(struct task_struct *task) */ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk) { - if (task_ptrace(tsk) & PT_PTRACED) + if (tsk->ptrace & PT_PTRACED) return rcu_dereference(tsk->parent); return NULL; } @@ -202,7 +202,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, struct pt_regs *regs) { if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) && - unlikely(task_ptrace(current) & PT_PTRACED)) + unlikely(current->ptrace & PT_PTRACED)) send_sig(SIGTRAP, current, 0); } @@ -285,7 +285,7 @@ static inline void tracehook_report_clone(struct pt_regs *regs, unsigned long clone_flags, pid_t pid, struct task_struct *child) { - if (unlikely(task_ptrace(child))) { + if (unlikely(child->ptrace)) { /* * It doesn't matter who attached/attaching to this * task, the pending SIGSTOP is right in any case. 
@@ -403,7 +403,7 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, static inline int tracehook_consider_ignored_signal(struct task_struct *task, int sig) { - return (task_ptrace(task) & PT_PTRACED) != 0; + return (task->ptrace & PT_PTRACED) != 0; } /** @@ -422,7 +422,7 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, static inline int tracehook_consider_fatal_signal(struct task_struct *task, int sig) { - return (task_ptrace(task) & PT_PTRACED) != 0; + return (task->ptrace & PT_PTRACED) != 0; } #define DEATH_REAP -1 diff --git a/kernel/exit.c b/kernel/exit.c index 289f59d686b..e5cc0564460 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -765,7 +765,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ - if (!task_ptrace(p) && + if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { do_notify_parent(p, p->exit_signal); if (task_detached(p)) { @@ -795,7 +795,7 @@ static void forget_original_parent(struct task_struct *father) do { t->real_parent = reaper; if (t->parent == father) { - BUG_ON(task_ptrace(t)); + BUG_ON(t->ptrace); t->parent = t->real_parent; } if (t->pdeath_signal) @@ -1565,7 +1565,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Notification and reaping will be cascaded to the real * parent when the ptracer detaches. */ - if (likely(!ptrace) && unlikely(task_ptrace(p))) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* it will become visible, clear notask_error */ wo->notask_error = 0; return 0; @@ -1608,7 +1608,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * own children, it should create a separate process which * takes the role of real parent. */ - if (likely(!ptrace) && task_ptrace(p) && + if (likely(!ptrace) && p->ptrace && same_thread_group(p->parent, p->real_parent)) return 0; diff --git a/kernel/signal.c b/kernel/signal.c index 97e575a3387..0f337087250 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1592,7 +1592,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) /* do_notify_parent_cldstop should have been called instead. */ BUG_ON(task_is_stopped_or_traced(tsk)); - BUG_ON(!task_ptrace(tsk) && + BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); info.si_signo = sig; @@ -1631,7 +1631,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); - if (!task_ptrace(tsk) && sig == SIGCHLD && + if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* @@ -1731,7 +1731,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, static inline int may_ptrace_stop(void) { - if (!likely(task_ptrace(current))) + if (!likely(current->ptrace)) return 0; /* * Are we in the middle of do_coredump? 
@@ -1989,7 +1989,7 @@ static bool do_signal_stop(int signr) if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; else - WARN_ON_ONCE(!task_ptrace(current)); + WARN_ON_ONCE(!current->ptrace); sig->group_stop_count = 0; @@ -2014,7 +2014,7 @@ static bool do_signal_stop(int signr) } } - if (likely(!task_ptrace(current))) { + if (likely(!current->ptrace)) { int notify = 0; /* @@ -2093,7 +2093,7 @@ static void do_jobctl_trap(void) static int ptrace_signal(int signr, siginfo_t *info, struct pt_regs *regs, void *cookie) { - if (!task_ptrace(current)) + if (!current->ptrace) return signr; ptrace_signal_deliver(regs, cookie); @@ -2179,7 +2179,7 @@ relock: do_notify_parent_cldstop(current, false, why); leader = current->group_leader; - if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) + if (leader->ptrace && !real_parent_is_ptracer(leader)) do_notify_parent_cldstop(leader, true, why); read_unlock(&tasklist_lock); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4b0991ca35..b0be989d436 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * then wait for it to finish before killing * some other task unnecessarily. */ - if (!(task_ptrace(p->group_leader) & - PT_TRACE_EXIT)) + if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) return ERR_PTR(-1UL); } } -- cgit v1.2.3-70-g09d2 From a288eecce5253cc1565d400a52b9b476a157e040 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Jun 2011 16:50:37 +0200 Subject: ptrace: kill trivial tracehooks At this point, tracehooks aren't useful to the mainline kernel and mostly just add an extra layer of obfuscation. Although they have comments, without actual in-kernel users, it is difficult to tell what their assumptions are and what they're actually trying to achieve. To the mainline kernel, they just aren't worth keeping around. This patch kills the following trivial tracehooks. * Ones testing whether a task is ptraced. Replace with ->ptrace test. tracehook_expect_breakpoints() tracehook_consider_ignored_signal() tracehook_consider_fatal_signal() * ptrace_event() wrappers. Call directly. tracehook_report_exec() tracehook_report_exit() tracehook_report_vfork_done() * ptrace_release_task() wrapper. Call directly. tracehook_finish_release_task() * noop tracehook_prepare_release_task() tracehook_report_death() This doesn't introduce any behavior change.
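The flavor of the first group of conversions, as a compilable toy (the struct is a stand-in; PT_PTRACED matches the kernel's value): the wrappers only ever tested the ptrace state, so the test is now spelled out at the call site, and some PT_PTRACED bit tests are relaxed to "any ->ptrace flags set", which is equivalent for tasks that are actually traced:

#define PT_PTRACED 0x00000001

struct task_stub {
	unsigned int ptrace;		/* stand-in for task_struct::ptrace */
};

/* Old shape: an opaque helper hiding a one-bit test. */
static int consider_fatal_signal_old(const struct task_stub *t)
{
	return (t->ptrace & PT_PTRACED) != 0;
}

/* New shape: the condition is visible at the call site. */
static int consider_fatal_signal_new(const struct task_stub *t)
{
	return t->ptrace != 0;
}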
Signed-off-by: Tejun Heo Cc: Christoph Hellwig Cc: Martin Schwidefsky Signed-off-by: Oleg Nesterov --- arch/s390/kernel/traps.c | 4 +- fs/exec.c | 2 +- include/linux/tracehook.h | 156 ---------------------------------------------- kernel/exit.c | 7 +-- kernel/fork.c | 2 +- kernel/signal.c | 8 +-- mm/nommu.c | 3 +- 7 files changed, 11 insertions(+), 171 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index a65d2e82f61..a63d34c3611 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -331,7 +331,7 @@ void __kprobes do_per_trap(struct pt_regs *regs) { if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP) return; - if (tracehook_consider_fatal_signal(current, SIGTRAP)) + if (current->ptrace) force_sig(SIGTRAP, current); } @@ -425,7 +425,7 @@ static void __kprobes illegal_op(struct pt_regs *regs, long pgm_int_code, if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) return; if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { - if (tracehook_consider_fatal_signal(current, SIGTRAP)) + if (current->ptrace) force_sig(SIGTRAP, current); else signal = SIGILL; diff --git a/fs/exec.c b/fs/exec.c index a9f2b3631bd..b37030d0a50 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1384,7 +1384,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) bprm->recursion_depth = depth; if (retval >= 0) { if (depth == 0) - tracehook_report_exec(fmt, bprm, regs); + ptrace_event(PTRACE_EVENT_EXEC, 0); put_binfmt(fmt); allow_write_access(bprm->file); if (bprm->file) diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3b68aa842a9..8b06d4f2b81 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -51,21 +51,6 @@ #include struct linux_binprm; -/** - * tracehook_expect_breakpoints - guess if task memory might be touched - * @task: current task, making a new mapping - * - * Return nonzero if @task is expected to want breakpoint insertion in - * its memory at some point. A zero return is no guarantee it won't - * be done, but this is a hint that it's known to be likely. - * - * May be called with @task->mm->mmap_sem held for writing. - */ -static inline int tracehook_expect_breakpoints(struct task_struct *task) -{ - return (task->ptrace & PT_PTRACED) != 0; -} - /* * ptrace report for syscall entry and exit looks identical. */ @@ -183,42 +168,6 @@ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk) return NULL; } -/** - * tracehook_report_exec - a successful exec was completed - * @fmt: &struct linux_binfmt that performed the exec - * @bprm: &struct linux_binprm containing exec details - * @regs: user-mode register state - * - * An exec just completed, we are shortly going to return to user mode. - * The freshly initialized register state can be seen and changed in @regs. - * The name, file and other pointers in @bprm are still on hand to be - * inspected, but will be freed as soon as this returns. - * - * Called with no locks, but with some kernel resources held live - * and a reference on @fmt->module. - */ -static inline void tracehook_report_exec(struct linux_binfmt *fmt, - struct linux_binprm *bprm, - struct pt_regs *regs) -{ - ptrace_event(PTRACE_EVENT_EXEC, 0); -} - -/** - * tracehook_report_exit - task has begun to exit - * @exit_code: pointer to value destined for @current->exit_code - * - * @exit_code points to the value passed to do_exit(), which tracing - * might change here. 
This is almost the first thing in do_exit(), - * before freeing any resources or setting the %PF_EXITING flag. - * - * Called with no locks held. - */ -static inline void tracehook_report_exit(long *exit_code) -{ - ptrace_event(PTRACE_EVENT_EXIT, *exit_code); -} - /** * tracehook_prepare_clone - prepare for new child to be cloned * @clone_flags: %CLONE_* flags from clone/fork/vfork system call @@ -319,52 +268,6 @@ static inline void tracehook_report_clone_complete(int trace, ptrace_event(trace, pid); } -/** - * tracehook_report_vfork_done - vfork parent's child has exited or exec'd - * @child: child task, already running - * @pid: new child's PID in the parent's namespace - * - * Called after a %CLONE_VFORK parent has waited for the child to complete. - * The clone/vfork system call will return immediately after this. - * The @child pointer may be invalid if a self-reaping child died and - * tracehook_report_clone() took no action to prevent it from self-reaping. - * - * Called with no locks held. - */ -static inline void tracehook_report_vfork_done(struct task_struct *child, - pid_t pid) -{ - ptrace_event(PTRACE_EVENT_VFORK_DONE, pid); -} - -/** - * tracehook_prepare_release_task - task is being reaped, clean up tracing - * @task: task in %EXIT_DEAD state - * - * This is called in release_task() just before @task gets finally reaped - * and freed. This would be the ideal place to remove and clean up any - * tracing-related state for @task. - * - * Called with no locks held. - */ -static inline void tracehook_prepare_release_task(struct task_struct *task) -{ -} - -/** - * tracehook_finish_release_task - final tracing clean-up - * @task: task in %EXIT_DEAD state - * - * This is called in release_task() when @task is being in the middle of - * being reaped. After this, there must be no tracing entanglements. - * - * Called with write_lock_irq(&tasklist_lock) held. - */ -static inline void tracehook_finish_release_task(struct task_struct *task) -{ - ptrace_release_task(task); -} - /** * tracehook_signal_handler - signal handler setup is complete * @sig: number of signal being delivered @@ -388,41 +291,6 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, ptrace_notify(SIGTRAP); } -/** - * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal - * @task: task receiving the signal - * @sig: signal number being sent - * - * Return zero iff tracing doesn't care to examine this ignored signal, - * so it can short-circuit normal delivery and never even get queued. - * - * Called with @task->sighand->siglock held. - */ -static inline int tracehook_consider_ignored_signal(struct task_struct *task, - int sig) -{ - return (task->ptrace & PT_PTRACED) != 0; -} - -/** - * tracehook_consider_fatal_signal - suppress special handling of fatal signal - * @task: task receiving the signal - * @sig: signal number being sent - * - * Return nonzero to prevent special handling of this termination signal. - * Normally handler for signal is %SIG_DFL. It can be %SIG_IGN if @sig is - * ignored, in which case force_sig() is about to reset it to %SIG_DFL. - * When this returns zero, this signal might cause a quick termination - * that does not give the debugger a chance to intercept the signal. - * - * Called with or without @task->sighand->siglock held. 
- */ -static inline int tracehook_consider_fatal_signal(struct task_struct *task, - int sig) -{ - return (task->ptrace & PT_PTRACED) != 0; -} - #define DEATH_REAP -1 #define DEATH_DELAYED_GROUP_LEADER -2 @@ -457,30 +325,6 @@ static inline int tracehook_notify_death(struct task_struct *task, return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER; } -/** - * tracehook_report_death - task is dead and ready to be reaped - * @task: @current task now exiting - * @signal: return value from tracheook_notify_death() - * @death_cookie: value passed back from tracehook_notify_death() - * @group_dead: nonzero if this was the last thread in the group to die - * - * Thread has just become a zombie or is about to self-reap. If positive, - * @signal is the signal number just sent to the parent (usually %SIGCHLD). - * If @signal is %DEATH_REAP, this thread will self-reap. If @signal is - * %DEATH_DELAYED_GROUP_LEADER, this is a delayed_group_leader() zombie. - * The @death_cookie was passed back by tracehook_notify_death(). - * - * If normal reaping is not inhibited, @task->exit_state might be changing - * in parallel. - * - * Called without locks. - */ -static inline void tracehook_report_death(struct task_struct *task, - int signal, void *death_cookie, - int group_dead) -{ -} - #ifdef TIF_NOTIFY_RESUME /** * set_notify_resume - cause tracehook_notify_resume() to be called diff --git a/kernel/exit.c b/kernel/exit.c index e5cc0564460..d49134a7f25 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -169,7 +169,6 @@ void release_task(struct task_struct * p) struct task_struct *leader; int zap_leader; repeat: - tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); @@ -179,7 +178,7 @@ repeat: proc_flush_task(p); write_lock_irq(&tasklist_lock); - tracehook_finish_release_task(p); + ptrace_release_task(p); __exit_signal(p); /* @@ -868,8 +867,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - tracehook_report_death(tsk, signal, cookie, group_dead); - /* If the process is dead, release it - nobody will wait for it */ if (signal == DEATH_REAP) release_task(tsk); @@ -924,7 +921,7 @@ NORET_TYPE void do_exit(long code) */ set_fs(USER_DS); - tracehook_report_exit(&code); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 0276c30401a..d4f0dff9d61 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1527,7 +1527,7 @@ long do_fork(unsigned long clone_flags, freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - tracehook_report_vfork_done(p, nr); + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); diff --git a/kernel/signal.c b/kernel/signal.c index 0f337087250..1550aee34f4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) /* * Tracers may want to know about even ignored signals. 
*/ - return !tracehook_consider_ignored_signal(t, sig); + return !t->ptrace; } /* @@ -493,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return 1; if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return !tracehook_consider_fatal_signal(tsk, sig); + /* if ptraced, let the tracer determine */ + return !tsk->ptrace; } /* @@ -981,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) if (sig_fatal(p, sig) && !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || - !tracehook_consider_fatal_signal(t, sig))) { + (sig == SIGKILL || !t->ptrace)) { /* * This signal will be fatal to the whole group. */ diff --git a/mm/nommu.c b/mm/nommu.c index 1fd0c51b10a..54ae707bdae 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -22,7 +22,6 @@ #include #include #include -#include <linux/tracehook.h> #include #include #include @@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, * it's being traced - otherwise breakpoints set in it may interfere * with another untraced process */ - if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) + if ((flags & MAP_PRIVATE) && current->ptrace) vm_flags &= ~VM_MAYSHARE; return vm_flags; -- cgit v1.2.3-70-g09d2 From 4b9d33e6d83cc05a8005a8f9a8b9677fa0f53626 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Jun 2011 16:50:38 +0200 Subject: ptrace: kill clone/exec tracehooks At this point, tracehooks aren't useful to the mainline kernel and mostly just add an extra layer of obfuscation. Although they have comments, without actual in-kernel users, it is difficult to tell what their assumptions are and what they're actually trying to achieve. To the mainline kernel, they just aren't worth keeping around. This patch kills the following clone- and exec-related tracehooks. tracehook_prepare_clone() tracehook_finish_clone() tracehook_report_clone() tracehook_report_clone_complete() tracehook_unsafe_exec() The changes are mostly trivial - logic is moved to the caller and comments are merged and adjusted appropriately. The only exception is in check_unsafe_exec() where LSM_UNSAFE_PTRACE* are OR'd to bprm->unsafe instead of setting it, which produces the same result as the field is always zero on entry. It also tests p->ptrace instead of (p->ptrace & PT_PTRACED) for consistency, which also gives the same result. This doesn't introduce any behavior change.
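The event selection that do_fork() now performs inline reads naturally as a small pure function; here is a self-contained sketch (the helper name is invented; the flag and event values match the uapi headers):

#include <signal.h>

#define PTRACE_EVENT_FORK	1	/* values as in <linux/ptrace.h> */
#define PTRACE_EVENT_VFORK	2
#define PTRACE_EVENT_CLONE	3

#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
#define CLONE_VFORK	0x00004000	/* values as in <linux/sched.h> */
#define CLONE_UNTRACED	0x00800000

/* Hypothetical standalone version of the choice made in do_fork(). */
static int ptrace_event_for_clone(unsigned long clone_flags)
{
	if (clone_flags & CLONE_UNTRACED)
		return 0;			/* caller opted out of reporting */
	if (clone_flags & CLONE_VFORK)
		return PTRACE_EVENT_VFORK;
	if ((clone_flags & CSIGNAL) != SIGCHLD)
		return PTRACE_EVENT_CLONE;	/* thread-style clone */
	return PTRACE_EVENT_FORK;
}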
Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Oleg Nesterov --- fs/exec.c | 7 ++- include/linux/tracehook.h | 121 ---------------------------------------------- kernel/fork.c | 41 ++++++++++++---- 3 files changed, 38 insertions(+), 131 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index b37030d0a50..8dca45b0dae 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1224,7 +1224,12 @@ int check_unsafe_exec(struct linux_binprm *bprm) unsigned n_fs; int res = 0; - bprm->unsafe = tracehook_unsafe_exec(p); + if (p->ptrace) { + if (p->ptrace & PT_PTRACE_CAP) + bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; + else + bprm->unsafe |= LSM_UNSAFE_PTRACE; + } n_fs = 1; spin_lock(&p->fs->lock); diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 8b06d4f2b81..bcc4ca762ae 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -129,27 +129,6 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) ptrace_report_syscall(regs); } -/** - * tracehook_unsafe_exec - check for exec declared unsafe due to tracing - * @task: current task doing exec - * - * Return %LSM_UNSAFE_* bits applied to an exec because of tracing. - * - * @task->signal->cred_guard_mutex is held by the caller through the do_execve(). - */ -static inline int tracehook_unsafe_exec(struct task_struct *task) -{ - int unsafe = 0; - int ptrace = task->ptrace; - if (ptrace & PT_PTRACED) { - if (ptrace & PT_PTRACE_CAP) - unsafe |= LSM_UNSAFE_PTRACE_CAP; - else - unsafe |= LSM_UNSAFE_PTRACE; - } - return unsafe; -} - /** * tracehook_tracer_task - return the task that is tracing the given task * @tsk: task to consider @@ -168,106 +147,6 @@ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk) return NULL; } -/** - * tracehook_prepare_clone - prepare for new child to be cloned - * @clone_flags: %CLONE_* flags from clone/fork/vfork system call - * - * This is called before a new user task is to be cloned. - * Its return value will be passed to tracehook_finish_clone(). - * - * Called with no locks held. - */ -static inline int tracehook_prepare_clone(unsigned clone_flags) -{ - int event = 0; - - if (clone_flags & CLONE_UNTRACED) - return 0; - - if (clone_flags & CLONE_VFORK) - event = PTRACE_EVENT_VFORK; - else if ((clone_flags & CSIGNAL) != SIGCHLD) - event = PTRACE_EVENT_CLONE; - else - event = PTRACE_EVENT_FORK; - - return ptrace_event_enabled(current, event) ? event : 0; -} - -/** - * tracehook_finish_clone - new child created and being attached - * @child: new child task - * @clone_flags: %CLONE_* flags from clone/fork/vfork system call - * @trace: return value from tracehook_prepare_clone() - * - * This is called immediately after adding @child to its parent's children list. - * The @trace value is that returned by tracehook_prepare_clone(). - * - * Called with current's siglock and write_lock_irq(&tasklist_lock) held. - */ -static inline void tracehook_finish_clone(struct task_struct *child, - unsigned long clone_flags, int trace) -{ - ptrace_init_task(child, (clone_flags & CLONE_PTRACE) || trace); -} - -/** - * tracehook_report_clone - in parent, new child is about to start running - * @regs: parent's user register state - * @clone_flags: flags from parent's system call - * @pid: new child's PID in the parent's namespace - * @child: new child task - * - * Called after a child is set up, but before it has been started running. - * This is not a good place to block, because the child has not started - * yet. 
Suspend the child here if desired, and then block in - * tracehook_report_clone_complete(). This must prevent the child from - * self-reaping if tracehook_report_clone_complete() uses the @child - * pointer; otherwise it might have died and been released by the time - * tracehook_report_clone_complete() is called. - * - * Called with no locks held, but the child cannot run until this returns. - */ -static inline void tracehook_report_clone(struct pt_regs *regs, - unsigned long clone_flags, - pid_t pid, struct task_struct *child) -{ - if (unlikely(child->ptrace)) { - /* - * It doesn't matter who attached/attaching to this - * task, the pending SIGSTOP is right in any case. - */ - sigaddset(&child->pending.signal, SIGSTOP); - set_tsk_thread_flag(child, TIF_SIGPENDING); - } -} - -/** - * tracehook_report_clone_complete - new child is running - * @trace: return value from tracehook_prepare_clone() - * @regs: parent's user register state - * @clone_flags: flags from parent's system call - * @pid: new child's PID in the parent's namespace - * @child: child task, already running - * - * This is called just after the child has started running. This is - * just before the clone/fork syscall returns, or blocks for vfork - * child completion if @clone_flags has the %CLONE_VFORK bit set. - * The @child pointer may be invalid if a self-reaping child died and - * tracehook_report_clone() took no action to prevent it from self-reaping. - * - * Called with no locks held. - */ -static inline void tracehook_report_clone_complete(int trace, - struct pt_regs *regs, - unsigned long clone_flags, - pid_t pid, - struct task_struct *child) -{ - if (unlikely(trace)) - ptrace_event(trace, pid); -} - /** * tracehook_signal_handler - signal handler setup is complete * @sig: number of signal being delivered diff --git a/kernel/fork.c b/kernel/fork.c index d4f0dff9d61..3c72a5b321a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1340,7 +1340,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - tracehook_finish_clone(p, clone_flags, trace); + ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { if (is_child_reaper(pid)) @@ -1481,10 +1481,22 @@ long do_fork(unsigned long clone_flags, } /* - * When called from kernel_thread, don't do user tracing stuff. + * Determine whether and which event to report to ptracer. When + * called from kernel_thread or CLONE_UNTRACED is explicitly + * requested, no event is reported; otherwise, report if the event + * for the type of forking is enabled. */ - if (likely(user_mode(regs))) - trace = tracehook_prepare_clone(clone_flags); + if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (clone_flags & CLONE_VFORK) + trace = PTRACE_EVENT_VFORK; + else if ((clone_flags & CSIGNAL) != SIGCHLD) + trace = PTRACE_EVENT_CLONE; + else + trace = PTRACE_EVENT_FORK; + + if (likely(!ptrace_event_enabled(current, trace))) + trace = 0; + } p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); @@ -1508,20 +1520,31 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(regs, clone_flags, nr, p); + + /* + * Child is ready but hasn't started running yet. Queue + * SIGSTOP if it's gonna be ptraced - it doesn't matter who + * attached/attaching to this task, the pending SIGSTOP is + * right in any case. 
+ */ + if (unlikely(p->ptrace)) { + sigaddset(&p->pending.signal, SIGSTOP); + set_tsk_thread_flag(p, TIF_SIGPENDING); + } /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that - * hasn't gotten to tracehook_report_clone() yet. Now we - * clear it and set the child going. + * hasn't finished SIGSTOP raising yet. Now we clear it + * and set the child going. */ p->flags &= ~PF_STARTING; wake_up_new_task(p); - tracehook_report_clone_complete(trace, regs, - clone_flags, nr, p); + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) + ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); -- cgit v1.2.3-70-g09d2 From d902db1eb60387040fe541573083e47469db50ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 19:31:56 +0200 Subject: sched: Generalize sleep inside spinlock detection The sleeping inside spinlock detection is actually used for more general sleeping inside atomic sections debugging: preemption disabled, rcu read side critical sections, interrupts, interrupt disabled, etc... Change the name of the config and its help section to reflect its more general role. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Acked-by: Randy Dunlap Cc: Peter Zijlstra Cc: Ingo Molnar --- Documentation/DocBook/kernel-hacking.tmpl | 2 +- Documentation/SubmitChecklist | 2 +- Documentation/development-process/4.Coding | 2 +- Documentation/ja_JP/SubmitChecklist | 2 +- Documentation/zh_CN/SubmitChecklist | 2 +- include/linux/kernel.h | 2 +- kernel/sched.c | 2 +- lib/Kconfig.debug | 8 +++++--- 8 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index 7b3f4936341..07a9c48de5a 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -409,7 +409,7 @@ cond_resched(); /* Will sleep */ You should always compile your kernel - CONFIG_DEBUG_SPINLOCK_SLEEP on, and it will warn + CONFIG_DEBUG_ATOMIC_SLEEP on, and it will warn you if you break these rules. If you do break the rules, you will eventually lock up your box. diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index da0382daa39..7b13be41c08 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist @@ -53,7 +53,7 @@ kernel patches. 12: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, - CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP all simultaneously + CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP all simultaneously enabled. 13: Has been build- and runtime tested with and without CONFIG_SMP and diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding index f3f1a469443..83f5f5b365a 100644 --- a/Documentation/development-process/4.Coding +++ b/Documentation/development-process/4.Coding @@ -244,7 +244,7 @@ testing purposes. In particular, you should turn on: - DEBUG_SLAB can find a variety of memory allocation and use errors; it should be used on most development kernels. - - DEBUG_SPINLOCK, DEBUG_SPINLOCK_SLEEP, and DEBUG_MUTEXES will find a + - DEBUG_SPINLOCK, DEBUG_ATOMIC_SLEEP, and DEBUG_MUTEXES will find a number of common locking errors. 
There are quite a few other debugging options, some of which will be diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist index 2df4576f117..cb5507b1ac8 100644 --- a/Documentation/ja_JP/SubmitChecklist +++ b/Documentation/ja_JP/SubmitChecklist @@ -68,7 +68,7 @@ Linux カーネルパッチ投稿者向けチェックリスト 12: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK, - CONFIG_DEBUG_SPINLOCK_SLEEP これら全てを同時に有効にして動作確認を + CONFIG_DEBUG_ATOMIC_SLEEP これら全てを同時に有効にして動作確認を 行ってください。 13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で diff --git a/Documentation/zh_CN/SubmitChecklist b/Documentation/zh_CN/SubmitChecklist index 951415bbab0..4c741d6bc04 100644 --- a/Documentation/zh_CN/SubmitChecklist +++ b/Documentation/zh_CN/SubmitChecklist @@ -67,7 +67,7 @@ Linux 12：已经通过CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, - CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP测试，并且同时都 + CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP测试，并且同时都 使能。 13：已经都构建并且使用或者不使用 CONFIG_SMP 和 CONFIG_PREEMPT测试执行时间。 diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fb0e7329fee..24b489f6659 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -121,7 +121,7 @@ extern int _cond_resched(void); # define might_resched() do { } while (0) #endif -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep diff --git a/kernel/sched.c b/kernel/sched.c index 90ad7cf2b29..a5f318b8d65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,7 +8018,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a7dd7b547fe..81a4f3302bc 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -648,13 +648,15 @@ config TRACE_IRQFLAGS Enables hooks to interrupt enabling and disabling for either tracing or lock debugging. -config DEBUG_SPINLOCK_SLEEP - bool "Spinlock debugging: sleep-inside-spinlock checking" +config DEBUG_ATOMIC_SLEEP + bool "Sleep inside atomic section checking" select PREEMPT_COUNT depends on DEBUG_KERNEL help If you say Y here, various routines which may sleep will become very - noisy if they are called with a spinlock held. + noisy if they are called inside atomic sections: when a spinlock is + held, inside an rcu read side critical section, inside preempt disabled + sections, inside an interrupt, etc... config DEBUG_LOCKING_API_SELFTESTS bool "Locking API boot-time self-tests" -- cgit v1.2.3-70-g09d2 From 53c8f9f199b239668e6b1a907735ee323a0d1ccd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jun 2011 23:08:18 +0200 Subject: make do_notify_parent() return bool - change do_notify_parent() to return a boolean, true if the task should be reaped because its parent ignores SIGCHLD. - update the only caller which checks the returned value, exit_notify(). This temporarily uglifies exit_notify() even more; it will be cleaned up by the next change.
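A toy model of the new contract (all names here are stand-ins for the kernel's, and the bodies are placeholders): the callee reports whether the child should self-reap, and the caller derives the exit state from that boolean instead of decoding the old magic -1 return value:

#include <stdbool.h>

enum exit_state_stub { EXIT_ZOMBIE_STUB, EXIT_DEAD_STUB };

/* Stand-in for "SIGCHLD handler is SIG_IGN / SA_NOCLDWAIT is set". */
static bool parent_ignores_sigchld;

static bool do_notify_parent_stub(void)
{
	/* ... queue SIGCHLD and wake the parent here ... */
	return parent_ignores_sigchld;	/* true => child should autoreap */
}

static enum exit_state_stub exit_notify_stub(void)
{
	bool autoreap = do_notify_parent_stub();

	return autoreap ? EXIT_DEAD_STUB : EXIT_ZOMBIE_STUB;
}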
Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- include/linux/sched.h | 4 ++-- kernel/exit.c | 9 ++++++--- kernel/signal.c | 17 +++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 87f7ca7ed6f..0df7231d9ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2145,7 +2145,7 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s spin_unlock_irqrestore(&tsk->sighand->siglock, flags); return ret; -} +} extern void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask); @@ -2160,7 +2160,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_ extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern int do_notify_parent(struct task_struct *, int); +extern bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int, struct task_struct *); extern int send_sig(int, struct task_struct *, int); diff --git a/kernel/exit.c b/kernel/exit.c index d49134a7f25..34d135f4fcc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -820,6 +820,7 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk, int group_dead) { int signal; + bool autoreap; void *cookie; /* @@ -858,9 +859,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) signal = tracehook_notify_death(tsk, &cookie, group_dead); if (signal >= 0) - signal = do_notify_parent(tsk, signal); + autoreap = do_notify_parent(tsk, signal); + else + autoreap = (signal == DEATH_REAP); - tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) @@ -868,7 +871,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) write_unlock_irq(&tasklist_lock); /* If the process is dead, release it - nobody will wait for it */ - if (signal == DEATH_REAP) + if (autoreap) release_task(tsk); } diff --git a/kernel/signal.c b/kernel/signal.c index 1550aee34f4..d52e82cd62b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1577,15 +1577,15 @@ ret: * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * - * Returns -1 if our parent ignored us and so we've switched to - * self-reaping, or else @sig. + * Returns true if our parent ignored us and so we've switched to + * self-reaping. */ -int do_notify_parent(struct task_struct *tsk, int sig) +bool do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; struct sighand_struct *psig; - int ret = sig; + bool autoreap = false; BUG_ON(sig == -1); @@ -1649,16 +1649,17 @@ int do_notify_parent(struct task_struct *tsk, int sig) * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). 
*/ - ret = tsk->exit_signal = -1; + autoreap = true; + tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = -1; + sig = 0; } - if (valid_signal(sig) && sig > 0) + if (valid_signal(sig) && sig) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); - return ret; + return autoreap; } /** -- cgit v1.2.3-70-g09d2 From 45cdf5cc0703c537194588c63d53bad1f2539d36 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2011 19:06:50 +0200 Subject: kill tracehook_notify_death() Kill tracehook_notify_death(), reimplement the logic in its caller, exit_notify(). Also, change the exec_id check to use thread_group_leader() instead of task_detached(); this is clearer. This logic only applies to the exiting leader; a sub-thread must never change its exit_signal. Note: when the traced group leader exits, the exit_signal-or-SIGCHLD logic looks really strange: - we notify the tracer even if !thread_group_empty(), but do_wait(WEXITED) can't work until all threads exit - if the tracer is real_parent, it is not clear why we can't use ->exit_signal even if !thread_group_empty() -v2: do not try to fix the 2nd oddity to avoid the subtle behavior change mixed with reorganization, suggested by Tejun. Signed-off-by: Oleg Nesterov Reviewed-by: Tejun Heo --- include/linux/tracehook.h | 34 ---------------------------------- kernel/exit.c | 21 +++++++++++++-------- 2 files changed, 13 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 7a1bd12aeff..a71a2927a6a 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -152,40 +152,6 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, ptrace_notify(SIGTRAP); } -#define DEATH_REAP -1 -#define DEATH_DELAYED_GROUP_LEADER -2 - -/** - * tracehook_notify_death - task is dead, ready to notify parent - * @task: @current task now exiting - * @death_cookie: value to pass to tracehook_report_death() - * @group_dead: nonzero if this was the last thread in the group to die - * - * A return value >= 0 means call do_notify_parent() with that signal - * number. Negative return value can be %DEATH_REAP to self-reap right - * now, or %DEATH_DELAYED_GROUP_LEADER to a zombie without notifying our - * parent. Note that a return value of 0 means a do_notify_parent() call - * that sends no signal, but still wakes up a parent blocked in wait*(). - * - * Called with write_lock_irq(&tasklist_lock) held. - */ -static inline int tracehook_notify_death(struct task_struct *task, - void **death_cookie, int group_dead) -{ - if (task_detached(task)) - return task->ptrace ? SIGCHLD : DEATH_REAP; - - /* - * If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (thread_group_empty(task) && !ptrace_reparented(task)) - return task->exit_signal; - - return task->ptrace ?
SIGCHLD : DEATH_DELAYED_GROUP_LEADER; -} - #ifdef TIF_NOTIFY_RESUME /** * set_notify_resume - cause tracehook_notify_resume() to be called diff --git a/kernel/exit.c b/kernel/exit.c index 34d135f4fcc..bb08e938ca7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -819,9 +819,7 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int signal; bool autoreap; - void *cookie; /* * This does two things: @@ -852,16 +850,23 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * we have changed execution domain as these two values started * the same after a fork. */ - if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && + if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && (tsk->parent_exec_id != tsk->real_parent->self_exec_id || tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; - signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal >= 0) - autoreap = do_notify_parent(tsk, signal); - else - autoreap = (signal == DEATH_REAP); + if (unlikely(tsk->ptrace)) { + int sig = thread_group_leader(tsk) && + thread_group_empty(tsk) && + !ptrace_reparented(tsk) ? + tsk->exit_signal : SIGCHLD; + autoreap = do_notify_parent(tsk, sig); + } else if (thread_group_leader(tsk)) { + autoreap = thread_group_empty(tsk) && + do_notify_parent(tsk, tsk->exit_signal); + } else { + autoreap = true; + } tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; -- cgit v1.2.3-70-g09d2 From 9843a1e977977986d0a4c1000f2229b032572534 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jun 2011 23:08:53 +0200 Subject: __ptrace_detach: avoid task_detached(), check do_notify_parent() __ptrace_detach() relies on the current obscure behaviour of do_notify_parent(tsk), which changes tsk->exit_signal if this child should be silently reaped. That is why we check task_detached(): it is true if the task is a sub-thread, or if it is the group_leader but its exit_signal was changed by do_notify_parent(). This is confusing; change the code to rely on !thread_group_leader() or the value returned by do_notify_parent(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- kernel/ptrace.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e18966c1c0d..66a28bd71ef 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -370,25 +370,28 @@ static int ignoring_children(struct sighand_struct *sigh) */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { + bool dead; + __ptrace_unlink(p); - if (p->exit_state == EXIT_ZOMBIE) { - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { - __wake_up_parent(p, tracer); - p->exit_signal = -1; - } - } - if (task_detached(p)) { - /* Mark it as in the process of being reaped. */ - p->exit_state = EXIT_DEAD; - return true; + if (p->exit_state != EXIT_ZOMBIE) + return false; + + dead = !thread_group_leader(p); + + if (!dead && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + dead = do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); + p->exit_signal = -1; + dead = true; } } - - return false; + /* Mark it as in the process of being reaped.
*/ + if (dead) + p->exit_state = EXIT_DEAD; + return dead; } static int ptrace_detach(struct task_struct *child, unsigned int data) -- cgit v1.2.3-70-g09d2 From 8677347378044ab564470bced2275520efb3670d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jun 2011 23:09:09 +0200 Subject: make do_notify_parent() __must_check, update the callers Change other callers of do_notify_parent() to check the value it returns; this makes the subsequent task_detached() unnecessary. Mark do_notify_parent() as __must_check. Use thread_group_leader() instead of !task_detached() to check if we need to notify the real parent in wait_task_zombie(). Remove the stale comment in release_task(). "just for sanity" is no longer true; we have to set EXIT_DEAD to avoid the races with do_wait(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- include/linux/sched.h | 2 +- kernel/exit.c | 29 ++++++++--------------------- 2 files changed, 9 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0df7231d9ee..0cb4f097f76 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2160,7 +2160,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_ extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern bool do_notify_parent(struct task_struct *, int); +extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int, struct task_struct *); extern int send_sig(int, struct task_struct *, int); diff --git a/kernel/exit.c b/kernel/exit.c index bb08e938ca7..f68d137ffeb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -190,21 +190,12 @@ repeat: leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { BUG_ON(task_detached(leader)); - do_notify_parent(leader, leader->exit_signal); /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. - * - * do_notify_parent() will have marked it self-reaping in - * that case. - */ - zap_leader = task_detached(leader); - - /* - * This maintains the invariant that release_task() - * only runs on a task in EXIT_DEAD, just for sanity. */ + zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } @@ -766,8 +757,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { - do_notify_parent(p, p->exit_signal); - if (task_detached(p)) { + if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_move_tail(&p->sibling, dead); } @@ -1351,16 +1341,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* - * If this is not a detached task, notify the parent. - * If it's still not detached after that, don't release - * it now. + * If this is not a sub-thread, notify the parent. + * If parent wants a zombie, don't release it now.
*/ - if (!task_detached(p)) { - do_notify_parent(p, p->exit_signal); - if (!task_detached(p)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + if (thread_group_leader(p) && + !do_notify_parent(p, p->exit_signal)) { + p->exit_state = EXIT_ZOMBIE; + p = NULL; } write_unlock_irq(&tasklist_lock); } -- cgit v1.2.3-70-g09d2 From 0976a03e5ce8ec346e985f21046d7a75bb7fdffd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jun 2011 23:09:39 +0200 Subject: reparent_leader: check EXIT_DEAD instead of task_detached() Change reparent_leader() to check ->exit_state instead of ->exit_signal; this matches the similar EXIT_DEAD check in wait_consider_task() and allows us to clean up the do_notify_parent/task_detached logic. task_detached() was really needed during reparenting before 9cd80bbb "do_wait() optimization: do not place sub-threads on ->children list" to filter out the sub-threads. After this change task_detached(p) can only be true if p is the dead group_leader and its parent ignores SIGCHLD; in this case the caller of do_notify_parent() is going to reap this task and it should set EXIT_DEAD. Signed-off-by: Oleg Nesterov Reviewed-by: Tejun Heo --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f68d137ffeb..2b1ba8048a1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -742,7 +742,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, { list_move_tail(&p->sibling, &p->real_parent->children); - if (task_detached(p)) + if (p->exit_state == EXIT_DEAD) return; /* * If this is a threaded reparent there is no need to -- cgit v1.2.3-70-g09d2 From e550f14dc6322e794d4e70825f63c9c99177ae8b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jun 2011 23:09:54 +0200 Subject: kill task_detached() Update the last user of task_detached(), wait_task_zombie(), to use thread_group_leader() and kill task_detached(). Signed-off-by: Oleg Nesterov Reviewed-by: Tejun Heo --- include/linux/sched.h | 5 ----- kernel/exit.c | 5 ++--- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0cb4f097f76..39acee2c892 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2318,11 +2318,6 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) -static inline int task_detached(struct task_struct *p) -{ - return p->exit_signal == -1; -} - /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also diff --git a/kernel/exit.c b/kernel/exit.c index 2b1ba8048a1..9fa99702645 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -189,7 +189,6 @@ repeat: zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - BUG_ON(task_detached(leader)); /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -1231,9 +1230,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check - * !task_detached() to filter out sub-threads. + * thread_group_leader() to filter out sub-threads.
*/ - if (likely(!traced) && likely(!task_detached(p))) { + if (likely(!traced) && thread_group_leader(p)) { struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; -- cgit v1.2.3-70-g09d2 From d4f7c511c1c2a67eb287987cf1ce9554149030e6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jun 2011 23:10:11 +0200 Subject: do not change dead_task->exit_signal __ptrace_detach() and do_notify_parent() set task->exit_signal = -1 to mark the task dead. This is no longer needed; nobody checks exit_signal to detect the EXIT_DEAD task. Signed-off-by: Oleg Nesterov Reviewed-by: Tejun Heo --- kernel/ptrace.c | 1 - kernel/signal.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 66a28bd71ef..d7ccc79454f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -384,7 +384,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) dead = do_notify_parent(p, p->exit_signal); else if (ignoring_children(tracer->sighand)) { __wake_up_parent(p, tracer); - p->exit_signal = -1; dead = true; } } diff --git a/kernel/signal.c b/kernel/signal.c index d52e82cd62b..4c4ad34caf7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1650,7 +1650,6 @@ bool do_notify_parent(struct task_struct *tsk, int sig) * it, just use SIG_IGN instead). */ autoreap = true; - tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } -- cgit v1.2.3-70-g09d2 From bb3696da89743d580f869142d0a6e6ba9b7fe89a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Jun 2011 17:34:23 +0200 Subject: ptrace: kill real_parent_is_ptracer() in favor of ptrace_reparented() Kill real_parent_is_ptracer() and update the callers to use ptrace_reparented(); after the previous patch they do the same. Remove the unnecessary ->ptrace != 0 check in get_signal_to_deliver(); if ptrace_reparented() == T, then the task must be ptraced. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- kernel/signal.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4c4ad34caf7..0a1bf2c8bdc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1759,15 +1759,6 @@ static int sigkill_pending(struct task_struct *tsk) sigismember(&tsk->signal->shared_pending.signal, SIGKILL); } -/* - * Test whether the target task of the usual cldstop notification - the - * real_parent of @child - is in the same group as the ptracer. - */ -static bool real_parent_is_ptracer(struct task_struct *child) -{ - return same_thread_group(child->parent, child->real_parent); -} - /* * This must be called with current->sighand->siglock held. * @@ -1848,7 +1839,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * separately unless they're gonna be duplicates. */ do_notify_parent_cldstop(current, true, why); - if (gstop_done && !real_parent_is_ptracer(current)) + if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); /* @@ -2154,7 +2145,6 @@ relock: * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { - struct task_struct *leader; int why; if (signal->flags & SIGNAL_CLD_CONTINUED) @@ -2175,13 +2165,11 @@ relock: * a duplicate.
*/ read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, false, why); - leader = current->group_leader; - if (leader->ptrace && !real_parent_is_ptracer(leader)) - do_notify_parent_cldstop(leader, true, why); - + if (ptrace_reparented(current->group_leader)) + do_notify_parent_cldstop(current->group_leader, + true, why); read_unlock(&tasklist_lock); goto relock; -- cgit v1.2.3-70-g09d2 From 479bf98c1c29b40d86e40a4e6e4944c2f03d9493 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Jun 2011 17:34:39 +0200 Subject: ptrace: wait_consider_task: s/same_thread_group/ptrace_reparented/ wait_consider_task() checks same_thread_group(parent, real_parent); this is the open-coded ptrace_reparented(). __ptrace_detach() remains the only function which has to check this by hand, although we could reorganize the code to delay __ptrace_unlink. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- kernel/exit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9fa99702645..b8d3b47bb88 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1599,8 +1599,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * own children, it should create a separate process which * takes the role of real parent. */ - if (likely(!ptrace) && p->ptrace && - same_thread_group(p->parent, p->real_parent)) + if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) return 0; /* -- cgit v1.2.3-70-g09d2 From 6d3321e8e2b3bf6a5892e2ef673c7bf536e3f904 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Jun 2011 11:19:26 -0700 Subject: x86, mtrr: lock stop machine during MTRR rendezvous sequence MTRR rendezvous sequence using stop_one_cpu_nowait() can potentially happen in parallel with another system wide rendezvous using stop_machine(). This can lead to deadlock (The order in which work items are queued can be different on different cpu's. Some cpu's will be running the first rendezvous handler and others will be running the second rendezvous handler. Each set waits for the other set to join the system wide rendezvous, leading to a deadlock). MTRR rendezvous sequence is not implemented using stop_machine() as this gets called both from the process context as well as the cpu online paths (where the cpu has not come online and the interrupts are disabled etc). stop_machine() works with only online cpus. For now, take the stop_machine mutex in the MTRR rendezvous sequence that gets called from an online cpu (here we are in the process context and can potentially sleep while taking the mutex). And the MTRR rendezvous that gets triggered during cpu online doesn't need to take this stop_machine lock (as the stop_machine() already ensures that there is no cpu hotplug going on in parallel by doing get_online_cpus()). TBD: Pursue a cleaner solution of extending the stop_machine() infrastructure to handle the case where the calling cpu is still not online and use this for MTRR rendezvous sequence. fixes: https://bugzilla.novell.com/show_bug.cgi?id=672008 Reported-by: Vadim Kotelnikov Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/20110623182056.807230326@sbsiddha-MOBL3.sc.intel.com Cc: stable@kernel.org # 2.6.35+, backport a week or two after this gets more testing in mainline Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 23 +++++++++++++++++++++++ include/linux/stop_machine.h | 2 ++ kernel/stop_machine.c | 2 +- 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 929739a653d..3d17bc7f06e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -248,6 +248,25 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ unsigned long flags; int cpu; +#ifdef CONFIG_SMP + /* + * If this cpu is not yet active, we are in the cpu online path. There + * can be no stop_machine() in parallel, as stop machine ensures this + * by using get_online_cpus(). We can skip taking the stop_cpus_mutex, + * as we don't need it and also we can't afford to block while waiting + * for the mutex. + * + * If this cpu is active, we need to prevent stop_machine() happening + * in parallel by taking the stop cpus mutex. + * + * Also, this is called in the context of cpu online path or in the + * context where cpu hotplug is prevented. So checking the active status + * of the raw_smp_processor_id() is safe. + */ + if (cpu_active(raw_smp_processor_id())) + mutex_lock(&stop_cpus_mutex); +#endif + preempt_disable(); data.smp_reg = reg; @@ -330,6 +349,10 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ local_irq_restore(flags); preempt_enable(); +#ifdef CONFIG_SMP + if (cpu_active(raw_smp_processor_id())) + mutex_unlock(&stop_cpus_mutex); +#endif } /** diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 092dc9b1ce7..14d3524d127 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -27,6 +27,8 @@ struct cpu_stop_work { struct cpu_stop_done *done; }; +extern struct mutex stop_cpus_mutex; + int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e3516b29076..0cae1cc323d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -132,8 +132,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); } +DEFINE_MUTEX(stop_cpus_mutex); /* static data for stop_cpus */ -static DEFINE_MUTEX(stop_cpus_mutex); static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) -- cgit v1.2.3-70-g09d2 From fd7355ba1e936487f5aae6fc058c6cb300e44a64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2011 11:19:27 -0700 Subject: stop_machine: reorganize stop_cpus() implementation Refactor the queuing part of the stop cpus work from __stop_cpus() into queue_stop_cpus_work(). The reorganization is to help future improvements to stop_machine() and doesn't introduce any behavior difference. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/20110623182056.897818337@sbsiddha-MOBL3.sc.intel.com Signed-off-by: Suresh Siddha Cc: Ingo Molnar Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: H. 
Peter Anvin --- kernel/stop_machine.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0cae1cc323d..4c89ee9fc56 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -136,10 +136,11 @@ DEFINE_MUTEX(stop_cpus_mutex); /* static data for stop_cpus */ static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); -int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +static void queue_stop_cpus_work(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg, + struct cpu_stop_done *done) { struct cpu_stop_work *work; - struct cpu_stop_done done; unsigned int cpu; /* initialize works and done */ @@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) work = &per_cpu(stop_cpus_work, cpu); work->fn = fn; work->arg = arg; - work->done = &done; + work->done = done; } - cpu_stop_init_done(&done, cpumask_weight(cpumask)); /* * Disable preemption while queueing to avoid getting @@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &per_cpu(stop_cpus_work, cpu)); preempt_enable(); +} +static int __stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_done done; + + cpu_stop_init_done(&done, cpumask_weight(cpumask)); + queue_stop_cpus_work(cpumask, fn, arg, &done); wait_for_completion(&done.completion); return done.executed ? done.ret : -ENOENT; } -- cgit v1.2.3-70-g09d2 From f740e6cd0cb5e7468e46831aeb4d9c30e03d5ebc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2011 11:19:28 -0700 Subject: stop_machine: implement stop_machine_from_inactive_cpu() Currently, mtrr wants stop_machine functionality while a CPU is being brought up. As stop_machine() requires the calling CPU to be active, mtrr implements its own stop_machine using stop_one_cpu() on each online CPU. This not only unnecessarily duplicates complex logic but also introduces a possibility of deadlock when it races against the generic stop_machine(). This patch implements stop_machine_from_inactive_cpu() to serve such use cases. Its functionality is basically the same as stop_machine(); however, it should be called from a CPU which isn't active and doesn't depend on working scheduling on the calling CPU. This is achieved by using busy loops for synchronization and open-coding stop_cpus queuing and waiting with direct invocation of fn() for the local CPU in between. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/20110623182056.982526827@sbsiddha-MOBL3.sc.intel.com Signed-off-by: Suresh Siddha Cc: Ingo Molnar Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: H.
Peter Anvin --- include/linux/stop_machine.h | 14 ++++++++-- kernel/stop_machine.c | 62 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 14d3524d127..e0f2da25d75 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -126,15 +126,19 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); */ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); +int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, + const struct cpumask *cpus); + #else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ static inline int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { + unsigned long flags; int ret; - local_irq_disable(); + local_irq_save(flags); ret = fn(data); - local_irq_enable(); + local_irq_restore(flags); return ret; } @@ -144,5 +148,11 @@ static inline int stop_machine(int (*fn)(void *), void *data, return __stop_machine(fn, data, cpus); } +static inline int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, + const struct cpumask *cpus) +{ + return __stop_machine(fn, data, cpus); +} + #endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4c89ee9fc56..e8f05b14cd4 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -439,8 +439,15 @@ static int stop_machine_cpu_stop(void *data) struct stop_machine_data *smdata = data; enum stopmachine_state curstate = STOPMACHINE_NONE; int cpu = smp_processor_id(), err = 0; + unsigned long flags; bool is_active; + /* + * When called from stop_machine_from_inactive_cpu(), irq might + * already be disabled. Save the state and restore it on exit. + */ + local_save_flags(flags); + if (!smdata->active_cpus) is_active = cpu == cpumask_first(cpu_online_mask); else @@ -468,7 +475,7 @@ static int stop_machine_cpu_stop(void *data) } } while (curstate != STOPMACHINE_EXIT); - local_irq_enable(); + local_irq_restore(flags); return err; } @@ -495,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) } EXPORT_SYMBOL_GPL(stop_machine); +/** + * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU + * @fn: the function to run + * @data: the data ptr for the @fn() + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) + * + * This is identical to stop_machine() but can be called from a CPU which + * is not active. The local CPU is in the process of hotplug (so no other + * CPU hotplug can start) and not marked active and doesn't have enough + * context to sleep. + * + * This function provides stop_machine() functionality for such state by + * using busy-wait for synchronization and executing @fn directly for local + * CPU. + * + * CONTEXT: + * Local CPU is inactive. Temporarily stops all active CPUs. + * + * RETURNS: + * 0 if all executions of @fn returned 0, any non zero return value if any + * returned non zero. + */ +int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, + const struct cpumask *cpus) +{ + struct stop_machine_data smdata = { .fn = fn, .data = data, + .active_cpus = cpus }; + struct cpu_stop_done done; + int ret; + + /* Local CPU must be inactive and CPU hotplug in progress. 
*/ + BUG_ON(cpu_active(raw_smp_processor_id())); + smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ + + /* No proper task established and can't sleep - busy wait for lock. */ + while (!mutex_trylock(&stop_cpus_mutex)) + cpu_relax(); + + /* Schedule work on other CPUs and execute directly for local CPU */ + set_state(&smdata, STOPMACHINE_PREPARE); + cpu_stop_init_done(&done, num_active_cpus()); + queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, + &done); + ret = stop_machine_cpu_stop(&smdata); + + /* Busy wait for completion. */ + while (!completion_done(&done.completion)) + cpu_relax(); + + mutex_unlock(&stop_cpus_mutex); + return ret ?: done.ret; +} + #endif /* CONFIG_STOP_MACHINE */ -- cgit v1.2.3-70-g09d2 From 192d8857427dd23707d5f0b86ca990c3af6f2d74 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Jun 2011 11:19:29 -0700 Subject: x86, mtrr: use stop_machine APIs for doing MTRR rendezvous The MTRR rendezvous sequence was not implemented using stop_machine() before, as it gets called both from the process context as well as the cpu online paths (where the cpu has not come online and the interrupts are disabled etc). Now that we have a new stop_machine_from_inactive_cpu() API, use it for rendezvous during mtrr init of a logical processor that is coming online. For the rest (runtime MTRR modification, system boot, resume paths), use stop_machine() to implement the rendezvous sequence. This will consolidate and clean up the code. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/20110623182057.076997177@sbsiddha-MOBL3.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 192 +++++++++------------------------------- include/linux/stop_machine.h | 2 - kernel/stop_machine.c | 2 +- 3 files changed, 42 insertions(+), 154 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 3d17bc7f06e..707b6377adf 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -137,55 +137,43 @@ static void __init init_table(void) } struct set_mtrr_data { - atomic_t count; - atomic_t gate; unsigned long smp_base; unsigned long smp_size; unsigned int smp_reg; mtrr_type smp_type; }; -static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); - /** - * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed + * by all the CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static int mtrr_work_handler(void *info) +static int mtrr_rendezvous_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; - unsigned long flags; - - atomic_dec(&data->count); - while (!atomic_read(&data->gate)) - cpu_relax(); - - local_irq_save(flags); - - atomic_dec(&data->count); - while (atomic_read(&data->gate)) - cpu_relax(); - /* The master has cleared me to execute */ + /* + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online).
If we're doing that, @reg is set to something special and on + * all the cpu's we do mtrr_if->set_all() (On the logical cpu that + * started the boot/resume sequence, this might be a duplicate + * set_all()). + */ if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else if (mtrr_aps_delayed_init) { - /* - * Initialize the MTRRs inaddition to the synchronisation. - */ + } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } - - atomic_dec(&data->count); - while (!atomic_read(&data->gate)) - cpu_relax(); - - atomic_dec(&data->count); - local_irq_restore(flags); #endif return 0; } @@ -223,20 +211,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 14. Wait for buddies to catch up * 15. Enable interrupts. * - * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU announces that it started the rendezvous handler by - * decrementing the count, We reset data.count and set the data.gate flag - * allowing all the cpu's to proceed with the work. As each cpu disables - * interrupts, it'll decrement data.count once. We wait until it hits 0 and - * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they - * are waiting for that flag to be cleared. Once it's cleared, each - * CPU goes through the transition of updating MTRRs. - * The CPU vendors may each do it differently, - * so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate - * to be set. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag - * Everyone then enables interrupts and we all continue on. + * What does that mean for us? Well, stop_machine() will ensure that + * the rendezvous handler is started on each CPU. And in lockstep they + * do the state transition of disabling interrupts, updating MTRR's + * (the CPU vendors may each do it differently, so we call mtrr_if->set() + * callback and let them take care of it.) and enabling interrupts. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. @@ -244,115 +223,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { - struct set_mtrr_data data; - unsigned long flags; - int cpu; - -#ifdef CONFIG_SMP - /* - * If this cpu is not yet active, we are in the cpu online path. There - * can be no stop_machine() in parallel, as stop machine ensures this - * by using get_online_cpus(). We can skip taking the stop_cpus_mutex, - * as we don't need it and also we can't afford to block while waiting - * for the mutex. - * - * If this cpu is active, we need to prevent stop_machine() happening - * in parallel by taking the stop cpus mutex. - * - * Also, this is called in the context of cpu online path or in the - * context where cpu hotplug is prevented. So checking the active status - * of the raw_smp_processor_id() is safe. 
- */ - if (cpu_active(raw_smp_processor_id())) - mutex_lock(&stop_cpus_mutex); -#endif - - preempt_disable(); - - data.smp_reg = reg; - data.smp_base = base; - data.smp_size = size; - data.smp_type = type; - atomic_set(&data.count, num_booting_cpus() - 1); - - /* Make sure data.count is visible before unleashing other CPUs */ - smp_wmb(); - atomic_set(&data.gate, 0); - - /* Start the ball rolling on other CPUs */ - for_each_online_cpu(cpu) { - struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); - - if (cpu == smp_processor_id()) - continue; + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; - stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); - } - - - while (atomic_read(&data.count)) - cpu_relax(); - - /* Ok, reset count and toggle gate */ - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 1); - - local_irq_save(flags); - - while (atomic_read(&data.count)) - cpu_relax(); - - /* Ok, reset count and toggle gate */ - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 0); - - /* Do our MTRR business */ - - /* - * HACK! - * - * We use this same function to initialize the mtrrs during boot, - * resume, runtime cpu online and on an explicit request to set a - * specific MTRR. - * - * During boot or suspend, the state of the boot cpu's mtrrs has been - * saved, and we want to replicate that across all the cpus that come - * online (either at the end of boot or resume or during a runtime cpu - * online). If we're doing that, @reg is set to something special and on - * this cpu we still do mtrr_if->set_all(). During boot/resume, this - * is unnecessary if at this point we are still on the cpu that started - * the boot/resume sequence. But there is no guarantee that we are still - * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be - * sure that we are in sync with everyone else. - */ - if (reg != ~0U) - mtrr_if->set(reg, base, size, type); - else - mtrr_if->set_all(); - - /* Wait for the others */ - while (atomic_read(&data.count)) - cpu_relax(); - - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 1); - - /* - * Wait here for everyone to have seen the gate change - * So we're the last ones to touch 'data' - */ - while (atomic_read(&data.count)) - cpu_relax(); + stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); +} - local_irq_restore(flags); - preempt_enable(); -#ifdef CONFIG_SMP - if (cpu_active(raw_smp_processor_id())) - mutex_unlock(&stop_cpus_mutex); -#endif +static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, + cpu_callout_mask); } /** @@ -806,7 +696,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. 
We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - set_mtrr(~0U, 0, 0, 0); + set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); } /** diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index e0f2da25d75..4a9d0c7edc6 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -27,8 +27,6 @@ struct cpu_stop_work { struct cpu_stop_done *done; }; -extern struct mutex stop_cpus_mutex; - int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e8f05b14cd4..c1124752e1d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -132,8 +132,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); } -DEFINE_MUTEX(stop_cpus_mutex); /* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, -- cgit v1.2.3-70-g09d2 From 131ad62d8fc06d9d0a5c61d9526876352c2f2bbd Mon Sep 17 00:00:00 2001 From: Mr Dash Four Date: Thu, 30 Jun 2011 13:31:57 +0200 Subject: netfilter: add SELinux context support to AUDIT target In this revision the conversion of secid to SELinux context and adding it to the audit log is moved from xt_AUDIT.c to audit.c with the aid of a separate helper function - audit_log_secctx - which does both the conversion and logging of SELinux context, thus also preventing internal secid number being leaked to userspace. If conversion is not successful an error is raised. With the introduction of this helper function the work done in xt_AUDIT.c is much more simplified. It also opens the possibility of this helper function being used by other modules (including auditd itself), if desired. With this addition, typical (raw auditd) output after applying the patch would be: type=NETFILTER_PKT msg=audit(1305852240.082:31012): action=0 hook=1 len=52 inif=? outif=eth0 saddr=10.1.1.7 daddr=10.1.2.1 ipid=16312 proto=6 sport=56150 dport=22 obj=system_u:object_r:ssh_client_packet_t:s0 type=NETFILTER_PKT msg=audit(1306772064.079:56): action=0 hook=3 len=48 inif=eth0 outif=? 
smac=00:05:5d:7c:27:0b dmac=00:02:b3:0a:7f:81 macproto=0x0800 saddr=10.1.2.1 daddr=10.1.1.7 ipid=462 proto=6 sport=22 dport=3561 obj=system_u:object_r:ssh_server_packet_t:s0 Acked-by: Eric Paris Signed-off-by: Mr Dash Four Signed-off-by: Patrick McHardy --- include/linux/audit.h | 7 +++++++ kernel/audit.c | 29 +++++++++++++++++++++++++++++ net/netfilter/xt_AUDIT.c | 5 +++++ 3 files changed, 41 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 9d339eb2788..0c8006129fb 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -613,6 +613,12 @@ extern void audit_log_d_path(struct audit_buffer *ab, extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_lost(const char *message); +#ifdef CONFIG_SECURITY +extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); +#else +#define audit_log_secctx(b,s) do { ; } while (0) +#endif + extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ @@ -635,6 +641,7 @@ extern int audit_enabled; #define audit_log_untrustedstring(a,s) do { ; } while (0) #define audit_log_d_path(b, p, d) do { ; } while (0) #define audit_log_key(b, k) do { ; } while (0) +#define audit_log_secctx(b,s) do { ; } while (0) #define audit_enabled 0 #endif #endif diff --git a/kernel/audit.c b/kernel/audit.c index 93950031706..52501b5d490 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -55,6 +55,9 @@ #include #include #include +#ifdef CONFIG_SECURITY +#include +#endif #include #include #include @@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, } } +#ifdef CONFIG_SECURITY +/** + * audit_log_secctx - Converts and logs SELinux context + * @ab: audit_buffer + * @secid: security number + * + * This is a helper function that calls security_secid_to_secctx to convert + * secid to secctx and then adds the (converted) SELinux context to the audit + * log by calling audit_log_format, thus also preventing leak of internal secid + * to userspace. If secid cannot be converted audit_panic is called. + */ +void audit_log_secctx(struct audit_buffer *ab, u32 secid) +{ + u32 len; + char *secctx; + + if (security_secid_to_secctx(secid, &secctx, &len)) { + audit_panic("Cannot convert secid to context"); + } else { + audit_log_format(ab, " obj=%s", secctx); + security_release_secctx(secctx, len); + } +} +EXPORT_SYMBOL(audit_log_secctx); +#endif + EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 363a99ec063..4bca15a0c38 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -163,6 +163,11 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par) break; } +#ifdef CONFIG_NETWORK_SECMARK + if (skb->secmark) + audit_log_secctx(ab, skb->secmark); +#endif + audit_log_end(ab); errout: -- cgit v1.2.3-70-g09d2 From 2a46dae38087e62dd5fb08a6dadf1407717ed13c Mon Sep 17 00:00:00 2001 From: "Nikunj A. Dadhania" Date: Tue, 7 Jun 2011 15:43:22 +0530 Subject: sched: Remove rcu_read_lock() from wake_affine() wake_affine() is only called from one path: select_task_rq_fair(), which already has the RCU read lock held. Signed-off-by: Nikunj A. Dadhania Signed-off-by: Peter Zijlstra Cc: Paul E. 
McKenney Link: http://lkml.kernel.org/r/20110607101251.777.34547.stgit@IBM-009124035060.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 433491c2dc8..eb98f77b38e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1481,7 +1481,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * effect of the currently running task from the load * of the current CPU: */ - rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1517,7 +1516,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; } else balanced = true; - rcu_read_unlock(); /* * If the currently running task will sleep within -- cgit v1.2.3-70-g09d2 From 307bf9803f25a8a3f53c1012110fb74e2f893eb0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 15:08:55 +0200 Subject: sched: Simplify mutex_spin_on_owner() It does not make sense to rcu_read_lock/unlock() in every loop iteration while spinning on the mutex. Move the rcu protection outside the loop. Also simplify the return path to always check for lock->owner == NULL which meets the requirements of both owner changed and need_resched() caused loop exits. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1106101458350.11814@ionos Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 59252754fbe..e355ee72e83 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4306,11 +4306,8 @@ EXPORT_SYMBOL(schedule); static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { - bool ret = false; - - rcu_read_lock(); if (lock->owner != owner) - goto fail; + return false; /* * Ensure we emit the owner->on_cpu, dereference _after_ checking @@ -4320,11 +4317,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) */ barrier(); - ret = owner->on_cpu; -fail: - rcu_read_unlock(); - - return ret; + return owner->on_cpu; } /* @@ -4336,21 +4329,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (!sched_feat(OWNER_SPIN)) return 0; + rcu_read_lock(); while (owner_running(lock, owner)) { if (need_resched()) - return 0; + break; arch_mutex_cpu_relax(); } + rcu_read_unlock(); /* - * If the owner changed to another task there is likely - * heavy contention, stop spinning. + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. */ - if (lock->owner) - return 0; - - return 1; + return lock->owner == NULL; } #endif -- cgit v1.2.3-70-g09d2 From 1c09ab0d257317f97e8629a3d0c8713d6dd9de4c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 28 Jun 2011 10:51:31 +0800 Subject: sched: Skip autogroup when looking for all rt sched groups Since commit ec514c48 ("sched: Fix rt_rq runtime leakage bug") 'cat /proc/sched_debug' will print data of root_task_group.rt_rq multiple times. This is because autogroup does not have its own rt group, instead rt group of autogroup is linked to root_task_group. 
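For reference, the helper the new iterator below leans on already exists on the autogroup side; a minimal sketch of it, assuming the struct task_group/struct autogroup layout of this kernel era (illustrative, not part of this patch):

	static inline bool task_group_is_autogroup(struct task_group *tg)
	{
		/* an autogroup has no rt group of its own: its rt_rq
		 * entries alias root_task_group's, so iterators that
		 * walk all rt_rqs must skip such a tg */
		return !!tg->autogroup;
	}
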
So skip it when we are looking for all rt sched groups; this also saves some no-op operations against root_task_group in __disable_runtime()/__enable_runtime(). -v2: Based on Cheng Xu's idea which uses less code. Signed-off-by: Yong Zhang Cc: Mike Galbraith Cc: Cheng Xu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTi=87P3RoTF_UEtamNfc_XGxQXE__Q@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.h | 1 + kernel/sched_rt.c | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 05577055cfc..c2f0e7248dc 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -13,6 +13,7 @@ struct autogroup { int nice; }; +static inline bool task_group_is_autogroup(struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b03cd89fee2..97540f0c9e4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -185,11 +185,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) typedef struct task_group *rt_rq_iter_t; -#define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ - (&iter->list != &task_groups) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]); \ - iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) +static inline struct task_group *next_task_group(struct task_group *tg) +{ + do { + tg = list_entry_rcu(tg->list.next, + typeof(struct task_group), list); + } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); + + if (&tg->list == &task_groups) + tg = NULL; + + return tg; +} + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for (iter = container_of(&task_groups, typeof(*iter), list); \ + (iter = next_task_group(iter)) && \ + (rt_rq = iter->rt_rq[cpu_of(rq)]);) static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) { -- cgit v1.2.3-70-g09d2 From 4ec8363dfc1451f8c8f86825731fe712798ada02 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 1 Jun 2011 15:15:36 -0400 Subject: perf_events: Fix perf buffer watermark setting Since 2.6.36 (specifically commit d57e34fdd60b ("perf: Simplify the ring-buffer logic: make perf_buffer_alloc() do everything needed")), the perf_buffer_init() code has been mis-setting the buffer watermark if perf_event_attr.wakeup_events has a non-zero value. This is because perf_event_attr.wakeup_events is a union with perf_event_attr.wakeup_watermark. This commit re-enables the check for perf_event_attr.watermark being set before continuing with setting a non-default watermark. This bug is most noticeable when you are trying to use PERF_IOC_REFRESH with a value larger than one and perf_event_attr.wakeup_events is set to one. In this case the buffer watermark will be set to 1 and you will get extraneous POLL_IN overflows rather than POLL_HUP as expected.
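To make the aliasing concrete, a minimal sketch of the overlay in the uapi struct and the disambiguation the fix applies (field names as in the perf_event_attr of this era; illustrative only, not the patch itself):

	union {
		__u32 wakeup_events;	/* wakeup every n events */
		__u32 wakeup_watermark;	/* bytes before wakeup */
	};

	/* only the attr.watermark flag says which member is meaningful */
	watermark = attr.watermark ? attr.wakeup_watermark : 0;
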
[ avoid using attr.wakeup_events when attr.watermark is set ] Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1106011506390.5384@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- kernel/events/ring_buffer.c | 16 +++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5e70f62752a..e4aee519572 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3569,8 +3569,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - rb = rb_alloc(nr_pages, event->attr.wakeup_watermark, - event->cpu, flags); + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); + if (!rb) { ret = -ENOMEM; goto unlock; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index fde52595d8f..fc2701c9920 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -199,13 +199,15 @@ void perf_output_end(struct perf_output_handle *handle) struct perf_event *event = handle->event; struct ring_buffer *rb = handle->rb; - int wakeup_events = event->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = local_inc_return(&rb->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &rb->events); - local_inc(&rb->wakeup); + if (handle->sample && !event->attr.watermark) { + int wakeup_events = event->attr.wakeup_events; + + if (wakeup_events) { + int events = local_inc_return(&rb->events); + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } } } -- cgit v1.2.3-70-g09d2 From b7526f0ca6dc68f57ca467ce503151b1d476a3e4 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 23 Jun 2011 16:34:37 -0400 Subject: events: Add note to update_event_times comment about holding ctx->lock Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1308861279-15216-1-git-send-email-emunson@mgebm.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e4aee519572..2293e0b084a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -748,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. + * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { -- cgit v1.2.3-70-g09d2 From c4794295917ebeda8013b6cb9c8d71ab4f74a1fa Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 23 Jun 2011 16:34:38 -0400 Subject: events: Move lockless timer calculation into helper function Take the timer calculation from perf_output_read and move it to a helper function for any place that needs timer values but cannot take the ctx->lock. 
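As an annotated restatement of the arithmetic being factored out (the same four lines the diff below moves into calc_timer_values(); nothing is assumed here beyond the comments):

	now = perf_clock();
	ctx_time = event->shadow_ctx_time + now;	/* context time, no ctx->lock needed */
	*enabled = ctx_time - event->tstamp_enabled;	/* total time the event was enabled */
	*running = ctx_time - event->tstamp_running;	/* total time it was actually scheduled in */
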
Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1308861279-15216-2-git-send-email-emunson@mgebm.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2293e0b084a..c851d707821 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3351,6 +3351,18 @@ static int perf_event_index(struct perf_event *event) return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; } +static void calc_timer_values(struct perf_event *event, + u64 *running, + u64 *enabled) +{ + u64 now, ctx_time; + + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + *enabled = ctx_time - event->tstamp_enabled; + *running = ctx_time - event->tstamp_running; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3816,7 +3828,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0, now, ctx_time; + u64 enabled = 0, running = 0; u64 read_format = event->attr.read_format; /* @@ -3828,12 +3840,8 @@ static void perf_output_read(struct perf_output_handle *handle, * because of locking issue as we are called in * NMI context */ - if (read_format & PERF_FORMAT_TOTAL_TIMES) { - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; - enabled = ctx_time - event->tstamp_enabled; - running = ctx_time - event->tstamp_running; - } + if (read_format & PERF_FORMAT_TOTAL_TIMES) + calc_timer_values(event, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); -- cgit v1.2.3-70-g09d2 From 0d6412085b7ff58612af52e51ffa864f0df4b8fd Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Fri, 24 Jun 2011 12:26:26 -0400 Subject: events: Ensure that timers are updated without requiring read() call The event tracing infrastructure exposes two timers which should be updated each time the value of the counter is updated. Currently, these counters are only updated when userspace calls read() on the fd associated with an event. This means that counters which are read via the mmap'd page exclusively never have their timers updated. This patch ensures that the timers are updated each time the values in the mmap'd page are updated. Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1308932786-5111-1-git-send-email-emunson@mgebm.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c851d707821..270e32f9fc0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3372,8 +3372,19 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; + u64 enabled, running; rcu_read_lock(); + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in.
+ * + * we cannot simply called update_context_time() + * because of locking issue as we can be called in + * NMI context + */ + calc_timer_values(event, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3392,10 +3403,10 @@ void perf_event_update_userpage(struct perf_event *event) if (event->state == PERF_EVENT_STATE_ACTIVE) userpg->offset -= local64_read(&event->hw.prev_count); - userpg->time_enabled = event->total_time_enabled + + userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); - userpg->time_running = event->total_time_running + + userpg->time_running = running + atomic64_read(&event->child_total_time_running); barrier(); -- cgit v1.2.3-70-g09d2 From 1880c4ae182afb5650c5678949ecfe7ff66a724e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Jun 2011 16:49:18 +0400 Subject: perf, x86: Add hw_watchdog_set_attr() for the sake of nmi-watchdog on P4 Due to restrictions and specifics of the Netburst PMU we need a separate event for the NMI watchdog. In particular every Netburst event consumes not just a counter and a config register, but also an additional ESCR register. Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied for some event there is no room for another event to enter until it's released) we need to pick up the "least" used ESCR (or the most available one) for nmi-watchdog purposes -- so MSR_P4_CRU_ESCR2/3 was chosen. With this patch nmi-watchdog and perf top should be able to run simultaneously. Signed-off-by: Cyrill Gorcunov CC: Lin Ming CC: Arnaldo Carvalho de Melo CC: Frederic Weisbecker Tested-and-reviewed-by: Don Zickus Tested-and-reviewed-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110623124918.GC13050@sun Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++++++ arch/x86/kernel/cpu/perf_event_p4.c | 26 ++++++++++++++++++++++++++ kernel/watchdog.c | 6 +++++- 3 files changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3a0338b4b17..8a57f9aa8e3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -233,6 +233,7 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -315,6 +316,12 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) +{ + if (x86_pmu.hw_watchdog_set_attr) + x86_pmu.hw_watchdog_set_attr(wd_attr); +} + /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ead584fb6a7..f76fddf6338 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -705,6 +705,31 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } +static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) +{ + /* + * Watchdog ticks are special on Netburst, we use + * that named "non-sleeping" ticks as recommended + * by Intel SDM Vol3b.
+ */ + WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || + wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); + + wd_attr->type = PERF_TYPE_RAW; + wd_attr->config = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE); +} + static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1179,6 +1204,7 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3d0c56ad479..752b75ba662 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -200,6 +200,8 @@ static int is_softlockup(unsigned long touch_ts) } #ifdef CONFIG_HARDLOCKUP_DETECTOR +void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { } + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -368,9 +370,11 @@ static int watchdog_nmi_enable(int cpu) if (event != NULL) goto out_enable; - /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + hw_nmi_watchdog_set_attr(wd_attr); + + /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); -- cgit v1.2.3-70-g09d2 From a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 14:41:57 +0200 Subject: perf: Remove the nmi parameter from the swevent and overflow interface The nmi parameter indicated if we could do wakeups from the current context, if not, we would set some state and self-IPI and let the resulting interrupt do the wakeup. For the various event classes: - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from the PMI-tail (ARM etc.) - tracepoint: nmi=0; since tracepoint could be from NMI context. - software: nmi=[0,1]; some, like the schedule thing cannot perform wakeups, and hence need 0. As one can see, there is very little nmi=1 usage, and the down-side of not using it is that on some platforms some software events can have a jiffy delay in wakeup (when arch_irq_work_raise isn't implemented). The up-side however is that we can remove the nmi parameter and save a bunch of conditionals in fast paths. Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. 
Miller Cc: Frederic Weisbecker Cc: Jason Wessel Cc: Don Zickus Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/kernel/perf_event.c | 2 +- arch/arm/kernel/perf_event_v6.c | 2 +- arch/arm/kernel/perf_event_v7.c | 2 +- arch/arm/kernel/perf_event_xscale.c | 4 +- arch/arm/kernel/ptrace.c | 2 +- arch/arm/kernel/swp_emulate.c | 2 +- arch/arm/mm/fault.c | 6 +-- arch/mips/kernel/perf_event.c | 2 +- arch/mips/kernel/traps.c | 8 ++-- arch/mips/kernel/unaligned.c | 5 +-- arch/mips/math-emu/cp1emu.c | 3 +- arch/mips/mm/fault.c | 8 ++-- arch/powerpc/include/asm/emulated_ops.h | 4 +- arch/powerpc/kernel/perf_event.c | 6 +-- arch/powerpc/kernel/perf_event_fsl_emb.c | 6 +-- arch/powerpc/kernel/ptrace.c | 2 +- arch/powerpc/mm/fault.c | 6 +-- arch/s390/mm/fault.c | 6 +-- arch/sh/kernel/ptrace_32.c | 2 +- arch/sh/kernel/traps_32.c | 2 +- arch/sh/kernel/traps_64.c | 8 ++-- arch/sh/math-emu/math.c | 2 +- arch/sh/mm/fault_32.c | 6 +-- arch/sh/mm/tlbflush_64.c | 6 +-- arch/sparc/kernel/perf_event.c | 2 +- arch/sparc/kernel/unaligned_32.c | 4 +- arch/sparc/kernel/unaligned_64.c | 12 +++--- arch/sparc/kernel/visemul.c | 2 +- arch/sparc/math-emu/math_32.c | 2 +- arch/sparc/math-emu/math_64.c | 2 +- arch/sparc/mm/fault_32.c | 8 ++-- arch/sparc/mm/fault_64.c | 8 ++-- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 +- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/mm/fault.c | 6 +-- include/linux/perf_event.h | 18 ++++----- kernel/events/core.c | 63 ++++++++++++++----------------- kernel/events/internal.h | 1 - kernel/events/ring_buffer.c | 10 ++--- kernel/sched.c | 2 +- kernel/watchdog.c | 2 +- samples/hw_breakpoint/data_breakpoint.c | 2 +- 46 files changed, 119 insertions(+), 141 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 90561c45e7d..8e47709160f 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -847,7 +847,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, data.period = event->hw.last_period; if (alpha_perf_event_set_period(event, hwc, idx)) { - if (perf_event_overflow(event, 1, &data, regs)) { + if (perf_event_overflow(event, &data, regs)) { /* Interrupts coming too quickly; "throttle" the * counter, i.e., disable it for a little while. 
*/ diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c index f1e8dd94afe..38dc4da1d61 100644 --- a/arch/arm/kernel/perf_event_v6.c +++ b/arch/arm/kernel/perf_event_v6.c @@ -479,7 +479,7 @@ armv6pmu_handle_irq(int irq_num, if (!armpmu_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) armpmu->disable(hwc, idx); } diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 4960686afb5..6e5f8752303 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -787,7 +787,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) if (!armpmu_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) armpmu->disable(hwc, idx); } diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c index 39affbe4fdb..99b6b85c7e4 100644 --- a/arch/arm/kernel/perf_event_xscale.c +++ b/arch/arm/kernel/perf_event_xscale.c @@ -251,7 +251,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev) if (!armpmu_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) armpmu->disable(hwc, idx); } @@ -583,7 +583,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev) if (!armpmu_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) armpmu->disable(hwc, idx); } diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 97260060bf2..0c9b1054f79 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -396,7 +396,7 @@ static long ptrace_hbp_idx_to_num(int idx) /* * Handle hitting a HW-breakpoint. 
*/ -static void ptrace_hbptriggered(struct perf_event *bp, int unused, +static void ptrace_hbptriggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c index 40ee7e5045e..5f452f8fde0 100644 --- a/arch/arm/kernel/swp_emulate.c +++ b/arch/arm/kernel/swp_emulate.c @@ -183,7 +183,7 @@ static int swp_handler(struct pt_regs *regs, unsigned int instr) unsigned int address, destreg, data, type; unsigned int res = 0; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, regs->ARM_pc); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->ARM_pc); if (current->pid != previous_pid) { pr_debug("\"%s\" (%ld) uses deprecated SWP{B} instruction\n", diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index bc0e1d88fd3..9ea4f7ddd66 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -318,11 +318,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) fault = __do_page_fault(mm, addr, fsr, tsk); up_read(&mm->mmap_sem); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (fault & VM_FAULT_MAJOR) - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, addr); else if (fault & VM_FAULT_MINOR) - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, addr); /* * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index a8244854d3d..d0deaab9ace 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -527,7 +527,7 @@ handle_associated_event(struct cpu_hw_events *cpuc, if (!mipspmu_event_set_period(event, hwc, idx)) return; - if (perf_event_overflow(event, 0, data, regs)) + if (perf_event_overflow(event, data, regs)) mipspmu->disable_event(idx); } diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e9b3af27d84..b7517e3abc8 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -578,12 +578,12 @@ static int simulate_llsc(struct pt_regs *regs, unsigned int opcode) { if ((opcode & OPCODE) == LL) { perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + 1, regs, 0); return simulate_ll(regs, opcode); } if ((opcode & OPCODE) == SC) { perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + 1, regs, 0); return simulate_sc(regs, opcode); } @@ -602,7 +602,7 @@ static int simulate_rdhwr(struct pt_regs *regs, unsigned int opcode) int rd = (opcode & RD) >> 11; int rt = (opcode & RT) >> 16; perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + 1, regs, 0); switch (rd) { case 0: /* CPU number */ regs->regs[rt] = smp_processor_id(); @@ -640,7 +640,7 @@ static int simulate_sync(struct pt_regs *regs, unsigned int opcode) { if ((opcode & OPCODE) == SPEC0 && (opcode & FUNC) == SYNC) { perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + 1, regs, 0); return 0; } diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index cfea1adfa15..eb319b58035 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -111,8 +111,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, unsigned long value; unsigned int res; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); /* * This 
load never faults. @@ -517,7 +516,7 @@ asmlinkage void do_ade(struct pt_regs *regs) mm_segment_t seg; perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, - 1, 0, regs, regs->cp0_badvaddr); + 1, regs, regs->cp0_badvaddr); /* * Did we catch a fault trying to load an instruction? * Or are we running in MIPS16 mode? diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index d32cb050311..dbf2f93a509 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -272,8 +272,7 @@ static int cop1Emulate(struct pt_regs *xcp, struct mips_fpu_struct *ctx, } emul: - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, - 1, 0, xcp, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, xcp, 0); MIPS_FPU_EMU_INC_STATS(emulated); switch (MIPSInst_OPCODE(ir)) { case ldc1_op:{ diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 137ee76a004..937cf336816 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -145,7 +145,7 @@ good_area: * the fault. */ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -154,12 +154,10 @@ good_area: BUG(); } if (fault & VM_FAULT_MAJOR) { - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, - 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); tsk->maj_flt++; } else { - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, - 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); tsk->min_flt++; } diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h index 45921672b97..2cc41c715d2 100644 --- a/arch/powerpc/include/asm/emulated_ops.h +++ b/arch/powerpc/include/asm/emulated_ops.h @@ -78,14 +78,14 @@ extern void ppc_warn_emulated_print(const char *type); #define PPC_WARN_EMULATED(type, regs) \ do { \ perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, \ - 1, 0, regs, 0); \ + 1, regs, 0); \ __PPC_WARN_EMULATED(type); \ } while (0) #define PPC_WARN_ALIGNMENT(type, regs) \ do { \ perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, \ - 1, 0, regs, regs->dar); \ + 1, regs, regs->dar); \ __PPC_WARN_EMULATED(type); \ } while (0) diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 822f63008ae..14967de9887 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1207,7 +1207,7 @@ struct pmu power_pmu = { * here so there is no possibility of being interrupted. 
*/ static void record_and_restart(struct perf_event *event, unsigned long val, - struct pt_regs *regs, int nmi) + struct pt_regs *regs) { u64 period = event->hw.sample_period; s64 prev, delta, left; @@ -1258,7 +1258,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); - if (perf_event_overflow(event, nmi, &data, regs)) + if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } } @@ -1346,7 +1346,7 @@ static void perf_event_interrupt(struct pt_regs *regs) if ((int)val < 0) { /* event has overflowed */ found = 1; - record_and_restart(event, val, regs, nmi); + record_and_restart(event, val, regs); } } diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c index b0dc8f7069c..0a6d2a9d569 100644 --- a/arch/powerpc/kernel/perf_event_fsl_emb.c +++ b/arch/powerpc/kernel/perf_event_fsl_emb.c @@ -568,7 +568,7 @@ static struct pmu fsl_emb_pmu = { * here so there is no possibility of being interrupted. */ static void record_and_restart(struct perf_event *event, unsigned long val, - struct pt_regs *regs, int nmi) + struct pt_regs *regs) { u64 period = event->hw.sample_period; s64 prev, delta, left; @@ -616,7 +616,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, perf_sample_data_init(&data, 0); data.period = event->hw.last_period; - if (perf_event_overflow(event, nmi, &data, regs)) + if (perf_event_overflow(event, &data, regs)) fsl_emb_pmu_stop(event, 0); } } @@ -644,7 +644,7 @@ static void perf_event_interrupt(struct pt_regs *regs) if (event) { /* event has overflowed */ found = 1; - record_and_restart(event, val, regs, nmi); + record_and_restart(event, val, regs); } else { /* * Disabled counter is negative, diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index cb22024f2b4..3177617af2e 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -882,7 +882,7 @@ void user_disable_single_step(struct task_struct *task) } #ifdef CONFIG_HAVE_HW_BREAKPOINT -void ptrace_triggered(struct perf_event *bp, int nmi, +void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { struct perf_event_attr attr; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 54f4fb994e9..dbc48254c6c 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -173,7 +173,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in the @@ -319,7 +319,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -330,7 +330,7 @@ good_area: #endif } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index fe103e891e7..095f782a551 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -299,7 +299,7 @@ static inline int do_exception(struct pt_regs *regs, int access, goto out; address = trans_exc_code & __FAIL_ADDR_MASK; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_ALLOW_RETRY; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) flags |= FAULT_FLAG_WRITE; @@ -345,11 +345,11 @@ retry: if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 3d7b209b217..8051976100a 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -63,7 +63,7 @@ static inline int put_stack_long(struct task_struct *task, int offset, return 0; } -void ptrace_triggered(struct perf_event *bp, int nmi, +void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { struct perf_event_attr attr; diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index b51a17104b5..d9006f8ffc1 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -393,7 +393,7 @@ int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs, */ if (!expected) { unaligned_fixups_notify(current, instruction, regs); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, address); } diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 6713ca97e55..67110be83fd 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -434,7 +434,7 @@ static int misaligned_load(struct pt_regs *regs, return error; } - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, address); destreg = (opcode >> 4) & 0x3f; if (user_mode(regs)) { @@ -512,7 +512,7 @@ static int misaligned_store(struct pt_regs *regs, return error; } - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, address); srcreg = (opcode >> 4) & 0x3f; if (user_mode(regs)) { @@ -588,7 +588,7 @@ static int misaligned_fpu_load(struct pt_regs *regs, return error; } - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, address); destreg = (opcode >> 4) & 0x3f; if (user_mode(regs)) { @@ -665,7 +665,7 @@ static int misaligned_fpu_store(struct pt_regs *regs, return error; } - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 
address); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, address); srcreg = (opcode >> 4) & 0x3f; if (user_mode(regs)) { diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index f76a5090d5d..97719521065 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -620,7 +620,7 @@ int do_fpu_inst(unsigned short inst, struct pt_regs *regs) struct task_struct *tsk = current; struct sh_fpu_soft_struct *fpu = &(tsk->thread.xstate->softfpu); - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); if (!(task_thread_info(tsk)->status & TS_USEDFPU)) { /* initialize once. */ diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index d4c34d757f0..7bebd044f2a 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -160,7 +160,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if ((regs->sr & SR_IMASK) != SR_IMASK) local_irq_enable(); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -210,11 +210,11 @@ good_area: } if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index 7f5810f5dfd..e3430e093d4 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess, /* Not an IO address, so reenable interrupts */ local_irq_enable(); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt or have no user @@ -200,11 +200,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 2cb0e1c001e..0b32f2e9e08 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1277,7 +1277,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, if (!sparc_perf_event_set_period(event, hwc, idx)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) sparc_pmu_stop(event, 0); } diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index 4491f4cb269..7efbb2f9e77 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -247,7 +247,7 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn) unsigned long addr = compute_effective_address(regs, insn); int err; - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); switch (dir) { case load: err = do_int_load(fetch_reg_addr(((insn>>25)&0x1f), @@ -338,7 +338,7 @@ asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn) } addr = compute_effective_address(regs, insn); - 
perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); switch(dir) { case load: err = do_int_load(fetch_reg_addr(((insn>>25)&0x1f), diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c index b2b019ea8ca..35cff1673aa 100644 --- a/arch/sparc/kernel/unaligned_64.c +++ b/arch/sparc/kernel/unaligned_64.c @@ -317,7 +317,7 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn) addr = compute_effective_address(regs, insn, ((insn >> 25) & 0x1f)); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, addr); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); switch (asi) { case ASI_NL: case ASI_AIUPL: @@ -384,7 +384,7 @@ int handle_popc(u32 insn, struct pt_regs *regs) int ret, i, rd = ((insn >> 25) & 0x1f); int from_kernel = (regs->tstate & TSTATE_PRIV) != 0; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); if (insn & 0x2000) { maybe_flush_windows(0, 0, rd, from_kernel); value = sign_extend_imm13(insn); @@ -431,7 +431,7 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs) int asi = decode_asi(insn, regs); int flag = (freg < 32) ? FPRS_DL : FPRS_DU; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); save_and_clear_fpu(); current_thread_info()->xfsr[0] &= ~0x1c000; @@ -554,7 +554,7 @@ void handle_ld_nf(u32 insn, struct pt_regs *regs) int from_kernel = (regs->tstate & TSTATE_PRIV) != 0; unsigned long *reg; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); maybe_flush_windows(0, 0, rd, from_kernel); reg = fetch_reg_addr(rd, regs); @@ -586,7 +586,7 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr if (tstate & TSTATE_PRIV) die_if_kernel("lddfmna from kernel", regs); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, sfar); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, sfar); if (test_thread_flag(TIF_32BIT)) pc = (u32)pc; if (get_user(insn, (u32 __user *) pc) != -EFAULT) { @@ -647,7 +647,7 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr if (tstate & TSTATE_PRIV) die_if_kernel("stdfmna from kernel", regs); - perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, sfar); + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, sfar); if (test_thread_flag(TIF_32BIT)) pc = (u32)pc; if (get_user(insn, (u32 __user *) pc) != -EFAULT) { diff --git a/arch/sparc/kernel/visemul.c b/arch/sparc/kernel/visemul.c index 36357717d69..32b626c9d81 100644 --- a/arch/sparc/kernel/visemul.c +++ b/arch/sparc/kernel/visemul.c @@ -802,7 +802,7 @@ int vis_emul(struct pt_regs *regs, unsigned int insn) BUG_ON(regs->tstate & TSTATE_PRIV); - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); if (test_thread_flag(TIF_32BIT)) pc = (u32)pc; diff --git a/arch/sparc/math-emu/math_32.c b/arch/sparc/math-emu/math_32.c index a3fccde894e..aa4d55b0bdf 100644 --- a/arch/sparc/math-emu/math_32.c +++ b/arch/sparc/math-emu/math_32.c @@ -164,7 +164,7 @@ int do_mathemu(struct pt_regs *regs, struct task_struct *fpt) int retcode = 0; /* assume all succeed */ unsigned long insn; - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); #ifdef DEBUG_MATHEMU 
printk("In do_mathemu()... pc is %08lx\n", regs->pc); diff --git a/arch/sparc/math-emu/math_64.c b/arch/sparc/math-emu/math_64.c index 56d2c44747b..e575bd2fe38 100644 --- a/arch/sparc/math-emu/math_64.c +++ b/arch/sparc/math-emu/math_64.c @@ -184,7 +184,7 @@ int do_mathemu(struct pt_regs *regs, struct fpustate *f) if (tstate & TSTATE_PRIV) die_if_kernel("unfinished/unimplemented FPop from kernel", regs); - perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, 0, regs, 0); + perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); if (test_thread_flag(TIF_32BIT)) pc = (u32)pc; if (get_user(insn, (u32 __user *) pc) != -EFAULT) { diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 7543ddbdadb..aa1c1b1ce5c 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -251,7 +251,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, if (in_atomic() || !mm) goto no_context; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); down_read(&mm->mmap_sem); @@ -301,12 +301,10 @@ good_area: } if (fault & VM_FAULT_MAJOR) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } up_read(&mm->mmap_sem); return; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index f92ce56a8b2..504c0622f72 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -325,7 +325,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) if (in_atomic() || !mm) goto intr_or_no_mm; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); if (!down_read_trylock(&mm->mmap_sem)) { if ((regs->tstate & TSTATE_PRIV) && @@ -433,12 +433,10 @@ good_area: } if (fault & VM_FAULT_MAJOR) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8a57f9aa8e3..5b86ec51534 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1339,7 +1339,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 41178c826c4..d38b0020f77 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1003,7 +1003,7 @@ again: data.period = event->hw.last_period; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index bab491b8ee2..0941f93f294 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -340,7 +340,7 @@ static int 
intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, &regs); - if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) + if (perf_output_begin(&handle, event, header.size * (top - at), 1)) return 1; for (; at < top; at++) { @@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, else regs.flags &= ~PERF_EFLAGS_EXACT; - if (perf_event_overflow(event, 1, &data, &regs)) + if (perf_event_overflow(event, &data, &regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index f76fddf6338..d6e6a67b960 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -970,7 +970,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5f9ecff328b..98da6a7b5e8 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -608,7 +608,7 @@ int kgdb_arch_init(void) return register_die_notifier(&kgdb_notifier); } -static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, +static void kgdb_hw_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct task_struct *tsk = current; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 807c2a2b80f..11db2e9b860 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, int nmi, +static void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2dbf6bf4c7e..4d09df054e3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1059,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1161,11 +1161,11 @@ good_area: if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2f7b5d42ab4..0946a8bc098 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -682,7 +682,7 @@ enum perf_event_active_state { struct file; struct perf_sample_data; -typedef void (*perf_overflow_handler_t)(struct perf_event *, int, +typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); @@ -925,7 +925,6 @@ struct perf_output_handle { unsigned long size; void *addr; int page; - int nmi; int sample; }; @@ -993,7 +992,7 @@ extern void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); -extern int perf_event_overflow(struct perf_event *event, int nmi, +extern int
perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); @@ -1012,7 +1011,7 @@ static inline int is_software_event(struct perf_event *event) extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; -extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); +extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } @@ -1034,7 +1033,7 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs) } static __always_inline void -perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; @@ -1043,7 +1042,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; } - __perf_sw_event(event_id, nr, nmi, regs, addr); + __perf_sw_event(event_id, nr, regs, addr); } } @@ -1057,7 +1056,7 @@ static inline void perf_event_task_sched_in(struct task_struct *task) static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); __perf_event_task_sched_out(task, next); } @@ -1119,7 +1118,7 @@ extern void perf_bp_event(struct perf_event *event, void *data); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, - int nmi, int sample); + int sample); extern void perf_output_end(struct perf_output_handle *handle); extern void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); @@ -1143,8 +1142,7 @@ static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline void -perf_sw_event(u32 event_id, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) { } +perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index 270e32f9fc0..dbd1ca75bd3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3972,7 +3972,7 @@ void perf_prepare_sample(struct perf_event_header *header, } } -static void perf_event_output(struct perf_event *event, int nmi, +static void perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { @@ -3984,7 +3984,7 @@ static void perf_event_output(struct perf_event *event, int nmi, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size, nmi, 1)) + if (perf_output_begin(&handle, event, header.size, 1)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -4024,7 +4024,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + ret = perf_output_begin(&handle, event, read_event.header.size, 0); if (ret) return; @@ -4067,7 +4067,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - task_event->event_id.header.size, 0, 0); + task_event->event_id.header.size, 0); if (ret) goto out; @@ 
-4204,7 +4204,7 @@ static void perf_event_comm_output(struct perf_event *event, perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - comm_event->event_id.header.size, 0, 0); + comm_event->event_id.header.size, 0); if (ret) goto out; @@ -4351,7 +4351,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - mmap_event->event_id.header.size, 0, 0); + mmap_event->event_id.header.size, 0); if (ret) goto out; @@ -4546,7 +4546,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, event, - throttle_event.header.size, 1, 0); + throttle_event.header.size, 0); if (ret) return; @@ -4559,7 +4559,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) * Generic event overflow handling, sampling. */ -static int __perf_event_overflow(struct perf_event *event, int nmi, +static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { @@ -4602,34 +4602,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - if (nmi) { - event->pending_disable = 1; - irq_work_queue(&event->pending); - } else - perf_event_disable(event); + event->pending_disable = 1; + irq_work_queue(&event->pending); } if (event->overflow_handler) - event->overflow_handler(event, nmi, data, regs); + event->overflow_handler(event, data, regs); else - perf_event_output(event, nmi, data, regs); + perf_event_output(event, data, regs); if (event->fasync && event->pending_kill) { - if (nmi) { - event->pending_wakeup = 1; - irq_work_queue(&event->pending); - } else - perf_event_wakeup(event); + event->pending_wakeup = 1; + irq_work_queue(&event->pending); } return ret; } -int perf_event_overflow(struct perf_event *event, int nmi, +int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - return __perf_event_overflow(event, nmi, 1, data, regs); + return __perf_event_overflow(event, 1, data, regs); } /* @@ -4678,7 +4672,7 @@ again: } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, - int nmi, struct perf_sample_data *data, + struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; @@ -4692,7 +4686,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, return; for (; overflow; overflow--) { - if (__perf_event_overflow(event, nmi, throttle, + if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when @@ -4705,7 +4699,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } static void perf_swevent_event(struct perf_event *event, u64 nr, - int nmi, struct perf_sample_data *data, + struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; @@ -4719,12 +4713,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, return; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) - return perf_swevent_overflow(event, 1, nmi, data, regs); + return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; - perf_swevent_overflow(event, 0, nmi, data, regs); + 
perf_swevent_overflow(event, 0, data, regs); } static int perf_exclude_event(struct perf_event *event, @@ -4812,7 +4806,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, + u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { @@ -4828,7 +4822,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_event(event, nr, nmi, data, regs); + perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); @@ -4849,8 +4843,7 @@ inline void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; int rctx; @@ -4862,7 +4855,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, perf_sample_data_init(&data, addr); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); perf_swevent_put_recursion_context(rctx); preempt_enable_notrace(); @@ -5110,7 +5103,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, 1, &data, regs); + perf_swevent_event(event, count, &data, regs); } perf_swevent_put_recursion_context(rctx); @@ -5203,7 +5196,7 @@ void perf_bp_event(struct perf_event *bp, void *data) perf_sample_data_init(&sample, bp->attr.bp_addr); if (!bp->hw.state && !perf_exclude_event(bp, regs)) - perf_swevent_event(bp, 1, 1, &sample, regs); + perf_swevent_event(bp, 1, &sample, regs); } #endif @@ -5232,7 +5225,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 114f27f3a62..09097dd8116 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -27,7 +27,6 @@ struct ring_buffer { void *data_pages[0]; }; - extern void rb_free(struct ring_buffer *rb); extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index fc2701c9920..8b3b73630fa 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -38,11 +38,8 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, POLL_IN); - if (handle->nmi) { - handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); - } else - perf_event_wakeup(handle->event); + handle->event->pending_wakeup = 1; + irq_work_queue(&handle->event->pending); } /* @@ -102,7 +99,7 @@ out: int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, - int nmi, int sample) + int sample) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -127,7 +124,6 @@ int perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; - handle->nmi = nmi; handle->sample = 
sample; if (!rb->nr_pages) diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609..d08d110b897 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2220,7 +2220,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } __set_task_cpu(p, new_cpu); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 752b75ba662..a6708e677a0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -211,7 +211,7 @@ static struct perf_event_attr wd_hw_attr = { }; /* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, int nmi, +static void watchdog_overflow_callback(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index 063653955f9..7b164d3200f 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -41,7 +41,7 @@ module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" " write operations on the kernel symbol"); -static void sample_hbp_handler(struct perf_event *bp, int nmi, +static void sample_hbp_handler(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { -- cgit v1.2.3-70-g09d2 From a7ac67ea021b4603095d2aa458bc41641238f22c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 16:47:16 +0200 Subject: perf: Remove the perf_output_begin(.sample) argument Since only samples call perf_output_sample() it's much saner (and more correct) to put the sample logic in there than in the perf_output_begin()/perf_output_end() pair. Saves a useless argument, reduces conditionals and shrinks struct perf_output_handle, win!
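For illustration, a sampled output path after this change reduces to the sketch below, modeled on perf_event_output() in the diff that follows (the function name is only for the sketch; RCU protection of the callchain buffers and error paths are elided):

    /*
     * Sketch only: emitting one sample once the .sample argument is
     * gone.  perf_output_begin() now just reserves buffer space; the
     * wakeup_events accounting runs inside perf_output_sample().
     */
    static void output_sample_sketch(struct perf_event *event,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs)
    {
            struct perf_output_handle handle;
            struct perf_event_header header;

            perf_prepare_sample(&header, data, event, regs);

            if (perf_output_begin(&handle, event, header.size))
                    return;         /* no room in the ring buffer */

            perf_output_sample(&handle, &header, data, event);
            perf_output_end(&handle);
    }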
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-2crpvsx3cqu67q3zqjbnlpsc@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 +- include/linux/perf_event.h | 4 +--- kernel/events/core.c | 26 ++++++++++++++++++++------ kernel/events/ring_buffer.c | 19 +------------------ 4 files changed, 23 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 0941f93f294..1b1ef3addcf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, &regs); - if (perf_output_begin(&handle, event, header.size * (top - at), 1)) + if (perf_output_begin(&handle, event, header.size * (top - at))) return 1; for (; at < top; at++) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0946a8bc098..771b0b2845e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -925,7 +925,6 @@ struct perf_output_handle { unsigned long size; void *addr; int page; - int sample; }; #ifdef CONFIG_PERF_EVENTS @@ -1117,8 +1116,7 @@ extern void perf_bp_event(struct perf_event *event, void *data); #endif extern int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int sample); + struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); diff --git a/kernel/events/core.c b/kernel/events/core.c index dbd1ca75bd3..81de28dcca8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3928,6 +3928,20 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, raw); } } + + if (!event->attr.watermark) { + int wakeup_events = event->attr.wakeup_events; + + if (wakeup_events) { + struct ring_buffer *rb = handle->rb; + int events = local_inc_return(&rb->events); + + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -3984,7 +3998,7 @@ static void perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size, 1)) + if (perf_output_begin(&handle, event, header.size)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -4024,7 +4038,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size, 0); + ret = perf_output_begin(&handle, event, read_event.header.size); if (ret) return; @@ -4067,7 +4081,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - task_event->event_id.header.size, 0); + task_event->event_id.header.size); if (ret) goto out; @@ -4204,7 +4218,7 @@ static void perf_event_comm_output(struct perf_event *event, perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - comm_event->event_id.header.size, 0); + comm_event->event_id.header.size); if (ret) goto out; @@ -4351,7 +4365,7 @@ static void perf_event_mmap_output(struct perf_event
*event, perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - mmap_event->event_id.header.size, 0); + mmap_event->event_id.header.size); if (ret) goto out; @@ -4546,7 +4560,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, event, - throttle_event.header.size, 0); + throttle_event.header.size); if (ret) return; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 8b3b73630fa..a2a29205cc0 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -98,8 +98,7 @@ out: } int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int sample) + struct perf_event *event, unsigned int size) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -124,7 +123,6 @@ int perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; - handle->sample = sample; if (!rb->nr_pages) goto out; @@ -192,21 +190,6 @@ void perf_output_copy(struct perf_output_handle *handle, void perf_output_end(struct perf_output_handle *handle) { - struct perf_event *event = handle->event; - struct ring_buffer *rb = handle->rb; - - if (handle->sample && !event->attr.watermark) { - int wakeup_events = event->attr.wakeup_events; - - if (wakeup_events) { - int events = local_inc_return(&rb->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &rb->events); - local_inc(&rb->wakeup); - } - } - } - perf_output_put_handle(handle); rcu_read_unlock(); } -- cgit v1.2.3-70-g09d2 From 4dc0da86967d5463708631d02a70cfed5b104884 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:35 +0300 Subject: perf: Add context field to perf_event The perf_event overflow handler does not receive any caller-derived argument, so many callers need to resort to looking up the perf_event in their local data structure. This is ugly and doesn't scale if a single callback services many perf_events. Fix by adding a context parameter to perf_event_create_kernel_counter() (and the derived hardware breakpoint APIs) and storing it in the perf_event. The field can be accessed from the callback as event->overflow_handler_context. All callers are updated.
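To illustrate the new interface, the sketch below threads module-local state through to an overflow callback. The my_* names are hypothetical; only the perf calls and the overflow_handler_context field come from this patch:

    /* Hypothetical caller-side state, handed to perf as the context. */
    struct my_state {
            atomic_t overflows;
    };

    static void my_overflow_handler(struct perf_event *event,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
    {
            /* No global lookup needed any more: */
            struct my_state *st = event->overflow_handler_context;

            atomic_inc(&st->overflows);
    }

    static struct perf_event *my_counter_create(struct perf_event_attr *attr,
                                                int cpu, struct my_state *st)
    {
            /* The new final argument is stored in the event and later
             * retrieved by the callback above (NULL if unused). */
            return perf_event_create_kernel_counter(attr, cpu, NULL,
                                                    my_overflow_handler, st);
    }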
Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-2-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/arm/kernel/ptrace.c | 3 ++- arch/powerpc/kernel/ptrace.c | 2 +- arch/sh/kernel/ptrace_32.c | 3 ++- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/ptrace.c | 3 ++- drivers/oprofile/oprofile_perf.c | 2 +- include/linux/hw_breakpoint.h | 10 ++++++++-- include/linux/perf_event.h | 4 +++- kernel/events/core.c | 21 +++++++++++++++------ kernel/events/hw_breakpoint.c | 10 +++++++--- kernel/watchdog.c | 2 +- samples/hw_breakpoint/data_breakpoint.c | 2 +- 12 files changed, 44 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 0c9b1054f79..5c199610719 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -479,7 +479,8 @@ static struct perf_event *ptrace_hbp_create(struct task_struct *tsk, int type) attr.bp_type = type; attr.disabled = 1; - return register_user_hw_breakpoint(&attr, ptrace_hbptriggered, tsk); + return register_user_hw_breakpoint(&attr, ptrace_hbptriggered, NULL, + tsk); } static int ptrace_gethbpregs(struct task_struct *tsk, long num, diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 3177617af2e..05b7dd217f6 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -973,7 +973,7 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, &attr.bp_type); thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, - ptrace_triggered, task); + ptrace_triggered, NULL, task); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; ptrace_put_breakpoints(task); diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 8051976100a..92b3c276339 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -91,7 +91,8 @@ static int set_single_step(struct task_struct *tsk, unsigned long addr) attr.bp_len = HW_BREAKPOINT_LEN_2; attr.bp_type = HW_BREAKPOINT_R; - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); if (IS_ERR(bp)) return PTR_ERR(bp); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 98da6a7b5e8..00354d4919a 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -638,7 +638,7 @@ void kgdb_arch_late(void) for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; - breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); if (IS_ERR((void * __force)breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 11db2e9b860..82528799c5d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); /* * CHECKME: the previous code returned -EIO if the addr wasn't diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c index 9046f7b2ed7..59acf9ef78a 100644 --- a/drivers/oprofile/oprofile_perf.c +++ b/drivers/oprofile/oprofile_perf.c @@ -79,7 +79,7 @@ static int op_create_counter(int cpu, int event) pevent = 
perf_event_create_kernel_counter(&counter_config[event].attr, cpu, NULL, - op_overflow_handler); + op_overflow_handler, NULL); if (IS_ERR(pevent)) return PTR_ERR(pevent); diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index d1e55fed2c7..6ae9c631a1b 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -73,6 +73,7 @@ static inline unsigned long hw_breakpoint_len(struct perf_event *bp) extern struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, struct task_struct *tsk); /* FIXME: only change from the attr, and don't unregister */ @@ -85,11 +86,13 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); extern struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, int cpu); extern struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered); + perf_overflow_handler_t triggered, + void *context); extern int register_perf_hw_breakpoint(struct perf_event *bp); extern int __register_perf_hw_breakpoint(struct perf_event *bp); @@ -115,6 +118,7 @@ static inline int __init init_hw_breakpoint(void) { return 0; } static inline struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, struct task_struct *tsk) { return NULL; } static inline int modify_user_hw_breakpoint(struct perf_event *bp, @@ -122,10 +126,12 @@ modify_user_hw_breakpoint(struct perf_event *bp, static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, int cpu) { return NULL; } static inline struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered) { return NULL; } + perf_overflow_handler_t triggered, + void *context) { return NULL; } static inline int register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline int diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a5f54b973bd..2a08cacb162 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -839,6 +839,7 @@ struct perf_event { u64 id; perf_overflow_handler_t overflow_handler; + void *overflow_handler_context; #ifdef CONFIG_EVENT_TRACING struct ftrace_event_call *tp_event; @@ -960,7 +961,8 @@ extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, - perf_overflow_handler_t callback); + perf_overflow_handler_t callback, + void *context); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); diff --git a/kernel/events/core.c b/kernel/events/core.c index 81de28dcca8..ba8e0f4a20e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5745,7 +5745,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler) + perf_overflow_handler_t overflow_handler, + void *context) { struct pmu *pmu; struct perf_event *event; @@ -5803,10 +5804,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, #endif } - if (!overflow_handler && parent_event) + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; + context = 
parent_event->overflow_handler_context; + } event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -6073,7 +6077,8 @@ SYSCALL_DEFINE5(perf_event_open, } } - event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, + NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; @@ -6258,7 +6263,8 @@ err_fd: struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, - perf_overflow_handler_t overflow_handler) + perf_overflow_handler_t overflow_handler, + void *context) { struct perf_event_context *ctx; struct perf_event *event; @@ -6268,7 +6274,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, * Get the target context (task or percpu): */ - event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, + overflow_handler, context); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; @@ -6552,7 +6559,7 @@ inherit_event(struct perf_event *parent_event, parent_event->cpu, child, group_leader, parent_event, - NULL); + NULL, NULL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -6579,6 +6586,8 @@ inherit_event(struct perf_event *parent_event, child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; + child_event->overflow_handler_context + = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 086adf25a55..b7971d6f38b 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp) struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, + void *context, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, tsk, triggered); + return perf_event_create_kernel_counter(attr, -1, tsk, triggered, + context); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); @@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); */ struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered) + perf_overflow_handler_t triggered, + void *context) { struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; @@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, + triggered, context); *pevent = bp; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6708e677a0..a933e3a0398 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -375,7 +375,7 @@ static int watchdog_nmi_enable(int cpu) hw_nmi_watchdog_set_attr(wd_attr); /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); goto out_save; diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c 
index 7b164d3200f..ef7f3229185 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -60,7 +60,7 @@ static int __init hw_break_module_init(void) attr.bp_len = HW_BREAKPOINT_LEN_4; attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; - sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler); + sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL); if (IS_ERR((void __force *)sample_hbp)) { ret = PTR_ERR((void __force *)sample_hbp); goto fail; -- cgit v1.2.3-70-g09d2 From 26ca5c11fb45ae2b2ac7e3574b8db6b3a3c7d350 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:37 +0300 Subject: perf: export perf_event_refresh() to modules KVM needs one-shot samples, since a PMC programmed to -X will fire after X events and then again after 2^40 events (i.e. variable period). Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-4-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2a08cacb162..3f2711ccf91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -955,6 +955,7 @@ extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); +extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * @@ -1149,6 +1150,10 @@ static inline void perf_event_delayed_put(struct task_struct *task) { } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } +static inline int perf_event_refresh(struct perf_event *event, int refresh) +{ + return -EINVAL; +} static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index ba8e0f4a20e..0567e32d71a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1764,7 +1764,7 @@ out: raw_spin_unlock_irq(&ctx->lock); } -static int perf_event_refresh(struct perf_event *event, int refresh) +int perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events @@ -1777,6 +1777,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } +EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, -- cgit v1.2.3-70-g09d2 From f721889ff65afa6243c463832c74dee3bed418d5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 1 Jul 2011 22:12:45 +0200 Subject: PM / Domains: Support for generic I/O PM domains (v8) Introduce common headers, helper functions and callbacks allowing platforms to use simple generic power domains for runtime power management. Introduce struct generic_pm_domain to be used for representing power domains that each contain a number of devices and may be parent domains or subdomains with respect to other power domains. 
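A rough platform-side sketch of one such domain follows. The foo_* names are hypothetical, the callback signatures are inferred from their call sites in domain.c below, and pm_genpd_init() is assumed to be the initialization helper this patch adds:

    /* Hypothetical platform code filling in a generic PM domain. */
    static int foo_stop_device(struct device *dev)
    {
            /* e.g. gate the device's clocks */
            return 0;
    }

    static int foo_power_off(struct generic_pm_domain *genpd)
    {
            /* remove power from the whole domain */
            return 0;
    }

    static struct generic_pm_domain foo_pd = {
            .stop_device    = foo_stop_device,
            .power_off      = foo_power_off,
            /* .start_device and .power_on would restore clocks/power */
    };

    static void foo_pd_setup(void)
    {
            /* No governor, domain initially powered on (assumed
             * pm_genpd_init() signature). */
            pm_genpd_init(&foo_pd, NULL, false);
    }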
Among other things, this structure includes callbacks to be provided by platforms for performing specific tasks related to power management (i.e. ->stop_device() may disable a device's clocks, while ->start_device() may enable them, ->power_off() is supposed to remove power from the entire power domain and ->power_on() is supposed to restore it). Introduce functions that can be used as power domain runtime PM callbacks, pm_genpd_runtime_suspend() and pm_genpd_runtime_resume(), as well as helper functions for the initialization of a power domain represented by a struct generic_pm_domain object, adding a device to or removing a device from it and adding or removing subdomains. Introduce configuration option CONFIG_PM_GENERIC_DOMAINS to be selected by the platforms that want to use the new code. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Reviewed-by: Kevin Hilman --- drivers/base/power/Makefile | 1 + drivers/base/power/domain.c | 494 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_domain.h | 78 +++++++ kernel/power/Kconfig | 4 + 4 files changed, 577 insertions(+) create mode 100644 drivers/base/power/domain.c create mode 100644 include/linux/pm_domain.h (limited to 'kernel') diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 3647e114d0e..2639ae79a37 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_PM_SLEEP) += main.o wakeup.o obj-$(CONFIG_PM_RUNTIME) += runtime.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o obj-$(CONFIG_PM_OPP) += opp.o +obj-$(CONFIG_PM_GENERIC_DOMAINS) += domain.o obj-$(CONFIG_HAVE_CLK) += clock_ops.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG \ No newline at end of file diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c new file mode 100644 index 00000000000..fd31be3be40 --- /dev/null +++ b/drivers/base/power/domain.c @@ -0,0 +1,494 @@ +/* + * drivers/base/power/domain.c - Common code related to device power domains. + * + * Copyright (C) 2011 Rafael J. Wysocki , Renesas Electronics Corp. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PM_RUNTIME + +static void genpd_sd_counter_dec(struct generic_pm_domain *genpd) +{ + if (!WARN_ON(genpd->sd_count == 0)) + genpd->sd_count--; +} + +/** + * __pm_genpd_save_device - Save the pre-suspend state of a device. + * @dle: Device list entry of the device to save the state of. + * @genpd: PM domain the device belongs to. + */ +static int __pm_genpd_save_device(struct dev_list_entry *dle, + struct generic_pm_domain *genpd) +{ + struct device *dev = dle->dev; + struct device_driver *drv = dev->driver; + int ret = 0; + + if (dle->need_restore) + return 0; + + if (drv && drv->pm && drv->pm->runtime_suspend) { + if (genpd->start_device) + genpd->start_device(dev); + + ret = drv->pm->runtime_suspend(dev); + + if (genpd->stop_device) + genpd->stop_device(dev); + } + + if (!ret) + dle->need_restore = true; + + return ret; +} + +/** + * __pm_genpd_restore_device - Restore the pre-suspend state of a device. + * @dle: Device list entry of the device to restore the state of. + * @genpd: PM domain the device belongs to.
+ */ +static void __pm_genpd_restore_device(struct dev_list_entry *dle, + struct generic_pm_domain *genpd) +{ + struct device *dev = dle->dev; + struct device_driver *drv = dev->driver; + + if (!dle->need_restore) + return; + + if (drv && drv->pm && drv->pm->runtime_resume) { + if (genpd->start_device) + genpd->start_device(dev); + + drv->pm->runtime_resume(dev); + + if (genpd->stop_device) + genpd->stop_device(dev); + } + + dle->need_restore = false; +} + +/** + * pm_genpd_poweroff - Remove power from a given PM domain. + * @genpd: PM domain to power down. + * + * If all of the @genpd's devices have been suspended and all of its subdomains + * have been powered down, run the runtime suspend callbacks provided by all of + * the @genpd's devices' drivers and remove power from @genpd. + */ +static int pm_genpd_poweroff(struct generic_pm_domain *genpd) +{ + struct generic_pm_domain *parent; + struct dev_list_entry *dle; + unsigned int not_suspended; + int ret; + + if (genpd->power_is_off) + return 0; + + if (genpd->sd_count > 0) + return -EBUSY; + + not_suspended = 0; + list_for_each_entry(dle, &genpd->dev_list, node) + if (dle->dev->driver && !pm_runtime_suspended(dle->dev)) + not_suspended++; + + if (not_suspended > genpd->in_progress) + return -EBUSY; + + if (genpd->gov && genpd->gov->power_down_ok) { + if (!genpd->gov->power_down_ok(&genpd->domain)) + return -EAGAIN; + } + + list_for_each_entry_reverse(dle, &genpd->dev_list, node) { + ret = __pm_genpd_save_device(dle, genpd); + if (ret) + goto err_dev; + } + + if (genpd->power_off) + genpd->power_off(genpd); + + genpd->power_is_off = true; + + parent = genpd->parent; + if (parent) { + genpd_sd_counter_dec(parent); + if (parent->sd_count == 0) + queue_work(pm_wq, &parent->power_off_work); + } + + return 0; + + err_dev: + list_for_each_entry_continue(dle, &genpd->dev_list, node) + __pm_genpd_restore_device(dle, genpd); + + return ret; +} + +/** + * genpd_power_off_work_fn - Power off PM domain whose subdomain count is 0. + * @work: Work structure used for scheduling the execution of this function. + */ +static void genpd_power_off_work_fn(struct work_struct *work) +{ + struct generic_pm_domain *genpd; + + genpd = container_of(work, struct generic_pm_domain, power_off_work); + + if (genpd->parent) + mutex_lock(&genpd->parent->lock); + mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); + pm_genpd_poweroff(genpd); + mutex_unlock(&genpd->lock); + if (genpd->parent) + mutex_unlock(&genpd->parent->lock); +} + +/** + * pm_genpd_runtime_suspend - Suspend a device belonging to I/O PM domain. + * @dev: Device to suspend. + * + * Carry out a runtime suspend of a device under the assumption that its + * pm_domain field points to the domain member of an object of type + * struct generic_pm_domain representing a PM domain consisting of I/O devices. 
+ */ +static int pm_genpd_runtime_suspend(struct device *dev) +{ + struct generic_pm_domain *genpd; + + dev_dbg(dev, "%s()\n", __func__); + + if (IS_ERR_OR_NULL(dev->pm_domain)) + return -EINVAL; + + genpd = container_of(dev->pm_domain, struct generic_pm_domain, domain); + + if (genpd->parent) + mutex_lock(&genpd->parent->lock); + mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); + + if (genpd->stop_device) { + int ret = genpd->stop_device(dev); + if (ret) + goto out; + } + genpd->in_progress++; + pm_genpd_poweroff(genpd); + genpd->in_progress--; + + out: + mutex_unlock(&genpd->lock); + if (genpd->parent) + mutex_unlock(&genpd->parent->lock); + + return 0; +} + +/** + * pm_genpd_poweron - Restore power to a given PM domain and its parents. + * @genpd: PM domain to power up. + * + * Restore power to @genpd and all of its parents so that it is possible to + * resume a device belonging to it. + */ +static int pm_genpd_poweron(struct generic_pm_domain *genpd) +{ + int ret = 0; + + start: + if (genpd->parent) + mutex_lock(&genpd->parent->lock); + mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); + + if (!genpd->power_is_off) + goto out; + + if (genpd->parent && genpd->parent->power_is_off) { + mutex_unlock(&genpd->lock); + mutex_unlock(&genpd->parent->lock); + + ret = pm_genpd_poweron(genpd->parent); + if (ret) + return ret; + + goto start; + } + + if (genpd->power_on) { + ret = genpd->power_on(genpd); + if (ret) + goto out; + } + + genpd->power_is_off = false; + if (genpd->parent) + genpd->parent->sd_count++; + + out: + mutex_unlock(&genpd->lock); + if (genpd->parent) + mutex_unlock(&genpd->parent->lock); + + return ret; +} + +/** + * pm_genpd_runtime_resume - Resume a device belonging to I/O PM domain. + * @dev: Device to resume. + * + * Carry out a runtime resume of a device under the assumption that its + * pm_domain field points to the domain member of an object of type + * struct generic_pm_domain representing a PM domain consisting of I/O devices. + */ +static int pm_genpd_runtime_resume(struct device *dev) +{ + struct generic_pm_domain *genpd; + struct dev_list_entry *dle; + int ret; + + dev_dbg(dev, "%s()\n", __func__); + + if (IS_ERR_OR_NULL(dev->pm_domain)) + return -EINVAL; + + genpd = container_of(dev->pm_domain, struct generic_pm_domain, domain); + + ret = pm_genpd_poweron(genpd); + if (ret) + return ret; + + mutex_lock(&genpd->lock); + + list_for_each_entry(dle, &genpd->dev_list, node) { + if (dle->dev == dev) { + __pm_genpd_restore_device(dle, genpd); + break; + } + } + + if (genpd->start_device) + genpd->start_device(dev); + + mutex_unlock(&genpd->lock); + + return 0; +} + +#else + +static inline void genpd_power_off_work_fn(struct work_struct *work) {} + +#define pm_genpd_runtime_suspend NULL +#define pm_genpd_runtime_resume NULL + +#endif /* CONFIG_PM_RUNTIME */ + +/** + * pm_genpd_add_device - Add a device to an I/O PM domain. + * @genpd: PM domain to add the device to. + * @dev: Device to be added.
+ */ +int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev) +{ + struct dev_list_entry *dle; + int ret = 0; + + dev_dbg(dev, "%s()\n", __func__); + + if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev)) + return -EINVAL; + + mutex_lock(&genpd->lock); + + if (genpd->power_is_off) { + ret = -EINVAL; + goto out; + } + + list_for_each_entry(dle, &genpd->dev_list, node) + if (dle->dev == dev) { + ret = -EINVAL; + goto out; + } + + dle = kzalloc(sizeof(*dle), GFP_KERNEL); + if (!dle) { + ret = -ENOMEM; + goto out; + } + + dle->dev = dev; + dle->need_restore = false; + list_add_tail(&dle->node, &genpd->dev_list); + + spin_lock_irq(&dev->power.lock); + dev->pm_domain = &genpd->domain; + spin_unlock_irq(&dev->power.lock); + + out: + mutex_unlock(&genpd->lock); + + return ret; +} + +/** + * pm_genpd_remove_device - Remove a device from an I/O PM domain. + * @genpd: PM domain to remove the device from. + * @dev: Device to be removed. + */ +int pm_genpd_remove_device(struct generic_pm_domain *genpd, + struct device *dev) +{ + struct dev_list_entry *dle; + int ret = -EINVAL; + + dev_dbg(dev, "%s()\n", __func__); + + if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev)) + return -EINVAL; + + mutex_lock(&genpd->lock); + + list_for_each_entry(dle, &genpd->dev_list, node) { + if (dle->dev != dev) + continue; + + spin_lock_irq(&dev->power.lock); + dev->pm_domain = NULL; + spin_unlock_irq(&dev->power.lock); + + list_del(&dle->node); + kfree(dle); + + ret = 0; + break; + } + + mutex_unlock(&genpd->lock); + + return ret; +} + +/** + * pm_genpd_add_subdomain - Add a subdomain to an I/O PM domain. + * @genpd: Master PM domain to add the subdomain to. + * @new_subdomain: Subdomain to be added. + */ +int pm_genpd_add_subdomain(struct generic_pm_domain *genpd, + struct generic_pm_domain *new_subdomain) +{ + struct generic_pm_domain *subdomain; + int ret = 0; + + if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(new_subdomain)) + return -EINVAL; + + mutex_lock(&genpd->lock); + + if (genpd->power_is_off && !new_subdomain->power_is_off) { + ret = -EINVAL; + goto out; + } + + list_for_each_entry(subdomain, &genpd->sd_list, sd_node) { + if (subdomain == new_subdomain) { + ret = -EINVAL; + goto out; + } + } + + mutex_lock_nested(&new_subdomain->lock, SINGLE_DEPTH_NESTING); + + list_add_tail(&new_subdomain->sd_node, &genpd->sd_list); + new_subdomain->parent = genpd; + if (!new_subdomain->power_is_off) + genpd->sd_count++; + + mutex_unlock(&new_subdomain->lock); + + out: + mutex_unlock(&genpd->lock); + + return ret; +} + +/** + * pm_genpd_remove_subdomain - Remove a subdomain from an I/O PM domain. + * @genpd: Master PM domain to remove the subdomain from. + * @target: Subdomain to be removed. + */ +int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, + struct generic_pm_domain *target) +{ + struct generic_pm_domain *subdomain; + int ret = -EINVAL; + + if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(target)) + return -EINVAL; + + mutex_lock(&genpd->lock); + + list_for_each_entry(subdomain, &genpd->sd_list, sd_node) { + if (subdomain != target) + continue; + + mutex_lock_nested(&subdomain->lock, SINGLE_DEPTH_NESTING); + + list_del(&subdomain->sd_node); + subdomain->parent = NULL; + if (!subdomain->power_is_off) + genpd_sd_counter_dec(genpd); + + mutex_unlock(&subdomain->lock); + + ret = 0; + break; + } + + mutex_unlock(&genpd->lock); + + return ret; +} + +/** + * pm_genpd_init - Initialize a generic I/O PM domain object. + * @genpd: PM domain object to initialize.
+ * @gov: PM domain governor to associate with the domain (may be NULL). + * @is_off: Initial value of the domain's power_is_off field. + */ +void pm_genpd_init(struct generic_pm_domain *genpd, + struct dev_power_governor *gov, bool is_off) +{ + if (IS_ERR_OR_NULL(genpd)) + return; + + INIT_LIST_HEAD(&genpd->sd_node); + genpd->parent = NULL; + INIT_LIST_HEAD(&genpd->dev_list); + INIT_LIST_HEAD(&genpd->sd_list); + mutex_init(&genpd->lock); + genpd->gov = gov; + INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); + genpd->in_progress = 0; + genpd->sd_count = 0; + genpd->power_is_off = is_off; + genpd->domain.ops.runtime_suspend = pm_genpd_runtime_suspend; + genpd->domain.ops.runtime_resume = pm_genpd_runtime_resume; + genpd->domain.ops.runtime_idle = pm_generic_runtime_idle; +} diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h new file mode 100644 index 00000000000..b1a22c65380 --- /dev/null +++ b/include/linux/pm_domain.h @@ -0,0 +1,78 @@ +/* + * pm_domain.h - Definitions and headers related to device power domains. + * + * Copyright (C) 2011 Rafael J. Wysocki , Renesas Electronics Corp. + * + * This file is released under the GPLv2. + */ + +#ifndef _LINUX_PM_DOMAIN_H +#define _LINUX_PM_DOMAIN_H + +#include + +struct dev_power_governor { + bool (*power_down_ok)(struct dev_pm_domain *domain); +}; + +struct generic_pm_domain { + struct dev_pm_domain domain; /* PM domain operations */ + struct list_head sd_node; /* Node in the parent's subdomain list */ + struct generic_pm_domain *parent; /* Parent PM domain */ + struct list_head sd_list; /* List of subdomains */ + struct list_head dev_list; /* List of devices */ + struct mutex lock; + struct dev_power_governor *gov; + struct work_struct power_off_work; + unsigned int in_progress; /* Number of devices being suspended now */ + unsigned int sd_count; /* Number of subdomains with power "on" */ + bool power_is_off; /* Whether or not power has been removed */ + int (*power_off)(struct generic_pm_domain *domain); + int (*power_on)(struct generic_pm_domain *domain); + int (*start_device)(struct device *dev); + int (*stop_device)(struct device *dev); +}; + +struct dev_list_entry { + struct list_head node; + struct device *dev; + bool need_restore; +}; + +#ifdef CONFIG_PM_GENERIC_DOMAINS +extern int pm_genpd_add_device(struct generic_pm_domain *genpd, + struct device *dev); +extern int pm_genpd_remove_device(struct generic_pm_domain *genpd, + struct device *dev); +extern int pm_genpd_add_subdomain(struct generic_pm_domain *genpd, + struct generic_pm_domain *new_subdomain); +extern int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, + struct generic_pm_domain *target); +extern void pm_genpd_init(struct generic_pm_domain *genpd, + struct dev_power_governor *gov, bool is_off); +#else +static inline int pm_genpd_add_device(struct generic_pm_domain *genpd, + struct device *dev) +{ + return -ENOSYS; +} +static inline int pm_genpd_remove_device(struct generic_pm_domain *genpd, + struct device *dev) +{ + return -ENOSYS; +} +static inline int pm_genpd_add_subdomain(struct generic_pm_domain *genpd, + struct generic_pm_domain *new_sd) +{ + return -ENOSYS; +} +static inline int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, + struct generic_pm_domain *target) +{ + return -ENOSYS; +} +static inline void pm_genpd_init(struct generic_pm_domain *genpd, + struct dev_power_governor *gov, bool is_off) {} +#endif + +#endif /* _LINUX_PM_DOMAIN_H */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index
87f4d24b55b..e83ac2556c8 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -227,3 +227,7 @@ config PM_OPP config PM_RUNTIME_CLK def_bool y depends on PM_RUNTIME && HAVE_CLK + +config PM_GENERIC_DOMAINS + bool + depends on PM -- cgit v1.2.3-70-g09d2 From b7b95920aa2e89e655afe9913ee0e55855ceda90 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 1 Jul 2011 22:13:37 +0200 Subject: PM: Allow the clocks management code to be used during system suspend The common clocks management code in drivers/base/power/clock_ops.c is going to be used during system-wide power transitions as well as for runtime PM, so it shouldn't depend on CONFIG_PM_RUNTIME. However, the suspend/resume functions provided by it for CONFIG_PM_RUNTIME unset, to be used during system-wide power transitions, should not behave in the same way as their counterparts defined for CONFIG_PM_RUNTIME set, because in that case the clocks are managed differently at run time. The names of the functions still contain the word "runtime" after this change, but that is going to be modified by a separate patch later. Signed-off-by: Rafael J. Wysocki Reviewed-by: Kevin Hilman --- drivers/base/power/clock_ops.c | 60 +++++++++++++++++++++++++++++++++++++++++- include/linux/pm_runtime.h | 2 +- kernel/power/Kconfig | 4 +-- 3 files changed, 62 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c index c5624818259..2fb9c121c64 100644 --- a/drivers/base/power/clock_ops.c +++ b/drivers/base/power/clock_ops.c @@ -15,7 +15,7 @@ #include #include -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM struct pm_runtime_clk_data { struct list_head clock_list; @@ -191,6 +191,10 @@ void pm_runtime_clk_destroy(struct device *dev) kfree(prd); } +#endif /* CONFIG_PM */ + +#ifdef CONFIG_PM_RUNTIME + /** * pm_runtime_clk_acquire - Acquire a device clock. * @dev: Device whose clock is to be acquired. @@ -330,6 +334,60 @@ static int pm_runtime_clk_notify(struct notifier_block *nb, #else /* !CONFIG_PM_RUNTIME */ +#ifdef CONFIG_PM + +/** + * pm_runtime_clk_suspend - Disable clocks in a device's PM clock list. + * @dev: Device to disable the clocks for. + */ +int pm_runtime_clk_suspend(struct device *dev) +{ + struct pm_runtime_clk_data *prd = __to_prd(dev); + struct pm_clock_entry *ce; + + dev_dbg(dev, "%s()\n", __func__); + + /* If there is no driver, the clocks are already disabled. */ + if (!prd || !dev->driver) + return 0; + + mutex_lock(&prd->lock); + + list_for_each_entry_reverse(ce, &prd->clock_list, node) + clk_disable(ce->clk); + + mutex_unlock(&prd->lock); + + return 0; +} + +/** + * pm_runtime_clk_resume - Enable clocks in a device's PM clock list. + * @dev: Device to enable the clocks for. + */ +int pm_runtime_clk_resume(struct device *dev) +{ + struct pm_runtime_clk_data *prd = __to_prd(dev); + struct pm_clock_entry *ce; + + dev_dbg(dev, "%s()\n", __func__); + + /* If there is no driver, the clocks should remain disabled. */ + if (!prd || !dev->driver) + return 0; + + mutex_lock(&prd->lock); + + list_for_each_entry(ce, &prd->clock_list, node) + clk_enable(ce->clk); + + mutex_unlock(&prd->lock); + + return 0; +} + +#endif /* CONFIG_PM */ + /** * enable_clock - Enable a device clock. * @dev: Device whose clock is to be enabled. 
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index ef91904c711..1bd5063a2cc 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -251,7 +251,7 @@ struct pm_clk_notifier_block { char *con_ids[]; }; -#ifdef CONFIG_PM_RUNTIME_CLK +#ifdef CONFIG_PM_CLK extern int pm_runtime_clk_init(struct device *dev); extern void pm_runtime_clk_destroy(struct device *dev); extern int pm_runtime_clk_add(struct device *dev, const char *con_id); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e83ac2556c8..7b856b3458d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -224,9 +224,9 @@ config PM_OPP implementations a ready to use framework to manage OPPs. For more information, read -config PM_RUNTIME_CLK +config PM_CLK def_bool y - depends on PM_RUNTIME && HAVE_CLK + depends on PM && HAVE_CLK config PM_GENERIC_DOMAINS bool -- cgit v1.2.3-70-g09d2 From e9dbfae53eeb9fc3d4bb7da3df87fa9875f5da02 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 11:36:06 -0400 Subject: tracing: Fix bug when reading system filters on module removal The event system is freed when its nr_events is set to zero. This happens when a module created an event system and then later the module is removed. Modules may share systems, so the system is allocated when it is created and freed when the modules are unloaded and all the events under the system are removed (nr_events set to zero). The problem arises when a task opened the "filter" file for the system. If the module is unloaded and it removed the last event for that system, the system structure is freed. If the task that opened the filter file accesses the "filter" file after the system has been freed, the system will access an invalid pointer. By adding a ref_count, and using it to keep track of what is using the event system, we can free it after all users are finished with the event system. Cc: Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 86 +++++++++++++++++++++++++++++++++----- kernel/trace/trace_events_filter.c | 6 +++ 3 files changed, 82 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f8591f61..f8074072d11 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,6 +677,7 @@ struct event_subsystem { struct dentry *entry; struct event_filter *filter; int nr_events; + int ref_count; }; #define FILTER_PRED_INVALID ((unsigned short)-1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec399f2a..ffc5b2884af 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -244,6 +244,35 @@ static void ftrace_clear_events(void) mutex_unlock(&event_mutex); } +static void __put_system(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + + WARN_ON_ONCE(system->ref_count == 0); + if (--system->ref_count) + return; + + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ + WARN_ON_ONCE(system->ref_count == 0); + system->ref_count++; +} + +static void put_system(struct event_subsystem *system) +{ + mutex_lock(&event_mutex); + __put_system(system); + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ @@ -826,6 +855,47 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ + struct event_subsystem *system = NULL; + int ret; + + /* Make sure the system still exists */ + mutex_lock(&event_mutex); + list_for_each_entry(system, &event_subsystems, list) { + if (system == inode->i_private) { + /* Don't open systems with no events */ + if (!system->nr_events) { + system = NULL; + break; + } + __get_system(system); + break; + } + } + mutex_unlock(&event_mutex); + + if (system != inode->i_private) + return -ENODEV; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + put_system(system); + + return ret; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ + struct event_subsystem *system = inode->i_private; + + put_system(system); + + return 0; +} + static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -963,10 +1033,11 @@ static const struct file_operations ftrace_event_filter_fops = { }; static const struct file_operations ftrace_subsystem_filter_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { @@ -1002,8 +1073,6 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -static LIST_HEAD(event_subsystems); - static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { @@ -1013,6 +1082,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { + __get_system(system); system->nr_events++; return system->entry; } @@ -1035,6 +1105,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } system->nr_events = 1; + system->ref_count = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -1184,16 +1255,9 @@ static void remove_subsystem_dir(const char *name) list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { if (!--system->nr_events) { - struct event_filter *filter = system->filter; - debugfs_remove_recursive(system->entry); list_del(&system->list); - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); + __put_system(system); } break; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8008ddcfbf2..256764ecccd 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); + /* Make sure the system still has events */ + if (!system->nr_events) { + err = -ENODEV; + goto out_unlock; + } + if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); -- cgit v1.2.3-70-g09d2 From 40ee4dffff061399eb9358e0c8fcfbaf8de4c8fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 14:32:51 -0400 Subject: tracing: Have "enable" file use refcounts like the "filter" file The "enable" file for the event system can be removed when a module is unloaded and the event system only has events 
from that module. As the event system nr_events count goes to zero, it may be freed if its ref_count is also set to zero. Like the "filter" file, the "enable" file may be opened by a task and referenced later, after a module has been unloaded and the events for that event system have been removed. Although the "filter" file referenced the event system structure, the "enable" file only references a pointer to the event system name. Since the name is freed when the event system is removed, it is possible that an access to the "enable" file may reference a freed pointer. Update the "enable" file to use the subsystem_open() routine that the "filter" file uses, to keep a reference to the event system structure while the "enable" file is opened. Cc: Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ffc5b2884af..3e2a7c91c54 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -557,7 +557,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; int set = 0; @@ -568,7 +568,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (!call->name || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system) != 0) + if (system && strcmp(call->class->system, system->name) != 0) continue; /* @@ -598,7 +598,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; + const char *name = NULL; unsigned long val; char buf[64]; ssize_t ret; @@ -622,7 +623,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; - ret = __ftrace_set_clr_event(NULL, system, NULL, val); + /* + * Opening of "enable" adds a ref count to system, + * so the name is safe to use. 
+ */ + if (system) + name = system->name; + + ret = __ftrace_set_clr_event(NULL, name, NULL, val); if (ret) goto out; @@ -862,6 +870,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct event_subsystem *system = NULL; int ret; + if (!inode->i_private) + goto skip_search; + /* Make sure the system still exists */ mutex_lock(&event_mutex); list_for_each_entry(system, &event_subsystems, list) { @@ -880,8 +891,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (system != inode->i_private) return -ENODEV; + skip_search: ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0 && system) put_system(system); return ret; @@ -891,7 +903,8 @@ static int subsystem_release(struct inode *inode, struct file *file) { struct event_subsystem *system = inode->i_private; - put_system(system); + if (system) + put_system(system); return 0; } @@ -1041,10 +1054,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { }; static const struct file_operations ftrace_system_enable_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_show_header_fops = { @@ -1133,8 +1147,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, - (void *)system->name, + trace_create_file("enable", 0644, system->entry, system, &ftrace_system_enable_fops); return system->entry; -- cgit v1.2.3-70-g09d2 From 43dd61c9a09bd413e837df829e6bfb42159be52a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 Jul 2011 11:09:22 -0400 Subject: ftrace: Fix regression of :mod:module function enabling The new code that allows different utilities to pick and choose what functions they trace broke the :mod: hook that allows users to trace only functions of a particular module. The reason is that the :mod: hook bypasses the hash that is setup to allow individual users to trace their own functions and uses the global hash directly. But if the global hash has not been set up, it will cause a bug: echo '*:mod:radeon' > /sys/kernel/debug/set_ftrace_filter produces: [drm:drm_mode_getfb] *ERROR* invalid framebuffer id [drm:radeon_crtc_page_flip] *ERROR* failed to reserve new rbo buffer before flip BUG: unable to handle kernel paging request at ffffffff8160ec90 IP: [] add_hash_entry+0x66/0xd0 PGD 1a05067 PUD 1a09063 PMD 80000000016001e1 Oops: 0003 [#1] SMP Jul 7 04:02:28 phyllis kernel: [55303.858604] CPU 1 Modules linked in: cryptd aes_x86_64 aes_generic binfmt_misc rfcomm bnep ip6table_filter hid radeon r8169 ahci libahci mii ttm drm_kms_helper drm video i2c_algo_bit intel_agp intel_gtt Pid: 10344, comm: bash Tainted: G WC 3.0.0-rc5 #1 Dell Inc. 
Inspiron N5010/0YXXJJ RIP: 0010:[] [] add_hash_entry+0x66/0xd0 RSP: 0018:ffff88003a96bda8 EFLAGS: 00010246 RAX: ffff8801301735c0 RBX: ffffffff8160ec80 RCX: 0000000000306ee0 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880137c92940 RBP: ffff88003a96bdb8 R08: ffff880137c95680 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff81c9df78 R13: ffff8801153d1000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f329c18a700(0000) GS:ffff880137c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff8160ec90 CR3: 000000003002b000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process bash (pid: 10344, threadinfo ffff88003a96a000, task ffff88012fcfc470) Stack: 0000000000000fd0 00000000000000fc ffff88003a96be38 ffffffff810d92f5 ffff88011c4c4e00 ffff880000000000 000000000b69f4d0 ffffffff8160ec80 ffff8800300e6f06 0000000081130295 0000000000000282 ffff8800300e6f00 Call Trace: [] match_records+0x155/0x1b0 [] ftrace_mod_callback+0xbc/0x100 [] ftrace_regex_write+0x16f/0x210 [] ftrace_filter_write+0xf/0x20 [] vfs_write+0xc8/0x190 [] sys_write+0x51/0x90 [] system_call_fastpath+0x16/0x1b Code: 48 8b 33 31 d2 48 85 f6 75 33 49 89 d4 4c 03 63 08 49 8b 14 24 48 85 d2 48 89 10 74 04 48 89 42 08 49 89 04 24 4c 89 60 08 31 d2 RIP [] add_hash_entry+0x66/0xd0 RSP CR2: ffffffff8160ec90 ---[ end trace a5d031828efdd88e ]--- Reported-by: Brian Marete Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 ++- kernel/trace/ftrace.c | 12 +++--------- kernel/trace/trace_functions.c | 3 ++- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9d88e1cb5db..ed0eb5254d1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -123,7 +123,8 @@ stack_trace_sysctl(struct ctl_table *table, int write, struct ftrace_func_command { struct list_head list; char *name; - int (*func)(char *func, char *cmd, + int (*func)(struct ftrace_hash *hash, + char *func, char *cmd, char *params, int enable); }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 908038f5744..1c4c0b087e1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2407,10 +2407,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) */ static int -ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +ftrace_mod_callback(struct ftrace_hash *hash, + char *func, char *cmd, char *param, int enable) { - struct ftrace_ops *ops = &global_ops; - struct ftrace_hash *hash; char *mod; int ret = -EINVAL; @@ -2430,11 +2429,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return ret; - if (enable) - hash = ops->filter_hash; - else - hash = ops->notrace_hash; - ret = ftrace_match_module_records(hash, func, mod); if (!ret) ret = -EINVAL; @@ -2760,7 +2754,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(func, command, next, enable); + ret = p->func(hash, func, command, next, enable); goto out_unlock; } } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8d0e1cc4e97..c7b0c6a7db0 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, 
char *param) } static int -ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; void *count = (void *)-1; -- cgit v1.2.3-70-g09d2 From e4a3f541f0b67fdad98b326c851dfe7f4b6b6dad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jun 2011 19:02:29 -0400 Subject: tracing: Still trace filtered irq functions when irq trace is disabled If a function is set to be traced by set_graph_function, but the option funcgraph-irqs is zero, and the traced function happens to be called from an interrupt, it will not be traced. The point of funcgraph-irqs is to skip tracing of interrupts that preempt the trace, not to skip functions we want to trace that happen to run *in* an irq. Luckily, the current->trace_recursion element is a perfect place for a flag that lets us trace functions within an interrupt even when we are not tracing the interrupts that preempt the trace. Reported-by: Heiko Carstens Tested-by: Heiko Carstens Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 50 ++++++++++++++++++++++++------------ kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 35 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a3e2db70807..651f35be372 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -278,6 +278,29 @@ struct tracer { }; +/* Only current can touch trace_recursion */ +#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) +#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) + +/* Ring buffer has the 10 LSB bits to count */ +#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) + +/* for function tracing recursion */ +#define TRACE_INTERNAL_BIT (1<<11) +#define TRACE_GLOBAL_BIT (1<<12) +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ +#define TRACE_IRQ_BIT (1<<13) + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + #define TRACE_PIPE_ALL_CPU -1 int tracer_init(struct tracer *t, struct trace_array *tr); @@ -516,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) + if (addr == ftrace_graph_funcs[i]) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it.
+ */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); return 1; + } } return 0; @@ -794,19 +827,4 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" -/* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) - -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) - -/* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) - #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e8d6bb55d71..a7d2a4c653d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -227,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr, static inline int ftrace_graph_ignore_irqs(void) { - if (!ftrace_graph_skip_irqs) + if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; return in_irq(); -- cgit v1.2.3-70-g09d2 From 4376cac66778b25e599be3f5d54f33f58ba8ead7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Jun 2011 23:28:13 -0400 Subject: ftrace: Do not disable interrupts for modules in mcount update When I mounted an NFS directory, it caused several modules to be loaded. At the time I was running the preemptirqsoff tracer, and it showed the following output: # tracer: preemptirqsoff # # preemptirqsoff latency trace v1.1.5 on 2.6.33.9-rt30-mrg-test # -------------------------------------------------------------------- # latency: 1177 us, #4/4, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: modprobe-19370 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # => started at: ftrace_module_notify # => ended at: ftrace_module_notify # # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| /_--=> lock-depth # |||||/ delay # cmd pid |||||| time | caller # \ / |||||| \ | / modprobe-19370 3d.... 0us!: ftrace_process_locs <-ftrace_module_notify modprobe-19370 3d.... 1176us : ftrace_process_locs <-ftrace_module_notify modprobe-19370 3d.... 1178us : trace_hardirqs_on <-ftrace_module_notify modprobe-19370 3d.... 1178us : => ftrace_process_locs => ftrace_module_notify => notifier_call_chain => __blocking_notifier_call_chain => blocking_notifier_call_chain => sys_init_module => system_call_fastpath That's over 1ms that interrupts are disabled on a Real-Time kernel! Looking at the cause (being the ftrace author helped), I found that the interrupts are disabled before the code modification of mcounts into nops. The interrupts only need to be disabled on start up around this code, not when modules are being loaded. 
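Condensed from the diff below, the resulting logic in ftrace_process_locs() is simply:

	unsigned long flags = 0;

	if (!mod)			/* live kernel text: an interrupt may execute it */
		local_irq_save(flags);
	ftrace_update_code(mod);	/* patch the mcount call sites to nops */
	if (!mod)			/* module text cannot run until loading completes */
		local_irq_restore(flags);

mod is NULL for the boot-time pass, so only that pass pays the interrupt-latency cost.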
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c997f7371c6..df93392aad8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3318,7 +3318,7 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; - unsigned long flags; + unsigned long flags = 0; /* Shut up gcc */ mutex_lock(&ftrace_lock); p = start; @@ -3336,12 +3336,18 @@ static int ftrace_process_locs(struct module *mod, } /* - * Disable interrupts to prevent interrupts from executing - * code that is being modified. + * We only need to disable interrupts on start up + * because we are modifying code that an interrupt + * may execute, and the modification is not atomic. + * But for modules, nothing runs the code we modify + * until we are finished with it, and there's no + * reason to cause large interrupt latencies while we do it. */ - local_irq_save(flags); + if (!mod) + local_irq_save(flags); ftrace_update_code(mod); - local_irq_restore(flags); + if (!mod) + local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; -- cgit v1.2.3-70-g09d2 From 732375c6a5a4cc825b676c922d547aba96b8ce15 Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Thu, 7 Jul 2011 17:27:59 -0700 Subject: plist: Remove the need to supply locks to plist heads This was legacy code brought over from the RT tree and is no longer necessary. Signed-off-by: Dima Zavin Acked-by: Thomas Gleixner Cc: Daniel Walker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andi Kleen Cc: Lai Jiangshan Link: http://lkml.kernel.org/r/1310084879-10351-2-git-send-email-dima@android.com Signed-off-by: Ingo Molnar --- include/linux/plist.h | 55 +++---------------------------------------------- include/linux/rtmutex.h | 4 ++-- kernel/fork.c | 2 +- kernel/futex.c | 2 +- kernel/pm_qos_params.c | 6 +++--- kernel/rtmutex.c | 2 +- kernel/sched.c | 4 ++-- lib/plist.c | 7 +------ 8 files changed, 14 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/include/linux/plist.h b/include/linux/plist.h index c9b9f322c8d..aa0fb390bd2 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -77,14 +77,9 @@ #include #include -#include struct plist_head { struct list_head node_list; -#ifdef CONFIG_DEBUG_PI_LIST - raw_spinlock_t *rawlock; - spinlock_t *spinlock; -#endif }; struct plist_node { @@ -93,37 +88,13 @@ struct plist_node { struct list_head node_list; }; -#ifdef CONFIG_DEBUG_PI_LIST -# define PLIST_HEAD_LOCK_INIT(_lock) .spinlock = _lock -# define PLIST_HEAD_LOCK_INIT_RAW(_lock) .rawlock = _lock -#else -# define PLIST_HEAD_LOCK_INIT(_lock) -# define PLIST_HEAD_LOCK_INIT_RAW(_lock) -#endif - -#define _PLIST_HEAD_INIT(head) \ - .node_list = LIST_HEAD_INIT((head).node_list) - /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name - * @_lock: lock to initialize for this list - */ -#define PLIST_HEAD_INIT(head, _lock) \ -{ \ - _PLIST_HEAD_INIT(head), \ - PLIST_HEAD_LOCK_INIT(&(_lock)) \ -} - -/** - * PLIST_HEAD_INIT_RAW - static struct plist_head initializer - * @head: struct plist_head variable name - * @_lock: lock to initialize for this list */ -#define PLIST_HEAD_INIT_RAW(head, _lock) \ +#define PLIST_HEAD_INIT(head) \ { \ - _PLIST_HEAD_INIT(head), \ - PLIST_HEAD_LOCK_INIT_RAW(&(_lock)) \ + .node_list = LIST_HEAD_INIT((head).node_list) \ } /** @@ -141,31 +112,11 @@ struct plist_node { /** * plist_head_init - dynamic struct 
plist_head initializer * @head: &struct plist_head pointer - * @lock: spinlock protecting the list (debugging) */ static inline void -plist_head_init(struct plist_head *head, spinlock_t *lock) +plist_head_init(struct plist_head *head) { INIT_LIST_HEAD(&head->node_list); -#ifdef CONFIG_DEBUG_PI_LIST - head->spinlock = lock; - head->rawlock = NULL; -#endif -} - -/** - * plist_head_init_raw - dynamic struct plist_head initializer - * @head: &struct plist_head pointer - * @lock: raw_spinlock protecting the list (debugging) - */ -static inline void -plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock) -{ - INIT_LIST_HEAD(&head->node_list); -#ifdef CONFIG_DEBUG_PI_LIST - head->rawlock = lock; - head->spinlock = NULL; -#endif } /** diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 8d522ffeda3..de17134244f 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -66,7 +66,7 @@ struct hrtimer_sleeper; #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .wait_list = PLIST_HEAD_INIT_RAW(mutexname.wait_list, mutexname.wait_lock) \ + , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} @@ -100,7 +100,7 @@ extern void rt_mutex_unlock(struct rt_mutex *lock); #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ + .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \ INIT_RT_MUTEX_DEBUG(tsk) #else # define INIT_RT_MUTEXES(tsk) diff --git a/kernel/fork.c b/kernel/fork.c index 0276c30401a..7517a53d50e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1013,7 +1013,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&p->pi_waiters, &p->pi_lock); + plist_head_init(&p->pi_waiters); p->pi_blocked_on = NULL; #endif } diff --git a/kernel/futex.c b/kernel/futex.c index fe28dc282ea..3fbc76cbb9a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2697,7 +2697,7 @@ static int __init futex_init(void) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 6824ca7d4d0..37f05d0f079 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock); static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, @@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), .notifiers = &network_lat_notifier, .name = "network_latency", .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, @@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requests = 
PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), .notifiers = &network_throughput_notifier, .name = "network_throughput", .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index ab449117aaf..255e1662acd 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - plist_head_init_raw(&lock->wait_list, &lock->wait_lock); + plist_head_init(&lock->wait_list); debug_rt_mutex_init(lock, name); } diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609..71bc127e96b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7781,7 +7781,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks); #endif rt_rq->rt_time = 0; @@ -7986,7 +7986,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init(&init_task.pi_waiters); #endif /* diff --git a/lib/plist.c b/lib/plist.c index 0ae7e643172..a0a4da489c2 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -56,11 +56,6 @@ static void plist_check_list(struct list_head *top) static void plist_check_head(struct plist_head *head) { - WARN_ON(head != &test_head && !head->rawlock && !head->spinlock); - if (head->rawlock) - WARN_ON_SMP(!raw_spin_is_locked(head->rawlock)); - if (head->spinlock) - WARN_ON_SMP(!spin_is_locked(head->spinlock)); if (!plist_head_empty(head)) plist_check_list(&plist_first(head)->prio_list); plist_check_list(&head->node_list); @@ -180,7 +175,7 @@ static int __init plist_test(void) unsigned int r = local_clock(); printk(KERN_INFO "start plist test\n"); - plist_head_init(&test_head, NULL); + plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) plist_node_init(test_node + i, 0); -- cgit v1.2.3-70-g09d2 From d8bf4ca9ca9576548628344c9725edd3786e90b1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 8 Jul 2011 14:39:41 +0200 Subject: rcu: treewide: Do not use rcu_read_lock_held when calling rcu_dereference_check Since ca5ecddf (rcu: define __rcu address space modifier for sparse) rcu_dereference_check uses rcu_read_lock_held as part of its condition automatically, so callers do not have to do that as well. Signed-off-by: Michal Hocko Acked-by: Paul E.
McKenney Signed-off-by: Jiri Kosina --- include/linux/cgroup.h | 1 - include/linux/cred.h | 1 - include/linux/fdtable.h | 1 - include/linux/rtnetlink.h | 3 +-- include/net/sock.h | 3 +-- kernel/cgroup.c | 8 ++------ kernel/exit.c | 1 - kernel/pid.c | 1 - kernel/rcutorture.c | 2 -- kernel/sched.c | 1 - net/mac80211/sta_info.c | 4 ---- net/netlabel/netlabel_domainhash.c | 3 +-- net/netlabel/netlabel_unlabeled.c | 3 +-- security/keys/keyring.c | 1 - 14 files changed, 6 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ab4ac0ccb85..da7e4bc34e8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -539,7 +539,6 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state( */ #define task_subsys_state_check(task, subsys_id, __c) \ rcu_dereference_check(task->cgroups->subsys[subsys_id], \ - rcu_read_lock_held() || \ lockdep_is_held(&task->alloc_lock) || \ cgroup_lock_is_held() || (__c)) diff --git a/include/linux/cred.h b/include/linux/cred.h index 82607992f30..f240f2fa019 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -284,7 +284,6 @@ static inline void put_cred(const struct cred *_cred) ({ \ const struct task_struct *__t = (task); \ rcu_dereference_check(__t->real_cred, \ - rcu_read_lock_held() || \ task_is_dead(__t)); \ }) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 133c0ba25e3..df7e3cf82e9 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -60,7 +60,6 @@ struct files_struct { #define rcu_dereference_check_fdtable(files, fdtfd) \ (rcu_dereference_check((fdtfd), \ - rcu_read_lock_held() || \ lockdep_is_held(&(files)->file_lock) || \ atomic_read(&(files)->count) == 1 || \ rcu_my_thread_group_empty())) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index bbad657a372..27576aa05e8 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -758,8 +758,7 @@ extern int lockdep_rtnl_is_held(void); * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() */ #define rcu_dereference_rtnl(p) \ - rcu_dereference_check(p, rcu_read_lock_held() || \ - lockdep_rtnl_is_held()) + rcu_dereference_check(p, lockdep_rtnl_is_held()) /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL diff --git a/include/net/sock.h b/include/net/sock.h index c0b938cb4b1..d5b65c19a8e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1301,8 +1301,7 @@ extern unsigned long sock_i_ino(struct sock *sk); static inline struct dst_entry * __sk_dst_get(struct sock *sk) { - return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() || - sock_owned_by_user(sk) || + return rcu_dereference_check(sk->sk_dst_cache, sock_owned_by_user(sk) || lockdep_is_held(&sk->sk_lock.slock)); } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2731d115d72..5ae71d6e274 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1697,7 +1697,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; struct dentry *dentry = rcu_dereference_check(cgrp->dentry, - rcu_read_lock_held() || cgroup_lock_is_held()); if (!dentry || cgrp == dummytop) { @@ -1723,7 +1722,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) break; dentry = rcu_dereference_check(cgrp->dentry, - rcu_read_lock_held() || cgroup_lock_is_held()); if (!cgrp->parent) continue; @@ -4813,8 +4811,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) * on this or this is under rcu_read_lock(). 
Once css->id is allocated, * it's unchanged until freed. */ - cssid = rcu_dereference_check(css->id, - rcu_read_lock_held() || atomic_read(&css->refcnt)); + cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); if (cssid) return cssid->id; @@ -4826,8 +4823,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) { struct css_id *cssid; - cssid = rcu_dereference_check(css->id, - rcu_read_lock_held() || atomic_read(&css->refcnt)); + cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); if (cssid) return cssid->depth; diff --git a/kernel/exit.c b/kernel/exit.c index 20a40647152..07dc154fc79 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk) struct tty_struct *uninitialized_var(tty); sighand = rcu_dereference_check(tsk->sighand, - rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); diff --git a/kernel/pid.c b/kernel/pid.c index 57a8346a270..e432057f3b2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) if (pid) { struct hlist_node *first; first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), - rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pids[(type)].node); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e138db0338..ced72102adc 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused) idx = cur_ops->readlock(); completed = cur_ops->completed(); p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_held() || rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); @@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg) idx = cur_ops->readlock(); completed = cur_ops->completed(); p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_held() || rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609..71e5a25a8a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -581,7 +581,6 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index b83870bf60f..3db78b696c5 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -97,7 +97,6 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, struct sta_info *sta; sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], - rcu_read_lock_held() || lockdep_is_held(&local->sta_lock) || lockdep_is_held(&local->sta_mtx)); while (sta) { @@ -105,7 +104,6 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) break; sta = rcu_dereference_check(sta->hnext, - rcu_read_lock_held() || lockdep_is_held(&local->sta_lock) || lockdep_is_held(&local->sta_mtx)); } @@ -123,7 +121,6 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, struct sta_info *sta; sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], - rcu_read_lock_held() || lockdep_is_held(&local->sta_lock) || lockdep_is_held(&local->sta_mtx)); while (sta) { @@ -132,7 +129,6 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, memcmp(sta->sta.addr, addr, ETH_ALEN) == 
0) break; sta = rcu_dereference_check(sta->hnext, - rcu_read_lock_held() || lockdep_is_held(&local->sta_lock) || lockdep_is_held(&local->sta_mtx)); } diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index de0d8e4cbfb..2aa975e5452 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -55,8 +55,7 @@ struct netlbl_domhsh_tbl { * should be okay */ static DEFINE_SPINLOCK(netlbl_domhsh_lock); #define netlbl_domhsh_rcu_deref(p) \ - rcu_dereference_check(p, rcu_read_lock_held() || \ - lockdep_is_held(&netlbl_domhsh_lock)) + rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; static struct netlbl_dom_map *netlbl_domhsh_def = NULL; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 9c38658fba8..3de3768360f 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -116,8 +116,7 @@ struct netlbl_unlhsh_walk_arg { * hash table should be okay */ static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ - rcu_dereference_check(p, rcu_read_lock_held() || \ - lockdep_is_held(&netlbl_unlhsh_lock)) + rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index a06ffab3856..30e242f7bd0 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -155,7 +155,6 @@ static void keyring_destroy(struct key *keyring) } klist = rcu_dereference_check(keyring->payload.subscriptions, - rcu_read_lock_held() || atomic_read(&keyring->usage) == 0); if (klist) { for (loop = klist->nkeys - 1; loop >= 0; loop--) -- cgit v1.2.3-70-g09d2 From 2dc98fd3206f8106520eced769781a21a20707ca Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Fri, 8 Jul 2011 21:11:16 +0000 Subject: doc: Kconfig: Documentation/power/{pm => apm-acpi}.txt Signed-off-by: Michael Witten Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 4 ++-- kernel/power/Kconfig | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da349723d41..ab03e883206 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1737,8 +1737,8 @@ menuconfig APM machines with more than one CPU. In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/power/pm.txt> and the - Battery Powered Linux mini-HOWTO, available from + and more information, read <file:Documentation/power/apm-acpi.txt> + and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. This driver does not spin down disk drives (see the hdparm(8) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 87f4d24b55b..bcd8fce351b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -193,8 +193,8 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/power/pm.txt> and the - Battery Powered Linux mini-HOWTO, available from + and more information, read <file:Documentation/power/apm-acpi.txt> + and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. This driver does not spin down disk drives (see the hdparm(8) -- cgit v1.2.3-70-g09d2 From 4aede84b33d6beb401136a3deca0651ae07c5e99 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 12 Jul 2011 08:31:45 +0200 Subject: fixlet: Remove fs_excl from struct task.
fs_excl is a poor man's priority inheritance for filesystems to hint to the block layer that an operation is important. It was never clearly specified, not widely adopted, and will not prevent starvation in many cases (like across cgroups). fs_excl was introduced with the time sliced CFQ IO scheduler, to indicate when a process held FS exclusive resources and thus needed a boost. It doesn't cover all file systems, and it was never fully complete. Let's kill it. Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 28 +--------------------------- fs/reiserfs/journal.c | 13 ------------- fs/super.c | 4 ---- include/linux/fs.h | 4 ---- include/linux/init_task.h | 1 - include/linux/sched.h | 1 - kernel/exit.c | 1 - kernel/fork.c | 1 - 8 files changed, 1 insertion(+), 52 deletions(-) (limited to 'kernel') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 762bd509b71..d8b108737b7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -134,7 +134,7 @@ struct cfq_queue { /* io prio of this group */ unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; + unsigned short ioprio_class; pid_t pid; @@ -2869,7 +2869,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) * elevate the priority of this queue */ cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; cfq_clear_cfqq_prio_changed(cfqq); } @@ -3593,30 +3592,6 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -/* - * we temporarily boost lower priority queues if they are holding fs exclusive - * resources. they are boosted to normal prio (CLASS_BE/4) - */ -static void cfq_prio_boost(struct cfq_queue *cfqq) -{ - if (has_fs_excl()) { - /* - * boost idle prio on transactions that would lock out other - * users of the filesystem - */ - if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; - if (cfqq->ioprio > IOPRIO_NORM) - cfqq->ioprio = IOPRIO_NORM; - } else { - /* - * unboost the queue (if needed) - */ - cfqq->ioprio_class = cfqq->org_ioprio_class; - cfqq->ioprio = cfqq->org_ioprio; - } -} - static inline int __cfq_may_queue(struct cfq_queue *cfqq) { if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { @@ -3647,7 +3622,6 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); if (cfqq) { cfq_init_prio_data(cfqq, cic->ioc); - cfq_prio_boost(cfqq); return __cfq_may_queue(cfqq); } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index c5e82ece7c6..a159ba5a35e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -678,23 +678,19 @@ struct buffer_chunk { static void write_chunk(struct buffer_chunk *chunk) { int i; - get_fs_excl(); for (i = 0; i < chunk->nr; i++) { submit_logged_buffer(chunk->bh[i]); } chunk->nr = 0; - put_fs_excl(); } static void write_ordered_chunk(struct buffer_chunk *chunk) { int i; - get_fs_excl(); for (i = 0; i < chunk->nr; i++) { submit_ordered_buffer(chunk->bh[i]); } chunk->nr = 0; - put_fs_excl(); } static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, @@ -986,8 +982,6 @@ static int flush_commit_list(struct super_block *s, return 0; } - get_fs_excl(); - /* before we can put our commit blocks on disk, we have to make sure everyone older than ** us is on disk too */ @@ -1145,7 +1139,6 @@ static int flush_commit_list(struct super_block *s, if (retval) reiserfs_abort(s, retval, "Journal write error in %s", __func__); - 
put_fs_excl(); return retval; } @@ -1374,8 +1367,6 @@ static int flush_journal_list(struct super_block *s, return 0; } - get_fs_excl(); - /* if all the work is already done, get out of here */ if (atomic_read(&(jl->j_nonzerolen)) <= 0 && atomic_read(&(jl->j_commit_left)) <= 0) { @@ -1597,7 +1588,6 @@ static int flush_journal_list(struct super_block *s, put_journal_list(s, jl); if (flushall) mutex_unlock(&journal->j_flush_mutex); - put_fs_excl(); return err; } @@ -3108,7 +3098,6 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, th->t_trans_id = journal->j_trans_id; unlock_journal(sb); INIT_LIST_HEAD(&th->t_list); - get_fs_excl(); return 0; out_fail: @@ -3964,7 +3953,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, flush = flags & FLUSH_ALL; wait_on_commit = flags & WAIT; - put_fs_excl(); current->journal_info = th->t_handle_save; reiserfs_check_lock_depth(sb, "journal end"); if (journal->j_len == 0) { @@ -4316,4 +4304,3 @@ void reiserfs_abort_journal(struct super_block *sb, int errno) dump_stack(); #endif } - diff --git a/fs/super.c b/fs/super.c index ab3d672db0d..cf12ba50973 100644 --- a/fs/super.c +++ b/fs/super.c @@ -245,13 +245,11 @@ static int grab_super(struct super_block *s) __releases(sb_lock) */ void lock_super(struct super_block * sb) { - get_fs_excl(); mutex_lock(&sb->s_lock); } void unlock_super(struct super_block * sb) { - put_fs_excl(); mutex_unlock(&sb->s_lock); } @@ -280,7 +278,6 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(&sb->s_inodes); @@ -295,7 +292,6 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } - put_fs_excl(); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e73e2e9ae3..f6c866c287b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1453,10 +1453,6 @@ enum { #define vfs_check_frozen(sb, level) \ wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) -#define get_fs_excl() atomic_inc(&current->fs_excl) -#define put_fs_excl() atomic_dec(&current->fs_excl) -#define has_fs_excl() atomic_read(&current->fs_excl) - /* * until VFS tracks user namespaces for inodes, just make all files * belong to init_user_ns diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 580f70c0239..d14e058aaee 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -176,7 +176,6 @@ extern struct cred init_cred; .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ - .fs_excl = ATOMIC_INIT(0), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ .pids = { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index a837b20ba19..22f54249cde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1503,7 +1503,6 @@ struct task_struct { short il_next; short pref_node_fork; #endif - atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; /* diff --git a/kernel/exit.c b/kernel/exit.c index f2b321bae44..b412df45ea6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -906,7 +906,6 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); - WARN_ON(atomic_read(&tsk->fs_excl)); WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) diff --git 
a/kernel/fork.c b/kernel/fork.c index 0276c30401a..30a0e860722 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -291,7 +291,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); - atomic_set(&tsk->fs_excl, 0); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif -- cgit v1.2.3-70-g09d2 From 1dda606c5f94b14a8f36c220d1d8844bab68a720 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 8 Jun 2011 02:45:37 +0200 Subject: KVM: Add compat ioctl for KVM_SET_SIGNAL_MASK KVM has an ioctl to define which signal mask should be used while running inside VCPU_RUN. At least for big endian systems, this mask is different on 32-bit and 64-bit systems (though the size is identical). Add a compat wrapper that converts the mask to whatever the kernel accepts, allowing 32-bit kvm user space to set signal masks. This patch fixes qemu with --enable-io-thread on ppc64 hosts when running 32-bit user land. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- kernel/compat.c | 1 + virt/kvm/kvm_main.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index fc9eb093acd..18197ae2d46 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -890,6 +890,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); } } +EXPORT_SYMBOL_GPL(sigset_from_compat); asmlinkage long compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa2321ac77c..11d2783eb9d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -84,6 +84,10 @@ struct dentry *kvm_debugfs_dir; static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); +#ifdef CONFIG_COMPAT +static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg); +#endif static int hardware_enable_all(void); static void hardware_disable_all(void); @@ -1586,7 +1590,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp) static struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, - .compat_ioctl = kvm_vcpu_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = kvm_vcpu_compat_ioctl, +#endif .mmap = kvm_vcpu_mmap, .llseek = noop_llseek, }; @@ -1875,6 +1881,50 @@ out: return r; } +#ifdef CONFIG_COMPAT +static long kvm_vcpu_compat_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = compat_ptr(arg); + int r; + + if (vcpu->kvm->mm != current->mm) + return -EIO; + + switch (ioctl) { + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + compat_sigset_t csigset; + sigset_t sigset; + + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof csigset) + goto out; + r = -EFAULT; + if (copy_from_user(&csigset, sigmask_arg->sigset, + sizeof csigset)) + goto out; + } + sigset_from_compat(&sigset, &csigset); + r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); + break; + } + default: + r = kvm_vcpu_ioctl(filp, ioctl, arg); + } + +out: + return r; +} +#endif + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { -- 
cgit v1.2.3-70-g09d2 From 41fb61c2d08107ce96a5dcb3a6289b2afd3e135c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:03:44 -0400 Subject: ftrace: Balance records when updating the hash Whenever the hash of the ftrace_ops is updated, the record counts must be balanced. This requires disabling the records that are set in the original hash, and then enabling the records that are set in the updated hash. Moving the update into ftrace_hash_move() removes the bug where the hash was updated but the records were not. That bug made ftrace trigger a warning and disable itself when the ftrace_ops filter was updated while the ftrace_ops was registered, with the failure then happening when the ftrace_ops was unregistered. The current code will not trigger this bug, but new code will. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index df93392aad8..853f6f0a4b4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1170,8 +1170,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; } +static void +ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); + static int -ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tp, *tn; @@ -1181,8 +1187,15 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) unsigned long key; int size = src->count; int bits = 0; + int ret; int i; + /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(ops, enable); + /* * If the new source is empty, just free dst and assign it * the empty_hash. */ @@ -1203,9 +1216,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; + ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + goto out; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1224,7 +1238,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - return 0; + ret = 0; + out: + /* + * Enable regardless of ret: + * On success, we enable the new hash. + * On failure, we re-enable the original hash. + */ + ftrace_hash_rec_enable(ops, enable); + + return ret; } /* @@ -2845,7 +2868,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ftrace_match_records(hash, buf, len); mutex_lock(&ftrace_lock); - ret = ftrace_hash_move(orig_hash, hash); + ret = ftrace_hash_move(ops, enable, orig_hash, hash); mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); @@ -3028,18 +3051,12 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->notrace_hash; mutex_lock(&ftrace_lock); - /* - * Remove the current set, update the hash and add - * them back. 
- */ - ftrace_hash_rec_disable(iter->ops, filter_hash); - ret = ftrace_hash_move(orig_hash, iter->hash); - if (!ret) { - ftrace_hash_rec_enable(iter->ops, filter_hash); - if (iter->ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - } + ret = ftrace_hash_move(iter->ops, filter_hash, + orig_hash, iter->hash); + if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); } free_ftrace_hash(iter->hash); -- cgit v1.2.3-70-g09d2 From 072126f4529196f71a97960248bca54fd4554c2d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:08:31 -0400 Subject: ftrace: Update filter when tracing enabled in set_ftrace_filter() Currently, if set_ftrace_filter() is called when the ftrace_ops is active, the function filters will not be updated. They will only be updated when tracing is disabled and re-enabled. Update the functions immediately during set_ftrace_filter(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 853f6f0a4b4..a0dc0de8d64 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2869,6 +2869,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); -- cgit v1.2.3-70-g09d2 From 6331c28c962561aee59e5a493b7556a4bb585957 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Jul 2011 15:11:02 -0400 Subject: ftrace: Fix dynamic selftest failure on some archs Archs that do not implement CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST will fail the dynamic ftrace selftest. The function tracer has a quick 'off' variable that will prevent the callback functions from being called. This variable is called function_trace_stop. On x86, this is implemented directly in the mcount assembly, but for other archs, an intermediate function called ftrace_test_stop_func() is used. In dynamic ftrace, the function pointer variable ftrace_trace_function is used to update the caller code in the mcount caller. But for archs that do not have CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST set, it only calls ftrace_test_stop_func() instead, which in turn calls __ftrace_trace_function. When more than one ftrace_ops is registered, the function it calls is ftrace_ops_list_func(), which will iterate over all registered ftrace_ops and call the callbacks that have their hash matching. The issue happens when two ftrace_ops are registered for different functions and one is then unregistered. The __ftrace_trace_function is then pointed to the remaining ftrace_ops callback function directly. This means it will be called for all functions that were registered to trace by both ftrace_ops that were registered. This is not an issue for archs with CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST, because the update of ftrace_trace_function doesn't happen until after all functions have been updated, and then the mcount caller is updated. But for those archs that do use ftrace_test_stop_func(), the update is immediate. 
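For reference, the intermediate function described above has roughly this shape (a sketch reconstructed from the commit text; the real definition lives in kernel/trace/ftrace.c, and the exact signature follows ftrace_func_t):

	/* sketch: called from the mcount stub on archs without the MCOUNT_TEST check */
	static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
	{
		if (function_trace_stop)	/* quick global 'off' switch */
			return;
		__ftrace_trace_function(ip, parent_ip);
	}

Because __ftrace_trace_function is read on every call, repointing it takes effect at once, which is exactly the window described next.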
The dynamic selftest fails because it hits this situation, and the ftrace_ops that it registers fails to only trace what it was supposed to and instead traces all other functions. The solution is to delay the setting of __ftrace_trace_function until after all the functions have been updated according to the registered ftrace_ops. Also, function_trace_stop is set during the update to prevent function tracing from being invoked while the code is being updated. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a0dc0de8d64..62e26d93053 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -88,6 +88,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -146,9 +147,11 @@ void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; __ftrace_trace_function = ftrace_stub; + __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } +#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their @@ -207,8 +210,13 @@ static void update_ftrace_function(void) #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; +#else +#ifdef CONFIG_DYNAMIC_FTRACE + /* do not update till all functions have been modified */ + __ftrace_trace_function_delay = func; #else __ftrace_trace_function = func; +#endif ftrace_trace_function = ftrace_test_stop_func; #endif } @@ -1607,6 +1615,12 @@ static int __ftrace_modify_code(void *data) { int *command = data; + /* + * Do not call function tracer while we update the code. + * We are in stop machine, no worrying about races. + */ + function_trace_stop++; + if (*command & FTRACE_ENABLE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) @@ -1620,6 +1634,18 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; + return 0; } -- cgit v1.2.3-70-g09d2 From c9aaa8957f203bd6df83b002fb40b98390bed078 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:14 -0400 Subject: KVM: Steal time implementation To implement steal time, we need the hypervisor to pass the guest information about how much time was spent running other processes outside the VM, while the vcpu had meaningful work to do - halt time does not count. This information is acquired through the run_delay field of the delayacct/schedstats infrastructure, which counts time spent in a runqueue but not running. 
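Concretely, the hypervisor-side accounting boils down to a delta against the previous run_delay reading; this is the core of accumulate_steal_time() in the patch below:

	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
	vcpu->arch.st.last_steal = current->sched_info.run_delay;
	vcpu->arch.st.accum_steal = delta;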
Steal time is per-cpu information, so the traditional MSR-based infrastructure is used. A new MSR, MSR_KVM_STEAL_TIME, holds the memory area address containing information about steal time. This patch contains the hypervisor part of the steal time infrastructure, and can be backported independently of the guest portion. [avi, yongjie: export delayacct_on, to avoid build failures in some configs] Signed-off-by: Glauber Costa Tested-by: Eric B Munson CC: Rik van Riel CC: Jeremy Fitzhardinge CC: Peter Zijlstra CC: Anthony Liguori Signed-off-by: Yongjie Ren Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 9 +++++ arch/x86/include/asm/kvm_para.h | 4 +++ arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/x86.c | 74 +++++++++++++++++++++++++++++++++++++++-- include/linux/kvm_host.h | 1 + kernel/delayacct.c | 2 ++ 6 files changed, 89 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da6bbee878c..59086a77ff1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -389,6 +389,15 @@ struct kvm_vcpu_arch { unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + + struct { + u64 msr_val; + u64 last_steal; + u64 accum_steal; + struct gfn_to_hva_cache stime; + struct kvm_steal_time steal; + } st; + u64 last_guest_tsc; u64 last_kernel_ns; u64 last_tsc_nsec; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 65f8bb9279e..c484ba8e05e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -45,6 +45,10 @@ struct kvm_steal_time { __u32 pad[12]; }; +#define KVM_STEAL_ALIGNMENT_BITS 5 +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) + #define KVM_MAX_MMU_OP_BATCH 32 #define KVM_ASYNC_PF_ENABLED (1 << 0) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 50f63648ce1..99c3f0589fa 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -31,6 +31,7 @@ config KVM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO + select TASK_DELAY_ACCT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b803f04bde..c96cdc09248 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -808,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); * kvm-specific. Those are put in the beginning of the list. 
*/ -#define KVM_SAVE_MSRS_BEGIN 8 +#define KVM_SAVE_MSRS_BEGIN 9 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1488,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) } } +static void accumulate_steal_time(struct kvm_vcpu *vcpu) +{ + u64 delta; + + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + vcpu->arch.st.accum_steal = delta; +} + +static void record_steal_time(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + return; + + vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; + vcpu->arch.st.steal.version += 2; + vcpu->arch.st.accum_steal = 0; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1570,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_STEAL_TIME: + + if (unlikely(!sched_info_on())) + return 1; + + if (data & KVM_STEAL_RESERVED_MASK) + return 1; + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, + data & KVM_STEAL_VALID_BITS)) + return 1; + + vcpu->arch.st.msr_val = data; + + if (!(data & KVM_MSR_ENABLED)) + break; + + vcpu->arch.st.last_steal = current->sched_info.run_delay; + + preempt_disable(); + accumulate_steal_time(vcpu); + preempt_enable(); + + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + + break; + case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... 
MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -1855,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_ASYNC_PF_EN: data = vcpu->arch.apf.msr_val; break; + case MSR_KVM_STEAL_TIME: + data = vcpu->arch.st.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -2166,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; } + + accumulate_steal_time(vcpu); + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2487,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + + if (sched_info_on()) + entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); + entry->ebx = 0; entry->ecx = 0; entry->edx = 0; @@ -5470,6 +5536,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } + if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) + record_steal_time(vcpu); + } r = kvm_mmu_reload(vcpu); @@ -6206,6 +6275,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; + vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f7df0a3b031..c8e023902f7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -47,6 +47,7 @@ #define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_REQ_EVENT 11 #define KVM_REQ_APF_HALT 12 +#define KVM_REQ_STEAL_UPDATE 13 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ead9b610aa7..418b3f7053a 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -19,8 +19,10 @@ #include #include #include +#include <linux/module.h> int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ +EXPORT_SYMBOL_GPL(delayacct_on); struct kmem_cache *delayacct_cache; static int __init delayacct_setup_disable(char *str) -- cgit v1.2.3-70-g09d2 From e6e6685accfa81f509fadfc9624bc7c3862d75c4 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:17 -0400 Subject: KVM guest: Steal time accounting This patch accounts steal time in account_process_tick. If one or more ticks are considered stolen in the current accounting cycle, user/system accounting is skipped. Idle is fine, since the hypervisor does not report steal time if the guest is halted. Accounting steal time from the core scheduler gives us the advantage of direct access to the runqueue data. At a later point, it can be used to tweak cpu power and make the scheduler aware of the time it lost. 
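The key change in the tick path (shown in full in the diff below) is an early return when the tick was stolen:

	if (steal_account_process_tick())
		return;	/* stolen time already accounted; skip user/system */

steal_account_process_tick() reads paravirt_steal_clock(), converts the nanoseconds accumulated since the last reading into ticks, accounts them with account_steal_time(), and returns nonzero when at least one full tick was stolen.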
[avi: doesn't exist on many archs] Signed-off-by: Glauber Costa Acked-by: Rik van Riel Acked-by: Peter Zijlstra Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Anthony Liguori Signed-off-by: Avi Kivity --- kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3f2e502d609..f98a28b19b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,6 +75,9 @@ #include #include #include +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #include "sched_cpupri.h" #include "workqueue_sched.h" @@ -528,6 +531,9 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif /* calc_load related fields */ unsigned long calc_load_update; @@ -1953,6 +1959,18 @@ void account_system_vtime(struct task_struct *curr) } EXPORT_SYMBOL_GPL(account_system_vtime); +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + static void update_rq_clock_task(struct rq *rq, s64 delta) { s64 irq_delta; @@ -3845,6 +3863,25 @@ void account_idle_time(cputime_t cputime) cpustat->idle = cputime64_add(cpustat->idle, cputime64); } +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_branch(&paravirt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -3876,6 +3913,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + if (steal_account_process_tick()) + return; + if (irqtime_account_hi_update()) { cpustat->irq = cputime64_add(cpustat->irq, tmp); } else if (irqtime_account_si_update()) { @@ -3929,6 +3969,9 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } + if (steal_account_process_tick()) + return; + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) -- cgit v1.2.3-70-g09d2 From 095c0aa83e52d6c3dd7168610746703921f570af Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:18 -0400 Subject: sched: adjust scheduler cpu power for stolen time This patch makes update_rq_clock() aware of steal time. The mechanism of operation is not different from irq_time, and follows the same principles. This lives in a CONFIG option itself, and can be compiled out independently of the rest of steal time reporting. The effect of disabling it is that the scheduler will still report steal time (that cannot be disabled), but won't use this information for cpu power adjustments. Every time update_rq_clock_task() is invoked, we query information about how much time was stolen since the last call, and feed it into sched_rt_avg_update(). Although steal time reporting in account_process_tick() keeps track of the last time we read the steal clock, in prev_steal_time, this patch does it independently using another field, prev_steal_time_rq. 
This is because otherwise, information about time accounted in account_process_tick() would never reach us in update_rq_clock(). Signed-off-by: Glauber Costa Acked-by: Rik van Riel Acked-by: Peter Zijlstra Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Anthony Liguori Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 12 ++++++++++++ kernel/sched.c | 47 +++++++++++++++++++++++++++++++++++++---------- kernel/sched_features.h | 4 ++-- 3 files changed, 51 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da349723d41..1f03e221a01 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -512,6 +512,18 @@ menuconfig PARAVIRT_GUEST if PARAVIRT_GUEST +config PARAVIRT_TIME_ACCOUNTING + bool "Paravirtual steal time accounting" + select PARAVIRT + default n + ---help--- + Select this option to enable fine granularity task steal time + accounting. Time spent executing other tasks in parallel with + the current vCPU is discounted from the vCPU power. To account for + that, there can be a small performance impact. + + If in doubt, say N here. + source "arch/x86/xen/Kconfig" config KVM_CLOCK diff --git a/kernel/sched.c b/kernel/sched.c index f98a28b19b2..b35ac50b26c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -534,6 +534,9 @@ struct rq { #ifdef CONFIG_PARAVIRT u64 prev_steal_time; #endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif /* calc_load related fields */ unsigned long calc_load_update; @@ -1973,8 +1976,14 @@ static inline u64 steal_ticks(u64 steal) static void update_rq_clock_task(struct rq *rq, s64 delta) { - s64 irq_delta; - +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; /* @@ -1997,12 +2006,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_branch((&paravirt_steal_rq_enabled))) { + u64 st; + + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + st = steal_ticks(steal); + steal = st * TICK_NSEC; + + rq->prev_steal_time_rq += steal; + + delta -= steal; + } +#endif + rq->clock_task += delta; - if (irq_delta && sched_feat(NONIRQ_POWER)) - sched_rt_avg_update(rq, irq_delta); +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING static int irqtime_account_hi_update(void) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; @@ -2037,12 +2069,7 @@ static int irqtime_account_si_update(void) #define sched_clock_irqtime (0) -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ - rq->clock_task += delta; -} - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif #include "sched_idletask.c" #include "sched_fair.c" diff --git a/kernel/sched_features.h b/kernel/sched_features.h index be40f7371ee..ca3b025f866 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(OWNER_SPIN, 1) /* - * Decrement CPU power based on irq activity + * Decrement CPU power based on time 
not spent running tasks */ -SCHED_FEAT(NONIRQ_POWER, 1) +SCHED_FEAT(NONTASK_POWER, 1) /* * Queue remote wakeups on the target CPU and process them -- cgit v1.2.3-70-g09d2 From 4a9bd3f134decd6d16ead8d288342d57aad486be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Jul 2011 16:36:53 -0400 Subject: tracing: Have dynamic size event stack traces Currently the stack trace per event in ftace is only 8 frames. This can be quite limiting and sometimes useless. Especially when the "ignore frames" is wrong and we also use up stack frames for the event processing itself. Change this to be dynamic by adding a percpu buffer that we can write a large stack frame into and then copy into the ring buffer. For interrupts and NMIs that come in while another event is being process, will only get to use the 8 frame stack. That should be enough as the task that it interrupted will have the full stack frame anyway. Requested-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.c | 92 +++++++++++++++++++++++++++++++++++++------- kernel/trace/trace_entries.h | 3 +- kernel/trace/trace_output.c | 11 +++--- 4 files changed, 88 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index b1e69eefc20..96efa6794ea 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -76,6 +76,7 @@ struct trace_iterator { struct trace_entry *ent; unsigned long lost_events; int leftover; + int ent_size; int cpu; u64 ts; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9c16123f6e..e5df02c69b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1248,6 +1248,15 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, } #ifdef CONFIG_STACKTRACE + +#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +struct ftrace_stack { + unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; +}; + +static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(int, ftrace_stack_reserve); + static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) @@ -1256,25 +1265,77 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; + int use_stack; + int size = FTRACE_STACK_ENTRIES; + + trace.nr_entries = 0; + trace.skip = skip; + + /* + * Since events can happen in NMIs there's no safe way to + * use the per cpu ftrace_stacks. We reserve it and if an interrupt + * or NMI comes in, it will just have to use the default + * FTRACE_STACK_SIZE. + */ + preempt_disable_notrace(); + + use_stack = ++__get_cpu_var(ftrace_stack_reserve); + /* + * We don't need any atomic variables, just a barrier. + * If an interrupt comes in, we don't care, because it would + * have exited and put the counter back to what we want. + * We just need a barrier to keep gcc from moving things + * around. 
+ */ + barrier(); + if (use_stack == 1) { + trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.max_entries = FTRACE_STACK_MAX_ENTRIES; + + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + + if (trace.nr_entries > size) + size = trace.nr_entries; + } else + /* From now on, use_stack is a boolean */ + use_stack = 0; + + size *= sizeof(unsigned long); event = trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry), flags, pc); + sizeof(*entry) + size, flags, pc); if (!event) - return; - entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, sizeof(entry->caller)); + goto out; + entry = ring_buffer_event_data(event); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = skip; - trace.entries = entry->caller; + memset(&entry->caller, 0, size); + + if (use_stack) + memcpy(&entry->caller, trace.entries, + trace.nr_entries * sizeof(unsigned long)); + else { + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.entries = entry->caller; + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + } + + entry->size = trace.nr_entries; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + out: + /* Again, don't let gcc optimize things here */ + barrier(); + __get_cpu_var(ftrace_stack_reserve)--; + preempt_enable_notrace(); + } void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, @@ -1562,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, ftrace_enable_cpu(); - return event ? ring_buffer_event_data(event) : NULL; + if (event) { + iter->ent_size = ring_buffer_event_length(event); + return ring_buffer_event_data(event); + } + iter->ent_size = 0; + return NULL; } static struct trace_entry * diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e32744c84d9..93365907f21 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, F_STRUCT( - __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + __field( int, size ) + __dynamic_array(unsigned long, caller ) ), F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e37de492a9e..51999309a6c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, { struct stack_entry *field; struct trace_seq *s = &iter->seq; - int i; + unsigned long *p; + unsigned long *end; trace_assign_type(field, iter->ent); + end = (unsigned long *)((long)iter->ent + iter->ent_size); if (!trace_seq_puts(s, "\n")) goto partial; - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) - break; + + for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { if (!trace_seq_puts(s, " => ")) goto partial; - if (!seq_print_ip_sym(s, field->caller[i], flags)) + if (!seq_print_ip_sym(s, *p, flags)) goto partial; if (!trace_seq_puts(s, "\n")) goto partial; -- cgit v1.2.3-70-g09d2 From f91298709790b9a483752ca3c967845537df2af3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 9 Jul 2011 00:17:12 +0400 Subject: perf, x86: P4 PMU - Introduce event alias feature Instead of hw_nmi_watchdog_set_attr() weak function and 
the appropriate x86_pmu::hw_watchdog_set_attr() call, we introduce an event alias mechanism which allows us to drop these routines completely and isolate the quirks of the Netburst architecture inside the P4 PMU code only. The main idea remains the same though -- to allow nmi-watchdog and perf top to run simultaneously. Note the aliasing mechanism applies only to the generic PERF_COUNT_HW_CPU_CYCLES event, because an arbitrary event (say, one passed as RAW initially) might have additional bits set inside the ESCR register that change the behaviour of the event, and we can no longer guarantee that the alias event will give the same result. P.S. A huge thanks to Don and Steven for testing and early review. Acked-by: Don Zickus Tested-by: Steven Rostedt Signed-off-by: Cyrill Gorcunov CC: Ingo Molnar CC: Peter Zijlstra CC: Stephane Eranian CC: Lin Ming CC: Arnaldo Carvalho de Melo CC: Frederic Weisbecker Link: http://lkml.kernel.org/r/20110708201712.GS23657@sun Signed-off-by: Steven Rostedt --- arch/x86/include/asm/perf_event_p4.h | 33 +++++++++ arch/x86/kernel/cpu/perf_event.c | 7 -- arch/x86/kernel/cpu/perf_event_p4.c | 135 +++++++++++++++++++++++++++-------- kernel/watchdog.c | 2 - 4 files changed, 139 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 56fd9e3abbd..4d86c86178f 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -101,6 +101,14 @@ #define P4_CONFIG_HT_SHIFT 63 #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) +/* + * If an event has alias it should be marked + * with a special bit. (Don't forget to check + * P4_PEBS_CONFIG_MASK and related bits on + * modification.) + */ +#define P4_CONFIG_ALIASABLE (1 << 9) + /* * The bits we allow to pass for RAW events */ @@ -123,6 +131,31 @@ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) +/* + * In case of event aliasing we need to preserve some + * caller bits otherwise the mapping won't be complete. 
+ */ +#define P4_CONFIG_EVENT_ALIAS_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ + p4_config_pack_cccr(P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE)) + +#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \ + ((P4_CONFIG_HT) | \ + p4_config_pack_escr(P4_ESCR_T0_OS | \ + P4_ESCR_T0_USR | \ + P4_ESCR_T1_OS | \ + P4_ESCR_T1_USR) | \ + p4_config_pack_cccr(P4_CCCR_OVF | \ + P4_CCCR_CASCADE | \ + P4_CCCR_FORCE_OVF | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_OVF_PMI_T0 | \ + P4_CCCR_OVF_PMI_T1 | \ + P4_CONFIG_ALIASABLE)) + static inline bool p4_is_event_cascaded(u64 config) { u32 cccr = p4_config_unpack_cccr(config); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c53d433c3dd..b7a010fce8c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -274,7 +274,6 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); - void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -360,12 +359,6 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - if (x86_pmu.hw_watchdog_set_attr) - x86_pmu.hw_watchdog_set_attr(wd_attr); -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index fb901c5080f..8b7a0c30678 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -570,11 +570,92 @@ static __initconst const u64 p4_hw_cache_event_ids }, }; +/* + * Because of Netburst being quite restricted in now + * many same events can run simultaneously, we use + * event aliases, ie different events which have the + * same functionallity but use non-intersected resources + * (ESCR/CCCR/couter registers). This allow us to run + * two or more semi-same events together. It is done + * transparently to a user space. + * + * Never set any cusom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up-to-dated automatically either not appliable + * at all. + * + * And be really carefull choosing aliases! + */ +struct p4_event_alias { + u64 orig; + u64 alter; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with + * non-sleeping cycles (see Intel SDM Vol3b for + * details). 
+ */ + .orig = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alter = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Probably we're lucky and don't have to do + * matching over all config bits. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + /* + * If an event was previously swapped to the alter config + * we should swap it back otherwise contnention on registers + * will return back. + */ + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].orig) { + config_match = p4_event_aliases[i].alter; + break; + } else if (config_match == p4_event_aliases[i].alter) { + config_match = p4_event_aliases[i].orig; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | + (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + static u64 p4_general_events[PERF_COUNT_HW_MAX] = { /* non-halted CPU clocks */ [PERF_COUNT_HW_CPU_CYCLES] = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, /* * retired instructions @@ -719,31 +800,6 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } -static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - /* - * Watchdog ticks are special on Netburst, we use - * that named "non-sleeping" ticks as recommended - * by Intel SDM Vol3b. 
- */ - WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); - wd_attr->type = PERF_TYPE_RAW; - wd_attr->config = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | - p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | - P4_CCCR_COMPARE); -} - static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1159,6 +1215,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign struct p4_event_bind *bind; unsigned int i, thread, num; int cntr_idx, escr_idx; + u64 config_alias; + int pass; bitmap_zero(used_mask, X86_PMC_IDX_MAX); bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); @@ -1167,6 +1225,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign hwc = &cpuc->event_list[i]->hw; thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * Aliases are swappable so we may hit circular + * lock if both original config and alias need + * resources (MSR registers) which already busy. + */ + if (pass > 2) + goto done; + bind = p4_config_get_bind(hwc->config); escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); if (unlikely(escr_idx == -1)) @@ -1180,8 +1249,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign } cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) - goto done; + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Probably an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } p4_pmu_swap_config_ts(hwc, cpu); if (assign) @@ -1218,7 +1296,6 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, - .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a933e3a0398..36491cd5b7d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -200,7 +200,6 @@ static int is_softlockup(unsigned long touch_ts) } #ifdef CONFIG_HARDLOCKUP_DETECTOR -void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { } static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, @@ -372,7 +371,6 @@ static int watchdog_nmi_enable(int cpu) wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - hw_nmi_watchdog_set_attr(wd_attr); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -- cgit v1.2.3-70-g09d2 From 7143f168e2aa4bc7f8a8bcfe46d8dc52f7be869a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:36 +0900 Subject: tracing/kprobes: Rename probe_* to trace_probe_* Rename probe_* to trace_probe_* to avoid namespace conflicts. 
This also fixes improper names of find_probe_event() and cleanup_all_probes() to find_trace_probe() and release_all_trace_probes() respectively. Signed-off-by: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20110627072636.6528.60374.stgit@fedora15 Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7db7b68c6c3..14b88ed65bb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -555,12 +555,12 @@ struct trace_probe { (sizeof(struct probe_arg) * (n))) -static __kprobes int probe_is_return(struct trace_probe *tp) +static __kprobes int trace_probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; } -static __kprobes const char *probe_symbol(struct trace_probe *tp) +static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) { return tp->symbol ? tp->symbol : "unknown"; } @@ -671,7 +671,7 @@ static void free_trace_probe(struct trace_probe *tp) kfree(tp); } -static struct trace_probe *find_probe_event(const char *event, +static struct trace_probe *find_trace_probe(const char *event, const char *group) { struct trace_probe *tp; @@ -686,7 +686,7 @@ static struct trace_probe *find_probe_event(const char *event, /* Unregister a trace_probe and probe_event: call with locking probe_lock */ static void unregister_trace_probe(struct trace_probe *tp) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) unregister_kretprobe(&tp->rp); else unregister_kprobe(&tp->rp.kp); @@ -703,7 +703,7 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); /* register as an event */ - old_tp = find_probe_event(tp->call.name, tp->call.class->system); + old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { /* delete old event */ unregister_trace_probe(old_tp); @@ -716,7 +716,7 @@ static int register_trace_probe(struct trace_probe *tp) } tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) ret = register_kretprobe(&tp->rp); else ret = register_kprobe(&tp->rp.kp); @@ -1025,7 +1025,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } mutex_lock(&probe_lock); - tp = find_probe_event(event, group); + tp = find_trace_probe(event, group); if (!tp) { mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); @@ -1144,7 +1144,7 @@ error: return ret; } -static void cleanup_all_probes(void) +static void release_all_trace_probes(void) { struct trace_probe *tp; @@ -1181,15 +1181,16 @@ static int probes_seq_show(struct seq_file *m, void *v) struct trace_probe *tp = v; int i; - seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, "%c", trace_probe_is_return(tp) ? 
'r' : 'p'); seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); if (!tp->symbol) seq_printf(m, " 0x%p", tp->rp.kp.addr); else if (tp->rp.kp.offset) - seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + seq_printf(m, " %s+%u", trace_probe_symbol(tp), + tp->rp.kp.offset); else - seq_printf(m, " %s", probe_symbol(tp)); + seq_printf(m, " %s", trace_probe_symbol(tp)); for (i = 0; i < tp->nr_args; i++) seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); @@ -1209,7 +1210,7 @@ static int probes_open(struct inode *inode, struct file *file) { if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - cleanup_all_probes(); + release_all_trace_probes(); return seq_open(file, &probes_seq_op); } @@ -1518,7 +1519,7 @@ static int probe_event_enable(struct ftrace_event_call *call) struct trace_probe *tp = (struct trace_probe *)call->data; tp->flags |= TP_FLAG_TRACE; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) return enable_kretprobe(&tp->rp); else return enable_kprobe(&tp->rp.kp); @@ -1530,7 +1531,7 @@ static void probe_event_disable(struct ftrace_event_call *call) tp->flags &= ~TP_FLAG_TRACE; if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); @@ -1598,7 +1599,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) const char *fmt, *arg; - if (!probe_is_return(tp)) { + if (!trace_probe_is_return(tp)) { fmt = "(%lx)"; arg = "REC->" FIELD_STRING_IP; } else { @@ -1722,7 +1723,7 @@ static int probe_perf_enable(struct ftrace_event_call *call) tp->flags |= TP_FLAG_PROFILE; - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) return enable_kretprobe(&tp->rp); else return enable_kprobe(&tp->rp.kp); @@ -1735,7 +1736,7 @@ static void probe_perf_disable(struct ftrace_event_call *call) tp->flags &= ~TP_FLAG_PROFILE; if (!(tp->flags & TP_FLAG_TRACE)) { - if (probe_is_return(tp)) + if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); @@ -1807,7 +1808,7 @@ static int register_probe_event(struct trace_probe *tp) /* Initialize ftrace_event_call */ INIT_LIST_HEAD(&call->class->fields); - if (probe_is_return(tp)) { + if (trace_probe_is_return(tp)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; } else { @@ -1899,7 +1900,7 @@ static __init int kprobe_trace_self_tests_init(void) warn++; } else { /* Enable trace point */ - tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); + tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { pr_warning("error on getting new probe.\n"); warn++; @@ -1914,7 +1915,7 @@ static __init int kprobe_trace_self_tests_init(void) warn++; } else { /* Enable trace point */ - tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); + tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { pr_warning("error on getting new probe.\n"); warn++; @@ -1940,7 +1941,7 @@ static __init int kprobe_trace_self_tests_init(void) } end: - cleanup_all_probes(); + release_all_trace_probes(); if (warn) pr_cont("NG: Some tests are failed. 
Please check them.\n"); else -- cgit v1.2.3-70-g09d2 From f7bc8b61f65726ff98f52e286b28e294499d7a08 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Jul 2011 23:02:27 -0400 Subject: ftrace: Fix regression where ftrace breaks when modules are loaded Enabling function tracer to trace all functions, then load a module and then disable function tracing will cause ftrace to fail. This can also happen by enabling function tracing on the command line: ftrace=function and during boot up, modules are loaded, then you disable function tracing with 'echo nop > current_tracer' you will trigger a bug in ftrace that will shut itself down. The reason is, the new ftrace code keeps ref counts of all ftrace_ops that are registered for tracing. When one or more ftrace_ops are registered, all the records that represent the functions that the ftrace_ops will trace have a ref count incremented. If this ref count is not zero, when the code modification runs, that function will be enabled for tracing. If the ref count is zero, that function will be disabled from tracing. To make sure the accounting was working, FTRACE_WARN_ON()s were added to updating of the ref counts. If the ref count hits its max (> 2^30 ftrace_ops added), or if the ref count goes below zero, a FTRACE_WARN_ON() is triggered which disables all modification of code. Since it is common for ftrace_ops to trace all functions in the kernel, instead of creating > 20,000 hash items for the ftrace_ops, the hash count is just set to zero, and it represents that the ftrace_ops is to trace all functions. This is where the issues arrise. If you enable function tracing to trace all functions, and then add a module, the modules function records do not get the ref count updated. When the function tracer is disabled, all function records ref counts are subtracted. Since the modules never had their ref counts incremented, they go below zero and the FTRACE_WARN_ON() is triggered. The solution to this is rather simple. When modules are loaded, and their functions are added to the the ftrace pool, look to see if any ftrace_ops are registered that trace all functions. And for those, update the ref count for the module function records. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1c4c0b087e1..ef9271b69b4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1744,10 +1744,36 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; +static int ops_traces_mod(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash; + + hash = ops->filter_hash; + return !!(!hash || !hash->count); +} + static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p; cycle_t start, stop; + unsigned long ref = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are set to trace all functions. + * If they are, we need to enable the module functions as well + * as update the reference counts for those function records. 
+ */ + if (mod) { + struct ftrace_ops *ops; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + if (ops->flags & FTRACE_OPS_FL_ENABLED && + ops_traces_mod(ops)) + ref++; + } + } start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -1760,7 +1786,7 @@ static int ftrace_update_code(struct module *mod) p = ftrace_new_addrs; ftrace_new_addrs = p->newlist; - p->flags = 0L; + p->flags = ref; /* * Do the initial record conversion from mcount jump @@ -1783,7 +1809,7 @@ static int ftrace_update_code(struct module *mod) * conversion puts the module to the correct state, thus * passing the ftrace_make_call check. */ - if (ftrace_start_up) { + if (ftrace_start_up && ref) { int failed = __ftrace_replace_code(p, 1); if (failed) { ftrace_bug(failed, p->ip); -- cgit v1.2.3-70-g09d2 From 1538f888f1e793de04e0f90372352ac1b05833cf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:44 +0900 Subject: tracing/kprobes: Merge trace probe enable/disable functions Merge redundant enable/disable functions into enable_trace_probe() and disable_trace_probe(). Signed-off-by: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: yrl.pp-manager.tt@hitachi.com Cc: Frederic Weisbecker Cc: Ingo Molnar Link: http://lkml.kernel.org/r/20110627072644.6528.26910.stgit@fedora15 [ converted kprobe selftest to use enable_trace_probe ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 92 ++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 14b88ed65bb..3be50c52d08 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -683,6 +683,34 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } +/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ +static int enable_trace_probe(struct trace_probe *tp, int flag) +{ + int ret = 0; + + tp->flags |= flag; + if (tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)) { + if (trace_probe_is_return(tp)) + ret = enable_kretprobe(&tp->rp); + else + ret = enable_kprobe(&tp->rp.kp); + } + + return ret; +} + +/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ +static void disable_trace_probe(struct trace_probe *tp, int flag) +{ + tp->flags &= ~flag; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (trace_probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + /* Unregister a trace_probe and probe_event: call with locking probe_lock */ static void unregister_trace_probe(struct trace_probe *tp) { @@ -1514,30 +1542,6 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -static int probe_event_enable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags |= TP_FLAG_TRACE; - if (trace_probe_is_return(tp)) - return enable_kretprobe(&tp->rp); - else - return enable_kprobe(&tp->rp.kp); -} - -static void probe_event_disable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags &= ~TP_FLAG_TRACE; - if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); - else - disable_kprobe(&tp->rp.kp); - } -} - #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ do { \ @@ -1716,49 +1720,25 @@ static __kprobes void kretprobe_perf_func(struct 
kretprobe_instance *ri, head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } - -static int probe_perf_enable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags |= TP_FLAG_PROFILE; - - if (trace_probe_is_return(tp)) - return enable_kretprobe(&tp->rp); - else - return enable_kprobe(&tp->rp.kp); -} - -static void probe_perf_disable(struct ftrace_event_call *call) -{ - struct trace_probe *tp = (struct trace_probe *)call->data; - - tp->flags &= ~TP_FLAG_PROFILE; - - if (!(tp->flags & TP_FLAG_TRACE)) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); - else - disable_kprobe(&tp->rp.kp); - } -} #endif /* CONFIG_PERF_EVENTS */ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) { + struct trace_probe *tp = (struct trace_probe *)event->data; + switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(event); + return enable_trace_probe(tp, TP_FLAG_TRACE); case TRACE_REG_UNREGISTER: - probe_event_disable(event); + disable_trace_probe(tp, TP_FLAG_TRACE); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_perf_enable(event); + return enable_trace_probe(tp, TP_FLAG_PROFILE); case TRACE_REG_PERF_UNREGISTER: - probe_perf_disable(event); + disable_trace_probe(tp, TP_FLAG_PROFILE); return 0; #endif } @@ -1905,7 +1885,7 @@ static __init int kprobe_trace_self_tests_init(void) pr_warning("error on getting new probe.\n"); warn++; } else - probe_event_enable(&tp->call); + enable_trace_probe(tp, TP_FLAG_TRACE); } ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " @@ -1920,7 +1900,7 @@ static __init int kprobe_trace_self_tests_init(void) pr_warning("error on getting new probe.\n"); warn++; } else - probe_event_enable(&tp->call); + enable_trace_probe(tp, TP_FLAG_TRACE); } if (warn) -- cgit v1.2.3-70-g09d2 From bc81d48d13d8839fae6833c95794c403b2133f36 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:50 +0900 Subject: kprobes: Return -ENOENT if probe point doesn't exist Return -ENOENT if probe point doesn't exist, but still returns -EINVAL if both of kprobe->addr and kprobe->symbol_name are specified or both are not specified. Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Anil S Keshavamurthy Cc: "David S. Miller" Link: http://lkml.kernel.org/r/20110627072650.6528.67329.stgit@fedora15 Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 77981813a1e..b30fd54eb98 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr) /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. + * This returns encoded errors if it fails to look up symbol or invalid + * combination of parameters. 
*/ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) { kprobe_opcode_t *addr = p->addr; + + if ((p->symbol_name && p->addr) || + (!p->symbol_name && !p->addr)) + goto invalid; + if (p->symbol_name) { - if (addr) - return NULL; kprobe_lookup_name(p->symbol_name, addr); + if (!addr) + return ERR_PTR(-ENOENT); } - if (!addr) - return NULL; - return (kprobe_opcode_t *)(((char *)addr) + p->offset); + addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); + if (addr) + return addr; + +invalid: + return ERR_PTR(-EINVAL); } /* Check passed kprobe is valid and return kprobe in kprobe_table. */ @@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p) kprobe_opcode_t *addr; addr = kprobe_addr(p); - if (!addr) - return -EINVAL; + if (IS_ERR(addr)) + return PTR_ERR(addr); p->addr = addr; ret = check_kprobe_rereg(p); @@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p) */ probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { + /* Return -ENOENT if fail. */ + ret = -ENOENT; /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p) module_put(probed_mod); goto fail_with_jump_label; } + /* ret will be updated by following code */ } preempt_enable(); jump_label_unlock(); @@ -1399,7 +1412,7 @@ out: fail_with_jump_label: preempt_enable(); jump_label_unlock(); - return -EINVAL; + return ret; } EXPORT_SYMBOL_GPL(register_kprobe); @@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); - if (!addr) - return -EINVAL; + if (IS_ERR(addr)) + return PTR_ERR(addr); for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { if (kretprobe_blacklist[i].addr == addr) -- cgit v1.2.3-70-g09d2 From 614243181050436785f5a621749a7da2336a7916 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:26:56 +0900 Subject: tracing/kprobes: Support module init function probing To support probing module init functions, kprobe-tracer allows the user to define a probe on a function that does not exist yet, as long as it is given with a module name. This also enables the user to set a probe on a function in a specific module, even if a function with the same name is locally defined in another module. The module name must come in front of the function name, separated by a ':'. e.g. btrfs:btrfs_init_sysfs Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072656.6528.89970.stgit@fedora15 Signed-off-by: Steven Rostedt --- Documentation/trace/kprobetrace.txt | 9 +- kernel/trace/trace_kprobe.c | 164 ++++++++++++++++++++++++++++++------ 2 files changed, 143 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index c83bd6b4e6e..d0d0bb9e3e2 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -22,14 +22,15 @@ current_tracer. Instead of that, add probe points via Synopsis of kprobe_events ------------------------- - p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe + p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe + r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe -:[GRP/]EVENT : Clear a probe GRP : Group name. If omitted, use "kprobes" for it.
EVENT : Event name. If omitted, the event name is generated - based on SYMBOL+offs or MEMADDR. - SYMBOL[+offs] : Symbol+offset where the probe is inserted. + based on SYM+offs or MEMADDR. + MOD : Module name which has given SYM. + SYM[+offs] : Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. FETCHARGS : Arguments. Each probe can have up to 128 args. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3be50c52d08..acc6664a6b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -536,6 +536,7 @@ struct probe_arg { /* Flags for trace_probe */ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 +#define TP_FLAG_REGISTERED 4 struct trace_probe { struct list_head list; @@ -565,6 +566,39 @@ static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } +static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) +{ + return tp->rp.kp.offset; +} + +static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) +{ + return !!(tp->flags & TP_FLAG_REGISTERED); +} + +static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) +{ + return !!(kprobe_gone(&tp->rp.kp)); +} + +static __kprobes bool trace_probe_within_module(struct trace_probe *tp, + struct module *mod) +{ + int len = strlen(mod->name); + const char *name = trace_probe_symbol(tp); + return strncmp(mod->name, name, len) == 0 && name[len] == ':'; +} + +static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) +{ + return !!strchr(trace_probe_symbol(tp), ':'); +} + static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -689,7 +723,8 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) int ret = 0; tp->flags |= flag; - if (tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)) { + if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && + !trace_probe_has_gone(tp)) { if (trace_probe_is_return(tp)) ret = enable_kretprobe(&tp->rp); else @@ -703,7 +738,7 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) static void disable_trace_probe(struct trace_probe *tp, int flag) { tp->flags &= ~flag; - if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else @@ -711,13 +746,64 @@ static void disable_trace_probe(struct trace_probe *tp, int flag) } } -/* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static void unregister_trace_probe(struct trace_probe *tp) +/* Internal register function - just handle k*probes and flags */ +static int __register_trace_probe(struct trace_probe *tp) { + int ret; + + if (trace_probe_is_registered(tp)) + return -EINVAL; + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(tp)) + tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + else + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (trace_probe_is_return(tp)) - unregister_kretprobe(&tp->rp); + ret = register_kretprobe(&tp->rp); else - unregister_kprobe(&tp->rp.kp); + ret = register_kprobe(&tp->rp.kp); + + if (ret == 0) + tp->flags |= TP_FLAG_REGISTERED; + else { + pr_warning("Could not insert probe at %s+%lu: %d\n", + trace_probe_symbol(tp), trace_probe_offset(tp), ret); + if (ret == 
-ENOENT && trace_probe_is_on_module(tp)) { + pr_warning("This probe might be able to register after" + "target module is loaded. Continue.\n"); + ret = 0; + } else if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + } + + return ret; +} + +/* Internal unregister function - just handle k*probes and flags */ +static void __unregister_trace_probe(struct trace_probe *tp) +{ + if (trace_probe_is_registered(tp)) { + if (trace_probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + tp->flags &= ~TP_FLAG_REGISTERED; + /* Cleanup kprobe for reuse */ + if (tp->rp.kp.symbol_name) + tp->rp.kp.addr = NULL; + } +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + __unregister_trace_probe(tp); list_del(&tp->list); unregister_probe_event(tp); } @@ -730,41 +816,65 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); - /* register as an event */ + /* Delete old (same name) event if exist */ old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { - /* delete old event */ unregister_trace_probe(old_tp); free_trace_probe(old_tp); } + + /* Register new event */ ret = register_probe_event(tp); if (ret) { pr_warning("Failed to register probe event(%d)\n", ret); goto end; } - tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (trace_probe_is_return(tp)) - ret = register_kretprobe(&tp->rp); - else - ret = register_kprobe(&tp->rp.kp); - - if (ret) { - pr_warning("Could not insert probe(%d)\n", ret); - if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tp->rp.kp.addr); - ret = -EINVAL; - } + /* Register k*probe */ + ret = __register_trace_probe(tp); + if (ret < 0) unregister_probe_event(tp); - } else + else list_add_tail(&tp->list, &probe_list); + end: mutex_unlock(&probe_lock); return ret; } +/* Module notifier call back, checking event on the module */ +static int trace_probe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct trace_probe *tp; + int ret; + + if (val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + /* Update probes on coming module */ + mutex_lock(&probe_lock); + list_for_each_entry(tp, &probe_list, list) { + if (trace_probe_within_module(tp, mod)) { + __unregister_trace_probe(tp); + ret = __register_trace_probe(tp); + if (ret) + pr_warning("Failed to re-register probe %s on" + "%s: %d\n", + tp->call.name, mod->name, ret); + } + } + mutex_unlock(&probe_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block trace_probe_module_nb = { + .notifier_call = trace_probe_module_callback, + .priority = 1 /* Invoked after kprobe module callback */ +}; + /* Split symbol and offset. 
*/ static int split_symbol_offset(char *symbol, unsigned long *offset) { @@ -990,8 +1100,8 @@ static int create_trace_probe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] - * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -1186,7 +1296,6 @@ static void release_all_trace_probes(void) mutex_unlock(&probe_lock); } - /* Probes listing interfaces */ static void *probes_seq_start(struct seq_file *m, loff_t *pos) { @@ -1827,6 +1936,9 @@ static __init int init_kprobe_trace(void) struct dentry *d_tracer; struct dentry *entry; + if (register_module_notifier(&trace_probe_module_nb)) + return -EINVAL; + d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; -- cgit v1.2.3-70-g09d2 From 7f6878a3d707b947603e09d95df0c3a98987e3a4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 27 Jun 2011 16:27:03 +0900 Subject: tracing/kprobe: Update symbol reference when loading module Since the address of a module-local variable can only be solved after the target module is loaded, the symbol fetch-argument should be updated when loading target module. Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20110627072703.6528.75042.stgit@fedora15 Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index acc6664a6b2..5fb3697bf0e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) DEFINE_FETCH_deref(string_size) +static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) @@ -376,6 +384,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL +static __kprobes void +update_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. 
+ */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + static __kprobes void free_bitfield_fetch_param(struct bitfield_fetch_param *data) { @@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) free_symbol_cache(data->orig.data); kfree(data); } + /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -680,6 +702,16 @@ error: return ERR_PTR(ret); } +static void update_probe_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + update_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + update_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + update_symbol_cache(arg->fetch.data); +} + static void free_probe_arg(struct probe_arg *arg) { if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) @@ -749,11 +781,14 @@ static void disable_trace_probe(struct trace_probe *tp, int flag) /* Internal register function - just handle k*probes and flags */ static int __register_trace_probe(struct trace_probe *tp) { - int ret; + int i, ret; if (trace_probe_is_registered(tp)) return -EINVAL; + for (i = 0; i < tp->nr_args; i++) + update_probe_arg(&tp->args[i]); + /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(tp)) tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; -- cgit v1.2.3-70-g09d2 From 3b5fe85252326217cd96f24a7bda4460d8f71bee Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Sun, 12 Jun 2011 15:57:05 +0200 Subject: PM / Suspend: Add .suspend_again() callback to suspend_ops A system or a device may need to control suspend/wakeup events. It may want to wake up the system after a predefined amount of time or at a predefined event decided while entering suspend, for polling or delayed work. Then, it may want to enter suspend again if its predefined wakeup condition is the only wakeup reason and there are no outstanding events; thus, it does not needlessly wake up userspace or unrelated devices and stays suspended as long as possible, saving power. Enabling a system to wake up after a specified time can easily be achieved by using an RTC. However, to enter suspend again immediately without invoking userland and unrelated devices, we need additional features in the suspend framework. Such a need comes from: 1. Monitoring a critical device status without interrupts that can wake up the system (in-suspend polling). An example is ambient temperature monitoring that needs to shut down the system or a specific device function if it is too hot or cold. The temperature of a specific device may need to be monitored as well; e.g., a charger monitors battery temperature in order to stop charging if overheated. 2. Executing critical "delayed work" during suspend. A driver or a system/board may have delayed work (or anything similar) that it wants to execute at a requested time. For example, some chargers want to check the battery voltage some time (e.g., 30 seconds) after the battery is fully charged and the charger has stopped. Then, the charger restarts charging if the voltage has dropped by more than a threshold that is smaller than the "restart-charger" voltage, which is the threshold to restart charging regardless of the time passed.
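To make the intended usage concrete, here is a minimal sketch of how a platform might hook into this interface; the board_* names and the battery-polling helper are hypothetical, invented for this example, and are not part of the patch:

	/* Hypothetical board code: poll a charger while staying suspended. */
	static bool board_suspend_again(void)
	{
		/*
		 * board_battery_ok() is an assumed helper that inspects the
		 * charger after an RTC-programmed wakeup. Returning true
		 * re-enters suspend without waking userspace or other devices.
		 */
		return board_battery_ok();
	}

	static const struct platform_suspend_ops board_suspend_ops = {
		.valid		= suspend_valid_only_mem,
		.enter		= board_suspend_enter,	/* assumed to exist */
		.suspend_again	= board_suspend_again,
	};

	/* called once from the board's init code */
	suspend_set_ops(&board_suspend_ops);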
This patch adds a "suspend_again" callback to struct platform_suspend_ops and lets the callback return true if the system is required to enter suspend again after the current instance of wakeup. A device-wise suspend_again implemented in dev_pm_ops or syscore is not done because: a) the suspend_again feature is usually a platform-wide decision and controls the behavior of the whole platform, and b) there are very few devices related to the use cases of suspend_again; chargers and temperature sensors are mentioned so far. With a suspend_again callback registered in the struct platform_suspend_ops suspend_ops in kernel/power/suspend.c via suspend_set_ops by the platform, the suspend framework tries to enter suspend again by looping suspend_enter() if suspend_again has returned true and there have been no errors in the suspending sequence and no pending wakeups (per pm_wakeup_pending). Tested on Exynos4-NURI. [rjw: Fixed up kerneldoc comment for suspend_enter().] Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 8 ++++++++ kernel/power/suspend.c | 18 ++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 083ffea7ba1..e1e3742733b 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -92,6 +92,13 @@ typedef int __bitwise suspend_state_t; * @enter() and @wake(), even if any of them fails. It is executed after * a failing @prepare. * + * @suspend_again: Returns whether the system should suspend again (true) or + * not (false). If the platform wants to poll sensors or execute some + * code during suspended without invoking userspace and most of devices, + * suspend_again callback is the place assuming that periodic-wakeup or + * alarm-wakeup is already setup. This allows to execute some codes while + * being kept suspended in the view of userland and devices. + * * @end: Called by the PM core right after resuming devices, to indicate to * the platform that the system has returned to the working state or * the transition to the sleep state has been aborted. @@ -113,6 +120,7 @@ struct platform_suspend_ops { int (*enter)(suspend_state_t state); void (*wake)(void); void (*finish)(void); + bool (*suspend_again)(void); void (*end)(void); void (*recover)(void); }; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1c41ba21541..b6762f43365 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -126,12 +126,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) } /** - * suspend_enter - enter the desired system sleep state. - * @state: state to enter + * suspend_enter - enter the desired system sleep state. + * @state: State to enter + * @wakeup: Returns information that suspend should not be entered again. * - * This function should be called after devices have been suspended. + * This function should be called after devices have been suspended.
*/ -static int suspend_enter(suspend_state_t state) +static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; @@ -165,7 +166,8 @@ static int suspend_enter(suspend_state_t state) error = syscore_suspend(); if (!error) { - if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { + *wakeup = pm_wakeup_pending(); + if (!(suspend_test(TEST_CORE) || *wakeup)) { error = suspend_ops->enter(state); events_check_enabled = false; } @@ -199,6 +201,7 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; + bool wakeup = false; if (!suspend_ops) return -ENOSYS; @@ -220,7 +223,10 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_test(TEST_DEVICES)) goto Recover_platform; - error = suspend_enter(state); + do { + error = suspend_enter(state, &wakeup); + } while (!error && !wakeup + && suspend_ops->suspend_again && suspend_ops->suspend_again()); Resume_devices: suspend_test_start(); -- cgit v1.2.3-70-g09d2 From a5e4fd8783a2bec861ecf1138cdc042269ff59aa Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Mon, 27 Jun 2011 01:01:07 +0200 Subject: PM / Suspend: Export suspend_set_ops, suspend_valid_only_mem Some platforms wish to implement their PM core suspend code as modules. To do so, these functions need to be exported to modules. [rjw: Replaced EXPORT_SYMBOL with EXPORT_SYMBOL_GPL] Reported-by: Jean Pihet Signed-off-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b6762f43365..b6b71ad2208 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) suspend_ops = ops; mutex_unlock(&pm_mutex); } +EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { @@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state) { return state == PM_SUSPEND_MEM; } +EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static int suspend_test(int level) { -- cgit v1.2.3-70-g09d2 From f0c077a8b7f9dce590c760a7b2f3c417dffa52d1 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 8 Jul 2011 20:53:36 +0200 Subject: PM: Improve error code of pm_notifier_call_chain() This enables pm_notifier_call_chain() to get the actual error code in the callback rather than always assume -EINVAL by converting all PM notifier calls to return encapsulate error code with notifier_from_errno(). Signed-off-by: Akinobu Mita Signed-off-by: Rafael J. 
Wysocki --- drivers/char/apm-emulation.c | 2 +- drivers/s390/char/vmwatchdog.c | 4 ++-- drivers/s390/cio/css.c | 8 ++++---- kernel/power/main.c | 5 +++-- 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c index 548708c4b2b..a7346ab97a3 100644 --- a/drivers/char/apm-emulation.c +++ b/drivers/char/apm-emulation.c @@ -606,7 +606,7 @@ static int apm_suspend_notifier(struct notifier_block *nb, return NOTIFY_OK; /* interrupted by signal */ - return NOTIFY_BAD; + return notifier_from_errno(err); case PM_POST_SUSPEND: /* diff --git a/drivers/s390/char/vmwatchdog.c b/drivers/s390/char/vmwatchdog.c index 12ef9121d4f..11312f401c7 100644 --- a/drivers/s390/char/vmwatchdog.c +++ b/drivers/s390/char/vmwatchdog.c @@ -258,13 +258,13 @@ static int vmwdt_suspend(void) if (test_and_set_bit(VMWDT_OPEN, &vmwdt_is_open)) { pr_err("The system cannot be suspended while the watchdog" " is in use\n"); - return NOTIFY_BAD; + return notifier_from_errno(-EBUSY); } if (test_bit(VMWDT_RUNNING, &vmwdt_is_open)) { clear_bit(VMWDT_OPEN, &vmwdt_is_open); pr_err("The system cannot be suspended while the watchdog" " is running\n"); - return NOTIFY_BAD; + return notifier_from_errno(-EBUSY); } return NOTIFY_DONE; } diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index c47b25fd3f4..92d7324acb1 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -814,8 +814,8 @@ static int css_power_event(struct notifier_block *this, unsigned long event, mutex_unlock(&css->mutex); continue; } - if (__chsc_do_secm(css, 0)) - ret = NOTIFY_BAD; + ret = __chsc_do_secm(css, 0); + ret = notifier_from_errno(ret); mutex_unlock(&css->mutex); } break; @@ -831,8 +831,8 @@ static int css_power_event(struct notifier_block *this, unsigned long event, mutex_unlock(&css->mutex); continue; } - if (__chsc_do_secm(css, 1)) - ret = NOTIFY_BAD; + ret = __chsc_do_secm(css, 1); + ret = notifier_from_errno(ret); mutex_unlock(&css->mutex); } /* search for subchannels, which appeared during hibernation */ diff --git a/kernel/power/main.c b/kernel/power/main.c index 2981af4ce7c..6c601f87196 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier); int pm_notifier_call_chain(unsigned long val) { - return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) - == NOTIFY_BAD) ? -EINVAL : 0; + int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + + return notifier_to_errno(ret); } /* If set, devices may be suspended and resumed asynchronously. */ -- cgit v1.2.3-70-g09d2 From 961c4675c75112717705fa5c0c53cb9664051479 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 7 Jul 2011 21:33:54 +0200 Subject: has_stopped_jobs: s/task_is_stopped/SIGNAL_STOP_STOPPED/ has_stopped_jobs() naively checks task_is_stopped(group_leader). This was always wrong even without ptrace, group_leader can be dead. And given that ptrace can change the state to TRACED this is wrong even in the single-threaded case. Change the code to check SIGNAL_STOP_STOPPED and simplify the code, retval + break/continue doesn't make this trivial code more readable. We could probably add the usual "|| signal->group_stop_count" check but I don't think this makes sense, the task can start the group-stop right after the check anyway. 
Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- kernel/exit.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b8d3b47bb88..6c7fbbe7d86 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -266,18 +266,16 @@ int is_current_pgrp_orphaned(void) return retval; } -static int has_stopped_jobs(struct pid *pgrp) +static bool has_stopped_jobs(struct pid *pgrp) { - int retval = 0; struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (!task_is_stopped(p)) - continue; - retval = 1; - break; + if (p->signal->flags & SIGNAL_STOP_STOPPED) + return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return retval; + + return false; } /* -- cgit v1.2.3-70-g09d2 From dcace06cc29df927a74a6bc0e57b9bef87704377 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Jul 2011 19:13:54 +0200 Subject: ptrace: mv send-SIGSTOP from do_fork() to ptrace_init_task() If the new child is traced, do_fork() adds the pending SIGSTOP. It assumes that either it is traced because of auto-attach or the tracer attached later, in both cases sigaddset/set_thread_flag is correct even if SIGSTOP is already pending. Now that we have PTRACE_SEIZE this is no longer right in the latter case. If the tracer does PTRACE_SEIZE after copy_process() makes the child visible the queued SIGSTOP is wrong. We could check PT_SEIZED bit and change ptrace_attach() to set both PT_PTRACED and PT_SEIZED bits simultaneously but see the next patch, we need to know whether this child was auto-attached or not anyway. So this patch simply moves this code to ptrace_init_task(), this way we can never race with ptrace_attach(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- include/linux/ptrace.h | 3 +++ kernel/fork.c | 12 ------------ 2 files changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index fd8669fc339..9b5d2c901d0 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -227,6 +227,9 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; __ptrace_link(child, current->parent); + + sigaddset(&child->pending.signal, SIGSTOP); + set_tsk_thread_flag(child, TIF_SIGPENDING); } } diff --git a/kernel/fork.c b/kernel/fork.c index 3c72a5b321a..4d4117e0150 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -1521,17 +1520,6 @@ long do_fork(unsigned long clone_flags, audit_finish_fork(p); - /* - * Child is ready but hasn't started running yet. Queue - * SIGSTOP if it's gonna be ptraced - it doesn't matter who - * attached/attaching to this task, the pending SIGSTOP is - * right in any case. - */ - if (unlikely(p->ptrace)) { - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - } - /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that -- cgit v1.2.3-70-g09d2 From f701e5b73a1a79ea62ffd45d9e2bed4c7d5c1fd2 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Fri, 15 Jul 2011 20:45:18 +0300 Subject: connector: add an event for monitoring process tracers This change adds a procfs connector event, which is emitted on every successful process tracer attach or detach. If some process connects to other one, kernelspace connector reports process id and thread group id of both these involved processes. 
On disconnection, a null process id is returned. Such an event makes it possible to create a simple automated userspace mechanism that is aware of processes attaching to others, so that predefined process policies can be applied to them if needed. Note that a detach event is emitted only if a tracer process explicitly executes a PTRACE_DETACH request. In other cases, such as tracee or tracer exit, no detach event is reported by the proc connector. Signed-off-by: Vladimir Zapolskiy Acked-by: Evgeniy Polyakov Cc: David S. Miller Signed-off-by: Oleg Nesterov --- drivers/connector/cn_proc.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/cn_proc.h | 13 +++++++++++++ kernel/ptrace.c | 7 ++++++- 3 files changed, 54 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 2b46a7efa0a..281902d3f7e 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -166,6 +167,40 @@ void proc_sid_connector(struct task_struct *task) cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } +void proc_ptrace_connector(struct task_struct *task, int ptrace_id) +{ + struct cn_msg *msg; + struct proc_event *ev; + struct timespec ts; + __u8 buffer[CN_PROC_MSG_SIZE]; + struct task_struct *tracer; + + if (atomic_read(&proc_event_num_listeners) < 1) + return; + + msg = (struct cn_msg *)buffer; + ev = (struct proc_event *)msg->data; + get_seq(&msg->seq, &ev->cpu); + ktime_get_ts(&ts); /* get high res monotonic timestamp */ + put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); + ev->what = PROC_EVENT_PTRACE; + ev->event_data.ptrace.process_pid = task->pid; + ev->event_data.ptrace.process_tgid = task->tgid; + if (ptrace_id == PTRACE_ATTACH) { + ev->event_data.ptrace.tracer_pid = current->pid; + ev->event_data.ptrace.tracer_tgid = current->tgid; + } else if (ptrace_id == PTRACE_DETACH) { + ev->event_data.ptrace.tracer_pid = 0; + ev->event_data.ptrace.tracer_tgid = 0; + } else + return; + + memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); + msg->ack = 0; /* not used */ + msg->len = sizeof(*ev); + cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); +} + void proc_exit_connector(struct task_struct *task) { struct cn_msg *msg; diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h index 47dac5ea8d3..12c517b51ca 100644 --- a/include/linux/cn_proc.h +++ b/include/linux/cn_proc.h @@ -53,6 +53,7 @@ struct proc_event { PROC_EVENT_UID = 0x00000004, PROC_EVENT_GID = 0x00000040, PROC_EVENT_SID = 0x00000080, + PROC_EVENT_PTRACE = 0x00000100, /* "next" should be 0x00000400 */ /* "last" is the last process event: exit */ PROC_EVENT_EXIT = 0x80000000 @@ -95,6 +96,13 @@ struct proc_event { __kernel_pid_t process_tgid; } sid; + struct ptrace_proc_event { + __kernel_pid_t process_pid; + __kernel_pid_t process_tgid; + __kernel_pid_t tracer_pid; + __kernel_pid_t tracer_tgid; + } ptrace; + struct exit_proc_event { __kernel_pid_t process_pid; __kernel_pid_t process_tgid; @@ -109,6 +117,7 @@ void proc_fork_connector(struct task_struct *task); void proc_exec_connector(struct task_struct *task); void proc_id_connector(struct task_struct *task, int which_id); void proc_sid_connector(struct task_struct *task); +void proc_ptrace_connector(struct task_struct *task, int which_id); void proc_exit_connector(struct task_struct *task); #else static inline void proc_fork_connector(struct task_struct *task) @@ -124,6 +133,10 @@ static inline void proc_id_connector(struct task_struct *task,
static inline void proc_sid_connector(struct task_struct *task) {} +static inline void proc_ptrace_connector(struct task_struct *task, + int ptrace_id) +{} + static inline void proc_exit_connector(struct task_struct *task) {} #endif /* CONFIG_PROC_EVENTS */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d7ccc79454f..9de3ecfd20f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -23,6 +23,7 @@ #include #include #include +#include static int ptrace_trapping_sleep_fn(void *flags) @@ -305,9 +306,12 @@ unlock_tasklist: unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: - if (!retval) + if (!retval) { wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); + proc_ptrace_connector(task, PTRACE_ATTACH); + } + return retval; } @@ -415,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) } write_unlock_irq(&tasklist_lock); + proc_ptrace_connector(child, PTRACE_DETACH); if (unlikely(dead)) release_task(child); -- cgit v1.2.3-70-g09d2 From e78e8f2d8318851d0911039999c903a6082bef2e Mon Sep 17 00:00:00 2001 From: Peter Foley Date: Tue, 5 Jul 2011 19:42:18 -0400 Subject: kernel: prevent unnecessary rebuilding due to config_data.gz When IKCONFIG is built-in make oldconfig will cause the kernel to be relinked even if .config didn't change. This happens because of a config_data.gz dependency on .config. This patch changes the if_changed to a filechk so that config_data.h is only rebuilt when the contents have actually changed. Signed-off-by: Peter Foley Signed-off-by: Michal Marek --- kernel/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2d64cfcc8b4..d06467fc8f7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -125,11 +125,10 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) -quiet_cmd_ikconfiggz = IKCFG $@ - cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE - $(call if_changed,ikconfiggz) + $(call filechk,ikconfiggz) $(obj)/time.o: $(obj)/timeconst.h -- cgit v1.2.3-70-g09d2 From 3bfa784a6539f91a27d7ffdd408efdb638e3bebd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jun 2011 12:55:10 -0400 Subject: kill file_permission() completely convert the last remaining caller to inode_permission() Signed-off-by: Al Viro --- fs/namei.c | 18 ------------------ include/linux/fs.h | 1 - kernel/cgroup.c | 3 ++- 3 files changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index cf2554635a1..4ad2b781a65 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -303,23 +303,6 @@ int inode_permission(struct inode *inode, int mask) return security_inode_permission(inode, mask); } -/** - * file_permission - check for additional access rights to a given file - * @file: file to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * - * Used to check for read/write/execute permissions on an already opened - * file. - * - * Note: - * Do not use this function in new code. All access checks should - * be done using inode_permission(). 
- */ -int file_permission(struct file *file, int mask) -{ - return inode_permission(file->f_path.dentry->d_inode, mask); -} - /* * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. @@ -3405,7 +3388,6 @@ EXPORT_SYMBOL(kern_path_parent); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(file_permission); EXPORT_SYMBOL(unlock_rename); EXPORT_SYMBOL(vfs_create); EXPORT_SYMBOL(vfs_follow_link); diff --git a/include/linux/fs.h b/include/linux/fs.h index a8735e7e1b3..d04e55586a1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,6 @@ extern void dentry_unhash(struct dentry *dentry); /* * VFS file helper functions. */ -extern int file_permission(struct file *, int); extern void inode_init_owner(struct inode *inode, const struct inode *dir, mode_t mode); /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2731d115d72..e1c72c0f512 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3542,7 +3542,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, } /* the process need read permission on control file */ - ret = file_permission(cfile, MAY_READ); + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); if (ret < 0) goto fail; -- cgit v1.2.3-70-g09d2 From 6657719390cd05be45f4e3b501d8bb46889c0a19 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 28 Jun 2011 15:41:10 -0400 Subject: make sure that nsproxy_cache is initialized early enough Signed-off-by: Al Viro --- include/linux/nsproxy.h | 1 + kernel/fork.c | 1 + kernel/nsproxy.c | 4 +--- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 50d20aba57d..cc37a55ad00 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -68,6 +68,7 @@ void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct fs_struct *); +int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { diff --git a/kernel/fork.c b/kernel/fork.c index 0276c30401a..31fa13e63b7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1574,6 +1574,7 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); + nsproxy_cache_init(); } /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index d6a00f3de15..9aeab4b98c6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -271,10 +271,8 @@ out: return err; } -static int __init nsproxy_cache_init(void) +int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); return 0; } - -module_init(nsproxy_cache_init); -- cgit v1.2.3-70-g09d2 From 9c3f75cbd144014bea6af866a154cc2e73ab2287 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2011 13:00:06 +0200 Subject: sched: Break out cpu_power from the sched_group structure In order to prepare for non-unique sched_groups per domain, we need to carry the cpu_power elsewhere, so put a level of indirection in. 
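In short, every former direct access to the group's power, such as

	avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;

becomes an access through the new sched_group_power pointer,

	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;

(both lines taken from the sched_fair.c hunk below), so that several sched_group instances can later point at a single shared sched_group_power.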
Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-qkho2byuhe4482fuknss40ad@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 +++++++++----- kernel/sched.c | 32 ++++++++++++++++++++++++++------ kernel/sched_fair.c | 46 +++++++++++++++++++++++----------------------- 3 files changed, 58 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 496770a9648..2e5b3c8e2d3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -893,16 +893,20 @@ static inline int sd_power_saving_flags(void) return 0; } -struct sched_group { - struct sched_group *next; /* Must be a circular list */ - atomic_t ref; - +struct sched_group_power { /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a * single CPU. */ - unsigned int cpu_power, cpu_power_orig; + unsigned int power, power_orig; +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + unsigned int group_weight; + struct sched_group_power *sgp; /* * The CPUs this group covers. diff --git a/kernel/sched.c b/kernel/sched.c index 3dc716f6d8a..36c10d25d4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6557,7 +6557,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->cpu_power) { + if (!group->sgp->power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6581,9 +6581,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_POWER_SCALE) { + if (group->sgp->power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + group->sgp->power); } group = group->next; @@ -6777,8 +6777,10 @@ static struct root_domain *alloc_rootdomain(void) static void free_sched_domain(struct rcu_head *rcu) { struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - if (atomic_dec_and_test(&sd->groups->ref)) + if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgp); kfree(sd->groups); + } kfree(sd); } @@ -6945,6 +6947,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; }; struct s_data { @@ -6981,8 +6984,10 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) + if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + } return cpu; } @@ -7020,7 +7025,7 @@ build_sched_groups(struct sched_domain *sd) continue; cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; + sg->sgp->power = 0; for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -7185,6 +7190,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (cpu == cpumask_first(sched_group_cpus(sg))) { WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); *per_cpu_ptr(sdd->sg, cpu) = NULL; + *per_cpu_ptr(sdd->sgp, cpu) = NULL; } } @@ -7234,9 +7240,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; + sdd->sgp = alloc_percpu(struct sched_group_power *); + if (!sdd->sgp) + return -ENOMEM; + for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; + 
struct sched_group_power *sgp; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -7251,6 +7262,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) return -ENOMEM; *per_cpu_ptr(sdd->sg, j) = sg; + + sgp = kzalloc_node(sizeof(struct sched_group_power), + GFP_KERNEL, cpu_to_node(j)); + if (!sgp) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgp, j) = sgp; } } @@ -7268,9 +7286,11 @@ static void __sdt_free(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { kfree(*per_cpu_ptr(sdd->sd, j)); kfree(*per_cpu_ptr(sdd->sg, j)); + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); free_percpu(sdd->sg); + free_percpu(sdd->sgp); } } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 433491c2dc8..c768588e180 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; if (local_group) { this_load = avg_load; @@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->cpu_power_orig = power; + sdg->sgp->power_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->cpu_power = power; + sdg->sgp->power = power; } static void update_group_power(struct sched_domain *sd, int cpu) @@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power += group->cpu_power; + power += group->sgp->power; group = group->next; } while (group != child->groups); - sdg->cpu_power = power; + sdg->sgp->power = power; } /* @@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * If ~90% of the cpu_power is still there, we're good. 
*/ - if (group->cpu_power * 32 > group->cpu_power_orig * 29) + if (group->sgp->power * 32 > group->sgp->power_orig * 29) return 1; return 0; @@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; /* * Consider the group unbalanced when the imbalance is larger @@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); @@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, return; sds->total_load += sgs.group_load; - sds->total_pwr += sg->cpu_power; + sds->total_pwr += sg->sgp->power; /* * In case the child domain prefers tasks go to siblings @@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd, if (this_cpu > busiest_cpu) return 0; - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); return 1; } @@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, scaled_busy_load_per_task = sds->busiest_load_per_task * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->cpu_power; + scaled_busy_load_per_task /= sds->busiest->sgp->power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { @@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. 
*/ - pwr_now += sds->busiest->cpu_power * + pwr_now += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->cpu_power * + pwr_now += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->cpu_power; + sds->busiest->sgp->power; if (sds->max_load > tmp) - pwr_move += sds->busiest->cpu_power * + pwr_move += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->cpu_power < + if (sds->max_load * sds->busiest->sgp->power < sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->cpu_power) / - sds->this->cpu_power; + tmp = (sds->max_load * sds->busiest->sgp->power) / + sds->this->sgp->power; else tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->cpu_power; - pwr_move += sds->this->cpu_power * + sds->this->sgp->power; + pwr_move += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_POWER_SCALE; @@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->cpu_power; + load_above_capacity /= sds->busiest->sgp->power; } /* @@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->cpu_power, - (sds->avg_load - sds->this_load) * sds->this->cpu_power) + *imbalance = min(max_pull * sds->busiest->sgp->power, + (sds->avg_load - sds->this_load) * sds->this->sgp->power) / SCHED_POWER_SCALE; /* -- cgit v1.2.3-70-g09d2 From e3589f6c81e4764d32a25d2a2a0afe54fa344f5c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jul 2011 10:35:52 +0200 Subject: sched: Allow for overlapping sched_domain spans Allow for sched_domain spans that overlap by giving such domains their own sched_group list instead of sharing the sched_groups amongst each-other. This is needed for machines with more than 16 nodes, because sched_domain_node_span() will generate a node mask from the 16 nearest nodes without regard if these masks have any overlap. Currently sched_domains have a sched_group that maps to their child sched_domain span, and since there is no overlap we share the sched_group between the sched_domains of the various CPUs. If however there is overlap, we would need to link the sched_group list in different ways for each cpu, and hence sharing isn't possible. In order to solve this, allocate private sched_groups for each CPU's sched_domain but have the sched_groups share a sched_group_power structure such that we can uniquely track the power. 
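The ownership rule is worth spelling out. Below is a minimal self-contained sketch of the scheme (hypothetical names; the in-tree structures are sched_group and sched_group_power): each CPU builds a private circular list of groups, while the power structure is shared and reference counted, so the last list to be torn down frees it.

struct power {
	atomic_t ref;			/* one reference per group using it */
	unsigned int val;		/* the uniquely tracked cpu power */
};

struct group {
	struct group *next;		/* private, per-CPU circular list */
	struct power *pwr;		/* shared across overlapping lists */
};

static void group_attach_power(struct group *sg, struct power *p)
{
	atomic_inc(&p->ref);		/* pin the shared power structure */
	sg->pwr = p;
}

static void group_free(struct group *sg)
{
	if (atomic_dec_and_test(&sg->pwr->ref))
		kfree(sg->pwr);		/* last referencing list frees it */
	kfree(sg);
}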
Reported-and-tested-by: Anton Blanchard Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-08bxqw9wis3qti9u5inifh3y@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 + kernel/sched.c | 157 +++++++++++++++++++++++++++++++++++++++--------- kernel/sched_features.h | 2 + 3 files changed, 132 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2e5b3c8e2d3..bde99d5358d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -844,6 +844,7 @@ enum cpu_idle_type { #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ +#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ @@ -894,6 +895,7 @@ static inline int sd_power_saving_flags(void) } struct sched_group_power { + atomic_t ref; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a * single CPU. diff --git a/kernel/sched.c b/kernel/sched.c index 36c10d25d4c..921adf6f6fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6774,10 +6774,36 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) + kfree(sg->sgp); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + static void free_sched_domain(struct rcu_head *rcu) { struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - if (atomic_dec_and_test(&sd->groups->ref)) { + + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. 
+ */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { kfree(sd->groups->sgp); kfree(sd->groups); } @@ -6967,15 +6993,73 @@ struct sched_domain_topology_level; typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +#define SDTL_OVERLAP 0x01 + struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; + int flags; struct sd_data data; }; -/* - * Assumes the sched_domain tree is fully constructed - */ +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *child; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(i)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + + child = *per_cpu_ptr(sdd->sd, i); + if (child->child) { + child = child->child; + cpumask_copy(sg_span, sched_domain_span(child)); + } else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + atomic_inc(&sg->sgp->ref); + + if (cpumask_test_cpu(cpu, sg_span)) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -6987,23 +7071,21 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ } return cpu; } /* - * build_sched_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. 
+ * + * Assumes the sched_domain tree is fully constructed */ -static void -build_sched_groups(struct sched_domain *sd) +static int +build_sched_groups(struct sched_domain *sd, int cpu) { struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; @@ -7011,6 +7093,12 @@ build_sched_groups(struct sched_domain *sd) struct cpumask *covered; int i; + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); + + if (cpu != cpumask_first(sched_domain_span(sd))) + return 0; + lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; @@ -7042,6 +7130,8 @@ build_sched_groups(struct sched_domain *sd) last = sg; } last->next = first; + + return 0; } /* @@ -7056,12 +7146,17 @@ build_sched_groups(struct sched_domain *sd) */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - WARN_ON(!sd || !sd->groups); + struct sched_group *sg = sd->groups; - if (cpu != group_first_cpu(sd->groups)) - return; + WARN_ON(!sd || !sg); + + do { + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg = sg->next; + } while (sg != sd->groups); - sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); + if (cpu != group_first_cpu(sg)) + return; update_group_power(sd, cpu); } @@ -7182,16 +7277,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, static void claim_allocations(int cpu, struct sched_domain *sd) { struct sd_data *sdd = sd->private; - struct sched_group *sg = sd->groups; WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (cpu == cpumask_first(sched_group_cpus(sg))) { - WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) *per_cpu_ptr(sdd->sgp, cpu) = NULL; - } } #ifdef CONFIG_SCHED_SMT @@ -7216,7 +7310,7 @@ static struct sched_domain_topology_level default_topology[] = { #endif { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, }, + { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif { NULL, }, @@ -7284,7 +7378,9 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - kfree(*per_cpu_ptr(sdd->sd, j)); + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sg, j)); kfree(*per_cpu_ptr(sdd->sgp, j)); } @@ -7336,8 +7432,11 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) + for (tl = sched_domain_topology; tl->init; tl++) { sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + sd->flags |= SD_OVERLAP; + } while (sd->child) sd = sd->child; @@ -7349,13 +7448,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - get_group(i, sd->private, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (i != cpumask_first(sched_domain_span(sd))) - continue; - - build_sched_groups(sd); + if (sd->flags & SD_OVERLAP) { + if (build_overlap_sched_groups(sd, i)) + goto error; + } else { + if (build_sched_groups(sd, i)) + goto error; + } } } diff --git a/kernel/sched_features.h 
b/kernel/sched_features.h index be40f7371ee..1e7066d76c2 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1) * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, 1) + +SCHED_FEAT(FORCE_SD_OVERLAP, 0) -- cgit v1.2.3-70-g09d2 From d110235d2c331c4f79e0879f51104be79e17a469 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Jul 2011 18:42:57 +0200 Subject: sched: Avoid creating superfluous NUMA domains on non-NUMA systems When creating sched_domains, stop when we've covered the entire target span instead of continuing to create domains, only to later find they're redundant and throw them away again. This avoids single node systems from touching funny NUMA sched_domain creation code and reduces the risks of the new SD_OVERLAP code. Requested-by: Linus Torvalds Signed-off-by: Peter Zijlstra Cc: Anton Blanchard Cc: mahesh@linux.vnet.ibm.com Cc: benh@kernel.crashing.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1311180177.29152.57.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 921adf6f6fa..14168c49a15 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7436,6 +7436,8 @@ static int build_sched_domains(const struct cpumask *cpu_map, sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; } while (sd->child) -- cgit v1.2.3-70-g09d2 From 3b097c46964b07479855b01056c61540b8cadd50 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 18:03:53 +0800 Subject: audit_tree,rcu: Convert call_rcu(__put_tree) to kfree_rcu() The rcu callback __put_tree() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(__put_tree). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Cc: Al Viro Cc: Eric Paris Reviewed-by: Josh Triplett --- kernel/audit_tree.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e99dda04b12..5bf0790497e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree) atomic_inc(&tree->count); } -static void __put_tree(struct rcu_head *rcu) -{ - struct audit_tree *tree = container_of(rcu, struct audit_tree, head); - kfree(tree); -} - static inline void put_tree(struct audit_tree *tree) { if (atomic_dec_and_test(&tree->count)) - call_rcu(&tree->head, __put_tree); + kfree_rcu(tree, head); } /* to avoid bringing the entire thing in audit.h */ -- cgit v1.2.3-70-g09d2 From a95cded32de3deae13af34715200532e6823cc9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 1 May 2011 23:21:00 -0700 Subject: sysctl,rcu: Convert call_rcu(free_head) to kfree The RCU callback free_head just calls kfree(), so we can use kfree_rcu() instead of call_rcu(). Signed-off-by: Paul E. 
McKenney Cc: Andrew Morton Reviewed-by: Josh Triplett --- kernel/sysctl.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f175d98bd35..11d65b531e5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1590,16 +1590,11 @@ void sysctl_head_get(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } -static void free_head(struct rcu_head *rcu) -{ - kfree(container_of(rcu, struct ctl_table_header, rcu)); -} - void sysctl_head_put(struct ctl_table_header *head) { spin_lock(&sysctl_lock); if (!--head->count) - call_rcu(&head->rcu, free_head); + kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); } @@ -1971,10 +1966,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) start_unregistering(header); if (!--header->parent->count) { WARN_ON(1); - call_rcu(&header->parent->rcu, free_head); + kfree_rcu(header->parent, rcu); } if (!--header->count) - call_rcu(&header->rcu, free_head); + kfree_rcu(header, rcu); spin_unlock(&sysctl_lock); } -- cgit v1.2.3-70-g09d2 From cbaa51524b3224813814607177a00c350ee35d12 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 20 Jul 2011 15:42:55 -0700 Subject: time: Fix stupid KERN_WARN compile issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Terribly embarrassing. Don't know how I committed this, but it's KERN_WARNING, not KERN_WARN. This fixes the following compile error: kernel/time/timekeeping.c: In function ‘__timekeeping_inject_sleeptime’: kernel/time/timekeeping.c:608: error: ‘KERN_WARN’ undeclared (first use in this function) kernel/time/timekeeping.c:608: error: (Each undeclared identifier is reported only once kernel/time/timekeeping.c:608: error: for each function it appears in.) kernel/time/timekeeping.c:608: error: expected ‘)’ before string constant make[2]: *** [kernel/time/timekeeping.o] Error 1 Reported-by: Ingo Molnar Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fdc6b887b20..2b021b0e850 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -605,7 +605,7 @@ static struct timespec timekeeping_suspend_time; static void __timekeeping_inject_sleeptime(struct timespec *delta) { if (!timespec_valid(delta)) { - printk(KERN_WARN "__timekeeping_inject_sleeptime: Invalid " + printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } -- cgit v1.2.3-70-g09d2 From 11b80f459adaf91a712f95e7734a17655a36bf30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:44 -0400 Subject: rw_semaphore: remove up/down_read_non_owner Now that the last user is gone, these can be removed. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/rwsem.h | 10 ---------- kernel/rwsem.c | 16 ---------------- 2 files changed, 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index a8afe9cd000..77950dfa0a9 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -124,19 +124,9 @@ extern void downgrade_write(struct rw_semaphore *sem); */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); -/* - * Take/release a lock when not the owner will release it.
- * - * [ This API should be avoided as much as possible - the - * proper abstraction for this case is completions. ] - */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */ diff --git a/kernel/rwsem.c b/kernel/rwsem.c index cae050b05f5..176e5e56ffa 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); -void down_read_non_owner(struct rw_semaphore *sem) -{ - might_sleep(); - - __down_read(sem); -} - -EXPORT_SYMBOL(down_read_non_owner); - void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); @@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); -void up_read_non_owner(struct rw_semaphore *sem) -{ - __up_read(sem); -} - -EXPORT_SYMBOL(up_read_non_owner); - #endif -- cgit v1.2.3-70-g09d2 From 8a35241803eeb0e9fd3fe27835d6b2775c73b641 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 21 Jul 2011 17:06:53 +0200 Subject: ptrace: fix ptrace_signal() && STOP_DEQUEUED interaction Simple test-case, int main(void) { int pid, status; pid = fork(); if (!pid) { pause(); assert(0); return 0x23; } assert(ptrace(PTRACE_ATTACH, pid, 0,0) == 0); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); kill(pid, SIGCONT); // <--- also clears STOP_DEQUEUD assert(ptrace(PTRACE_CONT, pid, 0,0) == 0); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGCONT); assert(ptrace(PTRACE_CONT, pid, 0, SIGSTOP) == 0); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); kill(pid, SIGKILL); return 0; } Without the patch it hangs. After the patch SIGSTOP "injected" by the tracer is not ignored and stops the tracee. Note also that if this test-case uses, say, SIGWINCH instead of SIGCONT, everything works without the patch. This can't be right, and this is confusing. The problem is that SIGSTOP (or any other sig_kernel_stop() signal) has no effect without JOBCTL_STOP_DEQUEUED. This means it is simply ignored after PTRACE_CONT unless JOBCTL_STOP_DEQUEUED was set "by accident", say it wasn't cleared after initial SIGSTOP sent by PTRACE_ATTACH. At first glance we could change ptrace_signal() to add STOP_DEQUEUED after return from ptrace_stop(), but this is not right in case when the tracer does not change the reported SIGSTOP and SIGCONT comes in between. This is even more wrong with PT_SEIZED, SIGCONT adds JOBCTL_TRAP_NOTIFY which will be "lost" during the TRAP_STOP | TRAP_NOTIFY report. So lets add STOP_DEQUEUED _before_ we report the signal. It has no effect unless sig_kernel_stop() == T after the tracer resumes us, and in the latter case the pending STOP_DEQUEUED means no SIGCONT in between, we should stop. Note also that if SIGCONT was sent, PT_SEIZED tracee will correctly report PTRACE_EVENT_STOP/SIGTRAP and thus the tracer can notice the fact SIGSTOP was cancelled. Also, move the current->ptrace check from ptrace_signal() to its caller, get_signal_to_deliver(), this looks more natural. 
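For reference, a condensed sketch of the consumer of that flag on the stop path (simplified; the authoritative check lives in do_signal_stop()):

static bool want_signal_stop(void)
{
	/*
	 * SIGCONT clears JOBCTL_STOP_DEQUEUED. If it was delivered while
	 * the tracee slept in ptrace_stop(), the stop signal reported by
	 * the tracer must be ignored instead of stopping the task.
	 */
	return !!(current->jobctl & JOBCTL_STOP_DEQUEUED);
}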
Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- kernel/signal.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0a1bf2c8bdc..c34f8f899b7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2084,12 +2084,17 @@ static void do_jobctl_trap(void) static int ptrace_signal(int signr, siginfo_t *info, struct pt_regs *regs, void *cookie) { - if (!current->ptrace) - return signr; - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ + /* + * We do not check sig_kernel_stop(signr) but set this marker + * unconditionally because we do not know whether debugger will + * change signr. This flag has no meaning unless we are going + * to stop after return from ptrace_stop(). In this case it will + * be checked in do_signal_stop(), we should only stop if it was + * not cleared by SIGCONT while we were sleeping. See also the + * comment in dequeue_signal(). + */ + current->jobctl |= JOBCTL_STOP_DEQUEUED; ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ @@ -2193,7 +2198,7 @@ relock: if (!signr) break; /* will return 0 */ - if (signr != SIGKILL) { + if (unlikely(current->ptrace) && signr != SIGKILL) { signr = ptrace_signal(signr, info, regs, cookie); if (!signr) -- cgit v1.2.3-70-g09d2 From 9bbd7374361d9bfc75108c3ad1c1b6db28b1be59 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Tue, 5 Jul 2011 19:07:21 -0700 Subject: sched: update correct entity's runtime in check_preempt_wakeup() While looking at check_preempt_wakeup() I realized that we are potentially updating the wrong entity in the fair-group scheduling case. In this case the current task's cfs_rq may not be the same as the one used for the comparison between the waking task and the existing task's vruntime. This potentially results in us using a stale vruntime in the pre-emption decision, providing a small false preference for the previous task. The effects of this are bounded since we always perform a hierarchal update on the tick. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/CAPM31R+2Ke2urUZKao5W92_LupdR4AYEv-EZWiJ3tG=tEes2cw@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e7d67a9e259..f88720b3df8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1919,8 +1919,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (!sched_feat(WAKEUP_PREEMPT)) return; - update_curr(cfs_rq); find_matching_se(&se, &pse); + update_curr(cfs_rq_of(se)); BUG_ON(!pse); if (wakeup_preempt_entity(se, pse) == 1) { /* -- cgit v1.2.3-70-g09d2 From 9598c82dcacadc3b9daa8170613fd054c6124d30 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 6 Jul 2011 22:30:37 -0700 Subject: sched: Don't update shares twice on on_rq parent In dequeue_task_fair() we bail on dequeue when we encounter a parenting entity with additional weight. However, we perform a double shares update on this entity as we continue the shares update traversal from this point, despite dequeue_entity() having already updated its queuing cfs_rq. Avoid this by starting from the parent when we resume. 
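Condensed, the two traversals in dequeue_task_fair() then look like this (a simplified sketch; buddy handling and other details elided):

for_each_sched_entity(se) {
	cfs_rq = cfs_rq_of(se);
	dequeue_entity(cfs_rq, se, flags);	/* already updates this cfs_rq */

	if (cfs_rq->load.weight) {
		/* resume the shares update one level up */
		se = parent_entity(se);
		break;
	}
	flags |= DEQUEUE_SLEEP;
}

for_each_sched_entity(se) {			/* now starts at the parent */
	cfs_rq = cfs_rq_of(se);
	update_cfs_load(cfs_rq, 0);
	update_cfs_shares(cfs_rq);
}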
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110707053059.797714697@google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f88720b3df8..6cdff849fc1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1370,6 +1370,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ if (task_sleep && parent_entity(se)) set_next_buddy(parent_entity(se)); + + /* avoid re-evaluating load for this entity */ + se = parent_entity(se); break; } flags |= DEQUEUE_SLEEP; -- cgit v1.2.3-70-g09d2 From 9763b67fb9f3050c6da739105888327587c30c4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2011 13:09:25 +0200 Subject: sched, cgroup: Optimize load_balance_fair() Use for_each_leaf_cfs_rq() instead of list_for_each_entry_rcu(), this achieves that load_balance_fair() only iterates those task_groups that actually have tasks on busiest, and that we iterate bottom-up, trying to move light groups before the heavier ones. No idea if it will actually work out to be beneficial in practice, does anybody have a cgroup workload that might show a difference one way or the other? [ Also move update_h_load to sched_fair.c, loosing #ifdef-ery ] Signed-off-by: Peter Zijlstra Reviewed-by: Paul Turner Link: http://lkml.kernel.org/r/1310557009.2586.28.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 -------------------------------- kernel/sched_fair.c | 40 +++++++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b0e7ad796d3..474f341d6f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1568,38 +1568,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return rq->avg_load_per_task; } -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load. - */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#endif - #ifdef CONFIG_PREEMPT static void double_rq_lock(struct rq *rq1, struct rq *rq2); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6cdff849fc1..180bcf1efa7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2232,11 +2232,43 @@ static void update_shares(int cpu) struct rq *rq = cpu_rq(cpu); rcu_read_lock(); + /* + * Iterates the task_group tree in a bottom up fashion, see + * list_add_leaf_cfs_rq() for details. + */ for_each_leaf_cfs_rq(rq, cfs_rq) update_shares_cpu(cfs_rq->tg, cpu); rcu_read_unlock(); } +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. 
+ */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->se[cpu]->load.weight; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_h_load(long cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -2244,14 +2276,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, int *all_pinned) { long rem_load_move = max_load_move; - int busiest_cpu = cpu_of(busiest); - struct task_group *tg; + struct cfs_rq *busiest_cfs_rq; rcu_read_lock(); - update_h_load(busiest_cpu); + update_h_load(cpu_of(busiest)); - list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; -- cgit v1.2.3-70-g09d2 From 5f817d676b7b7ac4a29f5ed93063ae7a24550c12 Mon Sep 17 00:00:00 2001 From: Jan Schoenherr Date: Wed, 13 Jul 2011 20:13:31 +0200 Subject: sched: Fix (harmless) typo 'CONFG_FAIR_GROUP_SCHED' This patch fixes a typo located in a comment. Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-2-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 474f341d6f9..3b3826ebe79 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8362,7 +8362,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } -#else /* !CONFG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } -- cgit v1.2.3-70-g09d2 From 045176d22f08bc0b650a028df0f62fc3c2747699 Mon Sep 17 00:00:00 2001 From: Jan Schoenherr Date: Wed, 13 Jul 2011 20:13:32 +0200 Subject: sched: Remove unused function cpu_cfs_rq() The last reference to cpu_cfs_rq() was removed with commit 88ec22d3 ("sched: Remove the cfs_rq dependency from set_task_cpu()"). Thus, remove this function, too. 
Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-3-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 180bcf1efa7..0588c0b933a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return cfs_rq->tg->cfs_rq[this_cpu]; -} - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -- cgit v1.2.3-70-g09d2 From 99bc52429f11d1f4f81495ac8237085aaeb6bccf Mon Sep 17 00:00:00 2001 From: Bianca Lutz Date: Wed, 13 Jul 2011 20:13:36 +0200 Subject: sched: Do not attempt to destroy uninitialized rt_bandwidth If a task group is to be created and alloc_fair_sched_group() fails, then the rt_bandwidth of the corresponding task group is not yet initialized. The caller, sched_create_group(), starts a clean up procedure which calls free_rt_sched_group() which unconditionally destroys the not yet initialized rt_bandwidth. This crashes or hangs the system in lock_hrtimer_base(): UP systems dereference a NULL pointer, while SMP systems loop endlessly on a condition that cannot become true. This patch simply avoids the destruction of rt_bandwidth when the initialization code path was not reached. (This was discovered by accident with a custom kernel modification.) Signed-off-by: Bianca Lutz Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-7-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3b3826ebe79..f107204db53 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8383,7 +8383,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; - destroy_rt_bandwidth(&tg->rt_bandwidth); + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); for_each_possible_cpu(i) { if (tg->rt_rq) -- cgit v1.2.3-70-g09d2 From 26a148eb9c790149750f7e77da0d96029443d400 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 15 Jul 2011 11:41:31 +0100 Subject: sched: Reorder root_domain to remove 64 bit alignment padding Reorder root_domain to remove 8 bytes of alignment padding on 64 bit builds, this shrinks the size from 1736 to 1728 bytes, therefore using one fewer cachelines. 
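The saving is ordinary 64-bit struct padding; a toy standalone illustration (hypothetical struct, assuming offstack cpumasks so cpumask_var_t is pointer-sized):

struct before {			/* 40 bytes on 64 bit */
	atomic_t refcount;	/*  4 bytes */
				/*  4 byte hole: rcu wants 8-byte alignment */
	struct rcu_head rcu;	/* 16 bytes */
	cpumask_var_t span;	/*  8 bytes */
	atomic_t rto_count;	/*  4 bytes + 4 bytes trailing padding */
};

struct after {			/* 32 bytes on 64 bit */
	atomic_t refcount;	/*  4 bytes */
	atomic_t rto_count;	/*  4 bytes, filling the former hole */
	struct rcu_head rcu;	/* 16 bytes */
	cpumask_var_t span;	/*  8 bytes */
};

Tools such as pahole report these holes directly.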
Signed-off-by: Richard Kennedy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310726492.1977.5.camel@castor.rsk Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f107204db53..e3f0bac0527 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -422,6 +422,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + atomic_t rto_count; struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -431,7 +432,6 @@ struct root_domain { * one runnable RT task. */ cpumask_var_t rto_mask; - atomic_t rto_count; struct cpupri cpupri; }; -- cgit v1.2.3-70-g09d2 From acb5a9ba3bd7cd8b3264f67a3789a9587d3b935b Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Thu, 14 Jul 2011 18:32:43 +0200 Subject: sched: Separate group-scheduling code more clearly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up cfs/rt runqueue initialization by moving group scheduling related code into the corresponding functions. Also, keep group scheduling as an add-on, so that things are only done additionally, i. e. remove the init_*_rq() calls from init_tg_*_entry(). (This removes a redundant initalization during sched_init()). In case of group scheduling rt_rq->highest_prio.curr is now initialized twice, but adding another #ifdef seems not worth it. Signed-off-by: Jan H. Schönherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310661163-16606-1-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e3f0bac0527..6fdf7ffbebc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7859,17 +7859,10 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; - /* allow initial update_cfs_load() to truncate */ -#ifdef CONFIG_SMP - cfs_rq->load_stamp = 1; -#endif -#endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -7889,13 +7882,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO; -#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; -#endif -#endif -#ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); @@ -7905,11 +7894,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7918,11 +7902,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp 
= 1; +#endif + tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; + /* se could be NULL for root_task_group */ if (!se) return; @@ -7945,12 +7935,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; rt_rq->tg = tg; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + tg->rt_rq[cpu] = rt_rq; tg->rt_se[cpu] = rt_se; + if (!rt_se) return; @@ -8032,7 +8024,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs, rq); + init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = root_task_group_load; @@ -8335,6 +8327,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; + init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } @@ -8425,6 +8418,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } -- cgit v1.2.3-70-g09d2 From 2bd2d6f2dc952fc44fc52887de36e51896da96b9 Mon Sep 17 00:00:00 2001 From: Stephan Baerwolf Date: Wed, 20 Jul 2011 14:46:59 +0200 Subject: sched: Replace use of entity_key() "entity_key()" is only used in "__enqueue_entity()" and its only function is to subtract a tasks vruntime by its groups minvruntime. Before this patch a rbtree enqueue-decision is done by comparing two tasks in the style: "if (entity_key(cfs_rq, se) < entity_key(cfs_rq, entry))" which would be "if (se->vruntime-cfs_rq->min_vruntime < entry->vruntime-cfs_rq->min_vruntime)" or (if reducing cfs_rq->min_vruntime out) "if (se->vruntime < entry->vruntime)" which is "if (entity_before(se, entry))" So we do not need "entity_key()". If "entity_before()" is inline we will also save one subtraction (only one, because "entity_key(cfs_rq, se)" was cached in "key") Signed-off-by: Stephan Baerwolf Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ns12mnd2h5w8rb9agd8hnsfk@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0588c0b933a..a2ecbaa28b4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -321,11 +321,6 @@ static inline int entity_before(struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } -static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return se->vruntime - cfs_rq->min_vruntime; -} - static void update_min_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = cfs_rq->min_vruntime; @@ -359,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* @@ -372,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * We dont care about collisions. Nodes with * the same key stay together. 
*/ - if (key < entity_key(cfs_rq, entry)) { + if (entity_before(se, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; -- cgit v1.2.3-70-g09d2 From 9985c20f9e4aee6857c08246b273a3695a52b929 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Thu, 30 Jun 2011 08:09:55 +0000 Subject: perf: Remove perf_event_attr::type check PMU type id can be allocated dynamically, so perf_event_attr::type check when copying attribute from userspace to kernel is not valid. Signed-off-by: Lin Ming Cc: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309421396-17438-4-git-send-email-ming.m.lin@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0567e32d71a..b8785e26ee1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5926,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - if (attr->__reserved_1) return -EINVAL; -- cgit v1.2.3-70-g09d2 From efbe2eee6dc0f179be84292bf269528b3ec365e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jul 2011 11:39:45 +0200 Subject: lockdep: Fix lockdep_no_validate against IRQ states Thomas noticed that a lock marked with lockdep_set_novalidate_class() will still trigger warnings for IRQ inversions. Cure this by skipping those when marking irq state. Reported-and-tested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-2dp5vmpsxeraqm42kgww6ge2@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 298c9276dfd..628276d0591 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2468,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); + if (hlock_class(hlock)->key == &__lockdep_no_validate__) + continue; + if (!mark_lock(curr, hlock, usage_bit)) return 0; } -- cgit v1.2.3-70-g09d2 From 0f3171438fc917b9f6b8b60dbb7a3fff9a0f68fd Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 22 Jul 2011 09:14:31 +0800 Subject: sched: Cleanup duplicate local variable in [enqueue|dequeue]_task_fair No need to define a new "cfs_rq" variable in the "for" block. Just use the one at the top of the function. 
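The removed pattern is plain C block-scope shadowing; a minimal standalone illustration:

static int total;

static void example(void)
{
	int i, val = 1;			/* function-scope declaration */

	for (i = 0; i < 2; i++) {
		int val = 2;		/* legal, but shadows the outer 'val' */
		total += val;		/* uses the inner variable */
	}
	total += val;			/* still 1: the loop never touched it */
}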
Signed-off-by: Lin Ming Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1311297271.3938.1352.camel@minggr.sh.intel.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a2ecbaa28b4..bc8ee999381 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1317,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); @@ -1360,7 +1360,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); -- cgit v1.2.3-70-g09d2 From 81c7413650fbbf881bcb9e567be61a6717eb1876 Mon Sep 17 00:00:00 2001 From: Satoru Moriya Date: Thu, 26 May 2011 19:38:04 -0400 Subject: param: fix return value handling in param_set_* In STANDARD_PARAM_DEF, param_set_* only handles the case in which strtolfn returns -EINVAL, but it may also return -ERANGE. If it returns -ERANGE, param_set_* may set an uninitialized value to the parameter. We should handle both cases. One of the cases in which strtolfn() returns -ERANGE is the following: *Type of module parameter is long *Set the parameter to more than LONG_MAX Signed-off-by: Satoru Moriya Signed-off-by: Rusty Russell --- kernel/params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index ed72e133086..2a4ba258f04 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -225,8 +225,8 @@ int parse_args(const char *name, int ret; \ \ ret = strtolfn(val, 0, &l); \ - if (ret == -EINVAL || ((type)l != l)) \ - return -EINVAL; \ + if (ret < 0 || ((type)l != l)) \ + return ret < 0 ? ret : -EINVAL; \ *((type *)kp->arg) = l; \ return 0; \ } \ -- cgit v1.2.3-70-g09d2 From 74e08fcf7bef973512a1f813700f802a93678670 Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Thu, 30 Jun 2011 21:22:11 +0200 Subject: modules: add default loader hook implementations The module loader code allows architectures to hook into the code by providing a small number of entry points that each arch must implement. This patch provides __weakly linked generic implementations of these entry points for architectures that don't need to do anything special. Signed-off-by: Jonas Bonn Signed-off-by: Rusty Russell --- include/linux/moduleloader.h | 7 ++++++- kernel/module.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index c1f40c2f7ff..b2be02ebf45 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -5,7 +5,12 @@ #include #include -/* These must be implemented by the specific architecture */ +/* These may be implemented by architectures that need to hook into the + * module loader code. Architectures that don't need to do anything special + * can just rely on the 'weak' default hooks defined in kernel/module.c. + * Note, however, that at least one of apply_relocate or apply_relocate_add + * must be implemented by each architecture. + */ /* Adjust arch-specific sections. Return 0 on success.
*/ int module_frob_arch_sections(Elf_Ehdr *hdr, diff --git a/kernel/module.c b/kernel/module.c index 795bdc7f5c3..6301d6e173c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1697,6 +1697,15 @@ static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif +void __weak module_free(struct module *mod, void *module_region) +{ + vfree(module_region); +} + +void __weak module_arch_cleanup(struct module *mod) +{ +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1851,6 +1860,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) return ret; } +int __weak apply_relocate(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + pr_err("module %s: REL relocation unsupported\n", me->name); + return -ENOEXEC; +} + +int __weak apply_relocate_add(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + pr_err("module %s: RELA relocation unsupported\n", me->name); + return -ENOEXEC; +} + static int apply_relocations(struct module *mod, const struct load_info *info) { unsigned int i; @@ -2235,6 +2264,11 @@ static void dynamic_debug_remove(struct _ddebug *debug) ddebug_remove_module(debug->modname); } +void * __weak module_alloc(unsigned long size) +{ + return size == 0 ? NULL : vmalloc_exec(size); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -2645,6 +2679,14 @@ static void flush_module_icache(const struct module *mod) set_fs(old_fs); } +int __weak module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + static struct module *layout_and_allocate(struct load_info *info) { /* Module within temporary copy. */ @@ -2716,6 +2758,13 @@ static void module_deallocate(struct module *mod, struct load_info *info) module_free(mod, mod->module_core); } +int __weak module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + return 0; +} + static int post_relocation(struct module *mod, const struct load_info *info) { /* Sort exception table now relocations are done. */ -- cgit v1.2.3-70-g09d2 From 4befb026cf74b52fc7d382142bbfc0e9b6aab5e7 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sun, 24 Jul 2011 22:06:04 +0930 Subject: module: change attr callbacks to take struct module_kobject This simplifies the next patch, where we have an attribute on a builtin module (ie. module == NULL). 
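A sketch of the kind of callback this enables (hypothetical attribute, not in-tree code): with a struct module_kobject, the callback stays usable even when no struct module exists behind the kobject.

static ssize_t show_objname(struct module_attribute *mattr,
			    struct module_kobject *mk, char *buffer)
{
	/*
	 * mk->mod may be NULL for a built-in "module"; the kobject
	 * still carries a valid name, so the attribute keeps working.
	 */
	return sprintf(buffer, "%s\n", kobject_name(&mk->kobj));
}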
Signed-off-by: Kay Sievers Signed-off-by: Rusty Russell (split into 2) --- include/linux/module.h | 23 ++++++++++++----------- kernel/module.c | 14 +++++++------- kernel/params.c | 10 +++++----- 3 files changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index d9ca2d5dc6d..b87e7625a98 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -48,10 +48,18 @@ struct modversion_info struct module; +struct module_kobject { + struct kobject kobj; + struct module *mod; + struct kobject *drivers_dir; + struct module_param_attrs *mp; +}; + struct module_attribute { - struct attribute attr; - ssize_t (*show)(struct module_attribute *, struct module *, char *); - ssize_t (*store)(struct module_attribute *, struct module *, + struct attribute attr; + ssize_t (*show)(struct module_attribute *, struct module_kobject *, + char *); + ssize_t (*store)(struct module_attribute *, struct module_kobject *, const char *, size_t count); void (*setup)(struct module *, const char *); int (*test)(struct module *); @@ -65,15 +73,8 @@ struct module_version_attribute { } __attribute__ ((__aligned__(sizeof(void *)))); extern ssize_t __modver_version_show(struct module_attribute *, - struct module *, char *); + struct module_kobject *, char *); -struct module_kobject -{ - struct kobject kobj; - struct module *mod; - struct kobject *drivers_dir; - struct module_param_attrs *mp; -}; /* These are either module local, or the kernel's dummy ones. */ extern int init_module(void); diff --git a/kernel/module.c b/kernel/module.c index 6301d6e173c..a4295e67dd8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ mod->field = kstrdup(s, GFP_KERNEL); \ } \ static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ - struct module *mod, char *buffer) \ + struct module_kobject *mk, char *buffer) \ { \ - return sprintf(buffer, "%s\n", mod->field); \ + return sprintf(buffer, "%s\n", mk->mod->field); \ } \ static int modinfo_##field##_exists(struct module *mod) \ { \ @@ -902,9 +902,9 @@ void symbol_put_addr(void *addr) EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, - struct module *mod, char *buffer) + struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", module_refcount(mod)); + return sprintf(buffer, "%u\n", module_refcount(mk->mod)); } static struct module_attribute refcnt = { @@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod) #endif /* CONFIG_MODULE_UNLOAD */ static ssize_t show_initstate(struct module_attribute *mattr, - struct module *mod, char *buffer) + struct module_kobject *mk, char *buffer) { const char *state = "unknown"; - switch (mod->state) { + switch (mk->mod->state) { case MODULE_STATE_LIVE: state = "live"; break; @@ -1187,7 +1187,7 @@ struct module_sect_attrs }; static ssize_t module_sect_show(struct module_attribute *mattr, - struct module *mod, char *buf) + struct module_kobject *mk, char *buf) { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); diff --git a/kernel/params.c b/kernel/params.c index 2a4ba258f04..37e9b20a718 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -511,7 +511,7 @@ struct module_param_attrs #define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, - struct module *mod, char *buf) + struct 
module_kobject *mk, char *buf) { int count; struct param_attribute *attribute = to_param_attr(mattr); @@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr, /* sysfs always hands a nul-terminated string in buf. We rely on that. */ static ssize_t param_attr_store(struct module_attribute *mattr, - struct module *owner, + struct module_kobject *km, const char *buf, size_t len) { int err; @@ -807,7 +807,7 @@ static void __init param_sysfs_builtin(void) } ssize_t __modver_version_show(struct module_attribute *mattr, - struct module *mod, char *buf) + struct module_kobject *mk, char *buf) { struct module_version_attribute *vattr = container_of(mattr, struct module_version_attribute, mattr); @@ -852,7 +852,7 @@ static ssize_t module_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - ret = attribute->show(attribute, mk->mod, buf); + ret = attribute->show(attribute, mk, buf); return ret; } @@ -871,7 +871,7 @@ static ssize_t module_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - ret = attribute->store(attribute, mk->mod, buf, len); + ret = attribute->store(attribute, mk, buf, len); return ret; } -- cgit v1.2.3-70-g09d2 From 88bfa3247961fe5f3623f4d2cf1cd5dc72457598 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sun, 24 Jul 2011 22:06:04 +0930 Subject: module: add /sys/module//uevent files Userspace wants to manage module parameters with udev rules. This currently only works for loaded modules, but not for built-in ones. To allow access to the built-in modules we need to re-trigger all module load events that happened before any userspace was running. We already do the same thing for all devices, subsystems(buses) and drivers. This adds the currently missing /sys/module//uevent files to all module entries. Signed-off-by: Kay Sievers Signed-off-by: Rusty Russell (split & trivial fix) --- include/linux/module.h | 1 + kernel/module.c | 17 +++++++++++++++++ kernel/params.c | 4 ++++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index b87e7625a98..1c30087a2d8 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -75,6 +75,7 @@ struct module_version_attribute { extern ssize_t __modver_version_show(struct module_attribute *, struct module_kobject *, char *); +extern struct module_attribute module_uevent; /* These are either module local, or the kernel's dummy ones. 
*/ extern int init_module(void); diff --git a/kernel/module.c b/kernel/module.c index a4295e67dd8..04379f92f84 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -975,10 +975,27 @@ static struct module_attribute initstate = { .show = show_initstate, }; +static ssize_t store_uevent(struct module_attribute *mattr, + struct module_kobject *mk, + const char *buffer, size_t count) +{ + enum kobject_action action; + + if (kobject_action_type(buffer, count, &action) == 0) + kobject_uevent(&mk->kobj, action); + return count; +} + +struct module_attribute module_uevent = { + .attr = { .name = "uevent", .mode = 0200 }, + .store = store_uevent, +}; + static struct module_attribute *modinfo_attrs[] = { &modinfo_version, &modinfo_srcversion, &initstate, + &module_uevent, #ifdef CONFIG_MODULE_UNLOAD &refcnt, #endif diff --git a/kernel/params.c b/kernel/params.c index 37e9b20a718..22df3e0d142 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name) mk->kobj.kset = module_kset; err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); +#ifdef CONFIG_MODULES + if (!err) + err = sysfs_create_file(&mk->kobj, &module_uevent.attr); +#endif if (err) { kobject_put(&mk->kobj); printk(KERN_ERR -- cgit v1.2.3-70-g09d2 From 2efaca927f5cd7ecd0f1554b8f9b6a9a2c329c03 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 25 Jul 2011 17:12:32 -0700 Subject: mm/futex: fix futex writes on archs with SW tracking of dirty & young I haven't reproduced it myself but the fail scenario is that on such machines (notably ARM and some embedded powerpc), if you manage to hit that futex path on a writable page whose dirty bit has gone from the PTE, you'll livelock inside the kernel from what I can tell. It will go in a loop of trying the atomic access, failing, trying gup to "fix it up", getting success from gup, going back to the atomic access, failing again because dirty wasn't fixed etc... So I think you essentially hang in the kernel. The scenario is probably rare-ish because the affected architectures are embedded and tend not to swap much (if at all), so we probably rarely hit the case where dirty is missing or young is missing, but I think Shan has a piece of SW that can reliably reproduce it using a shared writable mapping & fork or something like that. On archs that use SW tracking of dirty & young, a page without dirty is effectively mapped read-only and a page without young is inaccessible in the PTE. Additionally, some architectures might lazily flush the TLB when relaxing write protection (by doing only a local flush), and expect a fault to invalidate the stale entry if it's still present on another processor. The futex code assumes that if the "in_atomic()" access -EFAULT's, it can "fix it up" by calling get_user_pages(), which would then be equivalent to taking the fault. However that isn't the case. get_user_pages() will not call handle_mm_fault() in the case where the PTE seems to have the right permissions, regardless of the dirty and young state. It will eventually update those bits ... in the struct page, but not in the PTE. Additionally, it will not handle the lazy TLB flushing that can be required by some architectures in the fault case. Basically, gup is the wrong interface for the job. The patch provides a more appropriate one which boils down to just calling handle_mm_fault() since what we are trying to do is simulate a real page fault. 
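To make the livelock concrete, here is a minimal sketch of the retry pattern described above. It is hypothetical and heavily simplified, not the actual futex code: fault_in_user_writeable() is the real helper changed by this patch, while the surrounding loop and the futex_atomic_op_inuser() call only stand in for the atomic user access path.

	int ret;

	for (;;) {
		pagefault_disable();
		ret = futex_atomic_op_inuser(op, uaddr); /* atomic user access */
		pagefault_enable();
		if (ret != -EFAULT)
			break;	/* done, or a real error */
		/*
		 * Before this patch: get_user_pages() reports success but
		 * leaves the software dirty/young bits in the PTE untouched,
		 * so the atomic access above just faults again and the loop
		 * never makes progress on the affected architectures.
		 */
		if (fault_in_user_writeable(uaddr) < 0)
			break;	/* unresolvable fault */
	}
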
The futex code currently attempts to write to user memory within a pagefault-disabled section, and if that fails, tries to fix it up using get_user_pages(). This doesn't work on archs where the dirty and young bits are maintained by software, since they will gate access permission in the TLB, and will not be updated by gup(). In addition, there's an expectation on some archs that a spurious write fault triggers a local TLB flush, and that is missing from the picture as well. I decided that adding those "features" to gup() would be too much for this already too complex function, and instead added a new simpler fixup_user_fault() which is essentially a wrapper around handle_mm_fault() which the futex code can call. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix some nits Darren saw, fiddle comment layout] Signed-off-by: Benjamin Herrenschmidt Reported-by: Shan Hai Tested-by: Shan Hai Cc: David Laight Acked-by: Peter Zijlstra Cc: Darren Hart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ kernel/futex.c | 4 ++-- mm/memory.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3cccd053850..3172a1c0f08 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -988,6 +988,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); diff --git a/kernel/futex.c b/kernel/futex.c index 3fbc76cbb9a..0a308970c24 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + FAULT_FLAG_WRITE); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; diff --git a/mm/memory.c b/mm/memory.c index 3c9f3aa8332..a56e3ba816b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1805,7 +1805,63 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); -/** +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * get_user_pages() only guarantees to update these in the struct page. 
+ * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. -- cgit v1.2.3-70-g09d2 From ae891a1b93bf62e9aaa116a7a71312375047fc9f Mon Sep 17 00:00:00 2001 From: Maxin B John Date: Mon, 25 Jul 2011 17:12:59 -0700 Subject: devres: fix possible use after free devres uses the pointer value as key after it's freed, which is safe but triggers spurious use-after-free warnings on some static analysis tools. Rearrange code to avoid such warnings. Signed-off-by: Maxin B. John Reviewed-by: Rolf Eike Beer Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/devres.c | 2 +- lib/devres.c | 2 +- mm/dmapool.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef4ffcdfa5..bd8e788d71e 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { struct irq_devres match_data = { irq, dev_id }; - free_irq(irq, dev_id); WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, &match_data)); + free_irq(irq, dev_id); } EXPORT_SYMBOL(devm_free_irq); diff --git a/lib/devres.c b/lib/devres.c index 6efddf53b90..7c0e953a748 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -79,9 +79,9 @@ EXPORT_SYMBOL(devm_ioremap_nocache); */ void devm_iounmap(struct device *dev, void __iomem *addr) { - iounmap(addr); WARN_ON(devres_destroy(dev, devm_ioremap_release, devm_ioremap_match, (void *)addr)); + iounmap(addr); } EXPORT_SYMBOL(devm_iounmap); diff --git a/mm/dmapool.c b/mm/dmapool.c index 03bf3bb4519..fbb58e34688 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - dma_pool_destroy(pool); WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); + dma_pool_destroy(pool); } EXPORT_SYMBOL(dmam_pool_destroy); -- cgit v1.2.3-70-g09d2 From c5f41752fd37979dbaec61dc59c7ece0606ddf7e Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 25 Jul 2011 17:13:10 -0700 Subject: notifiers: sys: move reboot notifiers into reboot.h It is not necessary to share the same notifier.h. This patch already moves register_reboot_notifier() and unregister_reboot_notifier() from kernel/notifier.c to kernel/sys.c. [amwang@redhat.com: make allyesconfig succeed on ppc64] Signed-off-by: WANG Cong Cc: David Miller Cc: "Rafael J. 
Wysocki" Cc: Greg KH Signed-off-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/rtas.c | 1 + arch/powerpc/kernel/rtas_flash.c | 1 + include/linux/notifier.h | 5 +---- include/linux/reboot.h | 5 +++++ kernel/notifier.c | 31 ------------------------------- kernel/sys.c | 32 +++++++++++++++++++++++++++++++- 6 files changed, 39 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 271ff6318ed..0e0ea941156 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index bf5f5ce3a7b..e037c7494fd 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/notifier.h b/include/linux/notifier.h index e8a858a1d39..145c43658db 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -189,10 +189,7 @@ static inline int notifier_to_errno(int ret) /* netdevice notifiers are defined in include/linux/netdevice.h */ -#define SYS_DOWN 0x0001 /* Notify of system down */ -#define SYS_RESTART SYS_DOWN -#define SYS_HALT 0x0002 /* Notify of system halt */ -#define SYS_POWER_OFF 0x0003 /* Notify of system power off */ +/* reboot notifiers are defined in include/linux/reboot.h. */ #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 3005d5a7fce..e0879a70e83 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -39,6 +39,11 @@ #include +#define SYS_DOWN 0x0001 /* Notify of system down */ +#define SYS_RESTART SYS_DOWN +#define SYS_HALT 0x0002 /* Notify of system halt */ +#define SYS_POWER_OFF 0x0003 /* Notify of system power off */ + extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); diff --git a/kernel/notifier.c b/kernel/notifier.c index 2488ba7eb56..8d7b435806c 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. - */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. 
- */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace __kprobes notify_die(enum die_val val, const char *str, diff --git a/kernel/sys.c b/kernel/sys.c index e4128b278f2..a101ba36c44 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -319,6 +318,37 @@ void kernel_restart_prepare(char *cmd) syscore_shutdown(); } +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. + */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart -- cgit v1.2.3-70-g09d2 From 626a0312514a121a90b4478cbde111ffc6826ae2 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Jul 2011 17:13:12 -0700 Subject: kernel/configs.c: include MODULE_*() when CONFIG_IKCONFIG_PROC=n If CONFIG_IKCONFIG=m but CONFIG_IKCONFIG_PROC=n we get a module that has no MODULE_LICENSE definition. Move the MODULE_*() definitions outside the CONFIG_IKCONFIG_PROC #ifdef to prevent this configuration from tainting the kernel. Signed-off-by: Stephen Boyd Acked-by: Randy Dunlap Acked-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index b4066b44a99..42e8fa075ee 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void) module_init(ikconfig_init); module_exit(ikconfig_cleanup); +#endif /* CONFIG_IKCONFIG_PROC */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Randy Dunlap"); MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); - -#endif /* CONFIG_IKCONFIG_PROC */ -- cgit v1.2.3-70-g09d2 From 778d3b0ff0654ad7092bf823fd32010066b12365 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:30 -0700 Subject: cpusets: randomize node rotor used in cpuset_mem_spread_node() [ This patch has already been accepted as commit 0ac0c0d0f837 but later reverted (commit 35926ff5fba8) because it introduced an arch-specific __node_random which was defined only for x86 code, so it broke other archs. This is a follow-up without any arch-specific code. Other than that there are no functional changes.] Some workloads that create a large number of small files tend to assign too many pages to node 0 (multi-node systems). 
Part of the reason is that the rotor (in cpuset_mem_spread_node()) used to assign nodes starts at node 0 for newly created tasks. This patch changes the rotor to be initialized to a random node number of the cpuset. [akpm@linux-foundation.org: fix layout] [Lee.Schermerhorn@hp.com: Define stub numa_random() for !NUMA configuration] [mhocko@suse.cz: Make it arch independent] [akpm@linux-foundation.org: fix CONFIG_NUMA=y, MAX_NUMNODES>1 build] Signed-off-by: Jack Steiner Signed-off-by: Lee Schermerhorn Signed-off-by: Michal Hocko Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Pekka Enberg Cc: Paul Menage Cc: Jack Steiner Cc: Robin Holt Cc: David Rientjes Cc: Christoph Lameter Cc: David Rientjes Cc: Jack Steiner Cc: KOSAKI Motohiro Cc: Lee Schermerhorn Cc: Michal Hocko Cc: Paul Menage Cc: Pekka Enberg Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 1 + include/linux/nodemask.h | 13 +++++++++++++ kernel/cpuset.c | 8 ++++++++ kernel/fork.c | 4 ++++ lib/bitmap.c | 2 +- mm/mempolicy.c | 16 ++++++++++++++++ 6 files changed, 43 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index dcafe0bf000..3bac44cce14 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -144,6 +144,7 @@ extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); +extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); #define BITMAP_LAST_WORD_MASK(nbits) \ ( \ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index dba35e41337..7afc36334d5 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -66,6 +66,8 @@ * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * + * int node_random(mask) Random node with set bit in mask + * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? 
* @@ -430,6 +432,7 @@ static inline void node_set_offline(int nid) node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } + #else static inline int node_state(int node, enum node_states state) @@ -460,6 +463,16 @@ static inline int num_node_state(enum node_states state) #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) + +#endif + +#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) +extern int node_random(const nodemask_t *maskp); +#else +static inline int node_random(const nodemask_t *mask) +{ + return 0; +} #endif #define node_online_map node_states[N_ONLINE] diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9c9b7545c81..f8bc977ccbb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor) int cpuset_mem_spread_node(void) { + if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) + current->cpuset_mem_spread_rotor = + node_random(&current->mems_allowed); + return cpuset_spread_node(&current->cpuset_mem_spread_rotor); } int cpuset_slab_spread_node(void) { + if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) + current->cpuset_slab_spread_rotor = + node_random(&current->mems_allowed); + return cpuset_spread_node(&current->cpuset_slab_spread_rotor); } diff --git a/kernel/fork.c b/kernel/fork.c index 17bf7c8d651..e33177edb3b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1173,6 +1173,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, } mpol_fix_fork_child_flag(p); #endif +#ifdef CONFIG_CPUSETS + p->cpuset_mem_spread_rotor = NUMA_NO_NODE; + p->cpuset_slab_spread_rotor = NUMA_NO_NODE; +#endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW diff --git a/lib/bitmap.c b/lib/bitmap.c index 3f3b68199d7..37ef4b04879 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -756,7 +756,7 @@ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits) * * The bit positions 0 through @bits are valid positions in @buf. */ -static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) +int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) { int pos = 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e7fb9d25c54..8b57173c1dd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,6 +93,7 @@ #include #include +#include #include "internal.h" @@ -1645,6 +1646,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } +/* + * Return the bit number of a random bit set in the nodemask. 
+ * (returns -1 if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = -1; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} + #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) -- cgit v1.2.3-70-g09d2 From fb0a685cb95a0267a96153af2f72486f27be5847 Mon Sep 17 00:00:00 2001 From: Daniel Rebelo de Oliveira Date: Tue, 26 Jul 2011 16:08:39 -0700 Subject: kernel/fork.c: fix a few coding style issues Signed-off-by: Daniel Rebelo de Oliveira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 83 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e33177edb3b..e7ceaca8960 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -80,7 +80,7 @@ * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ -int nr_threads; /* The idle threads do not count.. */ +int nr_threads; /* The idle threads do not count.. */ int max_threads; /* tunable limit on nr_threads */ @@ -232,7 +232,7 @@ void __init fork_init(unsigned long mempages) /* * we need to allow at least 20 threads to boot a system */ - if(max_threads < 20) + if (max_threads < 20) max_threads = 20; init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; @@ -268,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - err = arch_dup_task_struct(tsk, orig); + err = arch_dup_task_struct(tsk, orig); if (err) goto out; @@ -288,8 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack_canary = get_random_int(); #endif - /* One for us, one for whoever does the "release_task()" (usually parent) */ - atomic_set(&tsk->usage,2); + /* + * One for us, one for whoever does the "release_task()" (usually + * parent) + */ + atomic_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -437,7 +440,7 @@ fail_nomem: goto out; } -static inline int mm_alloc_pgd(struct mm_struct * mm) +static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) @@ -445,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm) return 0; } -static inline void mm_free_pgd(struct mm_struct * mm) +static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } @@ -482,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm) #endif } -static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -513,9 +516,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) /* * Allocate and initialize an mm_struct. 
*/ -struct mm_struct * mm_alloc(void) +struct mm_struct *mm_alloc(void) { - struct mm_struct * mm; + struct mm_struct *mm; mm = allocate_mm(); if (!mm) @@ -583,7 +586,7 @@ void added_exe_file_vma(struct mm_struct *mm) void removed_exe_file_vma(struct mm_struct *mm) { mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { fput(mm->exe_file); mm->exe_file = NULL; } @@ -775,9 +778,9 @@ fail_nocontext: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) +static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { - struct mm_struct * mm, *oldmm; + struct mm_struct *mm, *oldmm; int retval; tsk->min_flt = tsk->maj_flt = 0; @@ -844,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk) { struct files_struct *oldf, *newf; int error = 0; @@ -1166,11 +1169,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); - if (IS_ERR(p->mempolicy)) { - retval = PTR_ERR(p->mempolicy); - p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; - } + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup_cgroup; + } mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS @@ -1216,25 +1219,33 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; - - if ((retval = audit_alloc(p))) + retval = audit_alloc(p); + if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ - if ((retval = copy_semundo(clone_flags, p))) + retval = copy_semundo(clone_flags, p); + if (retval) goto bad_fork_cleanup_audit; - if ((retval = copy_files(clone_flags, p))) + retval = copy_files(clone_flags, p); + if (retval) goto bad_fork_cleanup_semundo; - if ((retval = copy_fs(clone_flags, p))) + retval = copy_fs(clone_flags, p); + if (retval) goto bad_fork_cleanup_files; - if ((retval = copy_sighand(clone_flags, p))) + retval = copy_sighand(clone_flags, p); + if (retval) goto bad_fork_cleanup_fs; - if ((retval = copy_signal(clone_flags, p))) + retval = copy_signal(clone_flags, p); + if (retval) goto bad_fork_cleanup_sighand; - if ((retval = copy_mm(clone_flags, p))) + retval = copy_mm(clone_flags, p); + if (retval) goto bad_fork_cleanup_signal; - if ((retval = copy_namespaces(clone_flags, p))) + retval = copy_namespaces(clone_flags, p); + if (retval) goto bad_fork_cleanup_mm; - if ((retval = copy_io(clone_flags, p))) + retval = copy_io(clone_flags, p); + if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) @@ -1256,7 +1267,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1324,7 +1335,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). 
- */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(&current->sighand->siglock); @@ -1685,12 +1696,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_fs(unshare_flags, &new_fs))) + err = unshare_fs(unshare_flags, &new_fs); + if (err) goto bad_unshare_out; - if ((err = unshare_fd(unshare_flags, &new_fd))) + err = unshare_fd(unshare_flags, &new_fd); + if (err) goto bad_unshare_cleanup_fs; - if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, - new_fs))) + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + if (err) goto bad_unshare_cleanup_fd; if (new_fs || new_fd || do_sysvsem || new_nsproxy) { -- cgit v1.2.3-70-g09d2 From b34a6b1da371ed8af1221459a18c67970f7e3d53 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 26 Jul 2011 16:08:48 -0700 Subject: ipc: introduce shm_rmid_forced sysctl Add support for the shm_rmid_forced sysctl. If set to 1, all shared memory objects in the current ipc namespace will be automatically forced to use IPC_RMID. The POSIX way of handling shmem allows one to create shm objects and call shmdt(), leaving the shm object associated with no process, thus consuming memory not counted via rlimits. With shm_rmid_forced=1 the shared memory object is counted at least for one process, so OOM killer may effectively kill the fat process holding the shared memory. It obviously breaks POSIX - some programs relying on the feature would stop working. So set shm_rmid_forced=1 only if you're sure nobody uses "orphaned" memory. Use shm_rmid_forced=0 by default for compatibility reasons. The feature was previously implemented in -ow as a configure option. [akpm@linux-foundation.org: fix documentation, per Randy] [akpm@linux-foundation.org: fix warning] [akpm@linux-foundation.org: readability/conventionality tweaks] [akpm@linux-foundation.org: fix shm_rmid_forced/shm_forced_rmid confusion, use standard comment layout] Signed-off-by: Vasiliy Kulikov Cc: Randy Dunlap Cc: "Eric W. Biederman" Cc: "Serge E. Hallyn" Cc: Daniel Lezcano Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Alan Cox Cc: Solar Designer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 22 ++++++++++ include/linux/ipc_namespace.h | 7 +++ include/linux/shm.h | 4 ++ ipc/ipc_sysctl.c | 36 +++++++++++++++ ipc/shm.c | 97 +++++++++++++++++++++++++++++++++++++++-- kernel/exit.c | 1 + 6 files changed, 163 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 1c7fb0a94e2..704e474a93d 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -61,6 +61,7 @@ show up in /proc/sys/kernel: - rtsig-nr - sem - sg-big-buff [ generic SCSI device (sg) ] +- shm_rmid_forced - shmall - shmmax [ sysv ipc ] - shmmni @@ -518,6 +519,27 @@ kernel. This value defaults to SHMMAX. ============================================================== +shm_rmid_forced: + +Linux lets you set resource limits, including how much memory one +process can consume, via setrlimit(2). Unfortunately, shared memory +segments are allowed to exist without association with any process, and +thus might not be counted against any resource limits. If enabled, +shared memory segments are automatically destroyed when their attach +count becomes zero after a detach or a process termination. 
It will +also destroy segments that were created, but never attached to, on exit +from the process. The only use left for IPC_RMID is to immediately +destroy an unattached segment. Of course, this breaks the way things are +defined, so some applications might stop working. Note that this +feature will do you no good unless you also configure your resource +limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't +need this. + +Note that if you change this from 0 to 1, already created segments +without users and with a dead originative process will be destroyed. + +============================================================== + softlockup_thresh: This value can be used to lower the softlockup tolerance threshold. The diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index a6d1655f960..8a297a5e794 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -44,6 +44,11 @@ struct ipc_namespace { size_t shm_ctlall; int shm_ctlmni; int shm_tot; + /* + * Defines whether IPC_RMID is forced for _all_ shm segments regardless + * of shmctl() + */ + int shm_rmid_forced; struct notifier_block ipcns_nb; @@ -72,6 +77,7 @@ extern int register_ipcns_notifier(struct ipc_namespace *); extern int cond_register_ipcns_notifier(struct ipc_namespace *); extern void unregister_ipcns_notifier(struct ipc_namespace *); extern int ipcns_notify(unsigned long); +extern void shm_destroy_orphaned(struct ipc_namespace *ns); #else /* CONFIG_SYSVIPC */ static inline int register_ipcns_notifier(struct ipc_namespace *ns) { return 0; } @@ -79,6 +85,7 @@ static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) { return 0; } static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } static inline int ipcns_notify(unsigned long l) { return 0; } +static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #endif /* CONFIG_SYSVIPC */ #ifdef CONFIG_POSIX_MQUEUE diff --git a/include/linux/shm.h b/include/linux/shm.h index eca6235a46c..7d27ffde019 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -106,6 +106,7 @@ struct shmid_kernel /* private to the kernel */ #ifdef CONFIG_SYSVIPC long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr); extern int is_file_shm_hugepages(struct file *file); +extern void exit_shm(struct task_struct *task); #else static inline long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr) @@ -116,6 +117,9 @@ static inline int is_file_shm_hugepages(struct file *file) { return 0; } +static inline void exit_shm(struct task_struct *task) +{ +} #endif #endif /* __KERNEL__ */ diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 56410faa455..00fba2bab87 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -31,12 +31,37 @@ static int proc_ipc_dointvec(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; + memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); } +static int proc_ipc_dointvec_minmax(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table ipc_table; + + memcpy(&ipc_table, table, sizeof(ipc_table)); + ipc_table.data = get_ipc(table); + + return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); +} + +static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ipc_namespace 
*ns = current->nsproxy->ipc_ns; + int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (err < 0) + return err; + if (ns->shm_rmid_forced) + shm_destroy_orphaned(ns); + return err; +} + static int proc_ipc_callback_dointvec(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -125,6 +150,8 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, #else #define proc_ipc_doulongvec_minmax NULL #define proc_ipc_dointvec NULL +#define proc_ipc_dointvec_minmax NULL +#define proc_ipc_dointvec_minmax_orphans NULL #define proc_ipc_callback_dointvec NULL #define proc_ipcauto_dointvec_minmax NULL #endif @@ -154,6 +181,15 @@ static struct ctl_table ipc_kern_table[] = { .mode = 0644, .proc_handler = proc_ipc_dointvec, }, + { + .procname = "shm_rmid_forced", + .data = &init_ipc_ns.shm_rmid_forced, + .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), + .mode = 0644, + .proc_handler = proc_ipc_dointvec_minmax_orphans, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "msgmax", .data = &init_ipc_ns.msg_ctlmax, diff --git a/ipc/shm.c b/ipc/shm.c index 27884adb1a9..3f5b14365f3 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -74,6 +74,7 @@ void shm_init_ns(struct ipc_namespace *ns) ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; + ns->shm_rmid_forced = 0; ns->shm_tot = 0; ipc_init_ids(&shm_ids(ns)); } @@ -186,6 +187,23 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) ipc_rcu_putref(shp); } +/* + * shm_may_destroy - identifies whether shm segment should be destroyed now + * + * Returns true if and only if there are no active users of the segment and + * one of the following is true: + * + * 1) shmctl(id, IPC_RMID, NULL) was called for this shp + * + * 2) sysctl kernel.shm_rmid_forced is set to 1. + */ +static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) +{ + return (shp->shm_nattch == 0) && + (ns->shm_rmid_forced || + (shp->shm_perm.mode & SHM_DEST)); +} + /* * remove the attach descriptor vma. * free memory for segment if it is marked destroyed. @@ -206,11 +224,83 @@ static void shm_close(struct vm_area_struct *vma) shp->shm_lprid = task_tgid_vnr(current); shp->shm_dtim = get_seconds(); shp->shm_nattch--; - if(shp->shm_nattch == 0 && - shp->shm_perm.mode & SHM_DEST) + if (shm_may_destroy(ns, shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); + up_write(&shm_ids(ns).rw_mutex); +} + +static int shm_try_destroy_current(int id, void *p, void *data) +{ + struct ipc_namespace *ns = data; + struct shmid_kernel *shp = shm_lock(ns, id); + + if (IS_ERR(shp)) + return 0; + + if (shp->shm_cprid != task_tgid_vnr(current)) { + shm_unlock(shp); + return 0; + } + + if (shm_may_destroy(ns, shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); + return 0; +} + +static int shm_try_destroy_orphaned(int id, void *p, void *data) +{ + struct ipc_namespace *ns = data; + struct shmid_kernel *shp = shm_lock(ns, id); + struct task_struct *task; + + if (IS_ERR(shp)) + return 0; + + /* + * We want to destroy segments without users and with already + * exit'ed originating process. + * + * XXX: the originating process may exist in another pid namespace. 
+ */ + task = find_task_by_vpid(shp->shm_cprid); + if (task != NULL) { + shm_unlock(shp); + return 0; + } + + if (shm_may_destroy(ns, shp)) shm_destroy(ns, shp); else shm_unlock(shp); + return 0; +} + +void shm_destroy_orphaned(struct ipc_namespace *ns) +{ + down_write(&shm_ids(ns).rw_mutex); + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); + up_write(&shm_ids(ns).rw_mutex); +} + + +void exit_shm(struct task_struct *task) +{ + struct nsproxy *nsp = task->nsproxy; + struct ipc_namespace *ns; + + if (!nsp) + return; + ns = nsp->ipc_ns; + if (!ns || !ns->shm_rmid_forced) + return; + + /* Destroy all already created segments, but not mapped yet */ + down_write(&shm_ids(ns).rw_mutex); + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); up_write(&shm_ids(ns).rw_mutex); } @@ -950,8 +1040,7 @@ out_nattch: shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; - if(shp->shm_nattch == 0 && - shp->shm_perm.mode & SHM_DEST) + if (shm_may_destroy(ns, shp)) shm_destroy(ns, shp); else shm_unlock(shp); diff --git a/kernel/exit.c b/kernel/exit.c index 9ee58bb9e60..2913b3509d4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -980,6 +980,7 @@ NORET_TYPE void do_exit(long code) trace_sched_process_exit(tsk); exit_sem(tsk); + exit_shm(tsk); exit_files(tsk); exit_fs(tsk); check_stack_usage(); -- cgit v1.2.3-70-g09d2 From 947be5dfdaaef01b43a3d5444688ebef2bd263d4 Mon Sep 17 00:00:00 2001 From: Vitaliy Ivanov Date: Tue, 26 Jul 2011 16:08:49 -0700 Subject: gcov: disable CONSTRUCTORS for UML Selecting GCOV for UML causes a configuration mismatch: warning: (GCOV_KERNEL) selects CONSTRUCTORS which has unmet direct dependencies (!UML) Constructors are not needed for UML. Signed-off-by: Vitaliy Ivanov Cc: Peter Oberparleiter Acked-by: Richard Weinberger Acked-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 5bf924d80b5..a92028196cc 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -3,7 +3,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS + select CONSTRUCTORS if !UML default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage -- cgit v1.2.3-70-g09d2 From 4302fbc8ec2ccae279c939f241bf8ce64e1a0bb7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 26 Jul 2011 16:08:52 -0700 Subject: panic: panic=-1 for immediate reboot When a kernel BUG or oops occurs, ChromeOS intends to panic and immediately reboot, with stacktrace and other messages preserved in RAM across reboot. But the longer we delay, the more likely the user is to poweroff and lose the info. panic_timeout (seconds before rebooting) is set by panic= boot option or sysctl or /proc/sys/kernel/panic; but 0 means wait forever, so at present we have to delay at least 1 second. Let a negative number mean reboot immediately (with the small cosmetic benefit of suppressing that newline-less "Rebooting in %d seconds.." message). 
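For illustration, a usage sketch (not part of the patch): with this change a system that preserves crash info in RAM can request an immediate reboot on panic either at boot or at run time.

	panic=-1				on the kernel command line

	echo -1 > /proc/sys/kernel/panic	at run time, via the existing sysctl

Both set the same panic_timeout variable that the hunk below tests; 0 keeps the historical wait-forever behaviour.
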
Signed-off-by: Hugh Dickins Signed-off-by: Mandeep Singh Baines Cc: Huang Ying Cc: Andi Kleen Cc: Hugh Dickins Cc: Olaf Hering Cc: Jesse Barnes Cc: Dave Airlie Cc: Greg Kroah-Hartman Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 4 +++- kernel/panic.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a70e43edcb6..4ca93898fbd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1846,7 +1846,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. See Documentation/sound/oss/oss-parameters.txt panic= [KNL] Kernel behaviour on panic: delay - seconds before rebooting + timeout > 0: seconds before rebooting + timeout = 0: wait forever + timeout < 0: reboot immediately Format: parkbd.port= [HW] Parallel port number the keyboard adapter is diff --git a/kernel/panic.c b/kernel/panic.c index 69231670eb9..d7bb6974efb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -119,6 +119,8 @@ NORET_TYPE void panic(const char * fmt, ...) } mdelay(PANIC_TIMER_STEP); } + } + if (panic_timeout != 0) { /* * This will not be a clean reboot, with everything * shutting down. But if there is a chance of -- cgit v1.2.3-70-g09d2 From 60063497a95e716c9a689af3be2687d261f115b4 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Tue, 26 Jul 2011 16:09:06 -0700 Subject: atomic: use <linux/atomic.h> This allows us to move duplicated code in <asm/atomic.h> (atomic_inc_not_zero() for now) to <linux/atomic.h>. Signed-off-by: Arun Sharma Reviewed-by: Eric Dumazet Cc: Ingo Molnar Cc: David Miller Cc: Eric Dumazet Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/atomic.h | 1 - arch/alpha/include/asm/local.h | 2 +- arch/alpha/kernel/perf_event.c | 2 +- arch/alpha/kernel/smp.c | 2 +- arch/alpha/lib/dec_and_lock.c | 2 +- arch/arm/include/asm/atomic.h | 1 - arch/arm/kernel/smp.c | 2 +- arch/arm/kernel/traps.c | 2 +- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-bcmring/dma.c | 2 +- arch/arm/mach-cns3xxx/include/mach/pm.h | 2 +- arch/arm/mach-cns3xxx/pm.c | 2 +- arch/arm/mach-omap1/pm.c | 2 +- arch/arm/mach-s3c2440/clock.c | 2 +- arch/arm/mach-s3c2440/s3c2442.c | 2 +- arch/arm/mach-s3c2440/s3c244x-clock.c | 2 +- arch/avr32/include/asm/atomic.h | 1 - arch/blackfin/include/asm/atomic.h | 1 - arch/blackfin/include/asm/dma.h | 2 +- arch/blackfin/include/asm/ipipe.h | 2 +- arch/blackfin/include/asm/spinlock.h | 2 +- arch/blackfin/kernel/ftrace.c | 2 +- arch/blackfin/kernel/ipipe.c | 2 +- arch/blackfin/kernel/nmi.c | 2 +- arch/blackfin/mach-common/smp.c | 2 +- arch/cris/arch-v32/drivers/cryptocop.c | 2 +- arch/cris/arch-v32/kernel/smp.c | 2 +- arch/cris/include/asm/atomic.h | 1 - arch/cris/include/asm/bitops.h | 2 +- arch/cris/kernel/process.c | 2 +- arch/frv/include/asm/atomic.h | 1 - arch/frv/include/asm/hardirq.h | 2 +- arch/frv/kernel/irq.c | 2 +- arch/h8300/include/asm/atomic.h | 1 - arch/ia64/include/asm/atomic.h | 1 - arch/ia64/include/asm/processor.h | 2 +- arch/ia64/include/asm/spinlock.h | 2 +- arch/ia64/kernel/smp.c | 2 +- arch/ia64/kernel/smpboot.c | 2 +- arch/ia64/kernel/uncached.c | 2 +- arch/m32r/include/asm/atomic.h | 1 - arch/m32r/include/asm/mmu_context.h | 2 +- arch/m32r/include/asm/spinlock.h | 2 +- arch/m32r/kernel/smp.c | 2 +- arch/m32r/kernel/traps.c | 2 +- arch/m68k/include/asm/atomic.h | 1 - arch/microblaze/include/asm/mmu_context_mm.h | 2 +- 
arch/microblaze/include/asm/prom.h | 2 +- arch/mips/include/asm/atomic.h | 1 - arch/mips/include/asm/hw_irq.h | 2 +- arch/mips/include/asm/local.h | 2 +- arch/mips/include/asm/smp.h | 2 +- arch/mips/kernel/irq.c | 2 +- arch/mips/kernel/mips-mt.c | 2 +- arch/mips/kernel/rtlx.c | 2 +- arch/mips/kernel/smp-cmp.c | 2 +- arch/mips/kernel/smp-mt.c | 2 +- arch/mips/kernel/smp.c | 2 +- arch/mips/kernel/smtc-proc.c | 2 +- arch/mips/kernel/smtc.c | 2 +- arch/mips/kernel/sync-r4k.c | 2 +- arch/mips/kernel/vpe.c | 2 +- arch/mips/mipssim/sim_smtc.c | 2 +- arch/mips/sgi-ip27/ip27-nmi.c | 2 +- arch/mn10300/include/asm/atomic.h | 1 - arch/mn10300/include/asm/mmu_context.h | 2 +- arch/mn10300/include/asm/spinlock.h | 2 +- arch/mn10300/include/asm/system.h | 2 +- arch/mn10300/kernel/mn10300-watchdog.c | 2 +- arch/mn10300/kernel/traps.c | 2 +- arch/mn10300/mm/misalignment.c | 2 +- arch/mn10300/proc-mn2ws0050/proc-init.c | 2 +- arch/parisc/include/asm/atomic.h | 1 - arch/parisc/include/asm/bitops.h | 2 +- arch/parisc/include/asm/mmu_context.h | 2 +- arch/parisc/kernel/parisc_ksyms.c | 2 +- arch/parisc/kernel/smp.c | 2 +- arch/parisc/kernel/traps.c | 2 +- arch/parisc/lib/bitops.c | 2 +- arch/powerpc/include/asm/atomic.h | 1 - arch/powerpc/include/asm/emulated_ops.h | 2 +- arch/powerpc/include/asm/irq.h | 2 +- arch/powerpc/include/asm/local.h | 2 +- arch/powerpc/include/asm/prom.h | 2 +- arch/powerpc/kernel/of_platform.c | 2 +- arch/powerpc/kernel/ppc_ksyms.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/rtasd.c | 2 +- arch/powerpc/kernel/smp-tbsync.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/platforms/83xx/km83xx.c | 2 +- arch/powerpc/platforms/83xx/mpc832x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc834x_itx.c | 2 +- arch/powerpc/platforms/83xx/mpc834x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc836x_mds.c | 2 +- arch/powerpc/platforms/83xx/sbc834x.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_cds.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_mds.c | 2 +- arch/powerpc/platforms/85xx/sbc8548.c | 2 +- arch/powerpc/platforms/cell/cpufreq_spudemand.c | 2 +- arch/powerpc/platforms/cell/smp.c | 2 +- arch/powerpc/platforms/cell/spufs/context.c | 2 +- arch/powerpc/platforms/chrp/smp.c | 2 +- arch/powerpc/platforms/iseries/smp.c | 2 +- arch/powerpc/platforms/powermac/backlight.c | 2 +- arch/powerpc/platforms/powermac/smp.c | 2 +- arch/powerpc/platforms/pseries/eeh.c | 2 +- arch/powerpc/platforms/pseries/eeh_cache.c | 2 +- arch/powerpc/platforms/pseries/smp.c | 2 +- arch/powerpc/sysdev/fsl_soc.c | 2 +- arch/powerpc/sysdev/tsi108_dev.c | 2 +- arch/s390/include/asm/atomic.h | 1 - arch/s390/kernel/dis.c | 2 +- arch/s390/kernel/traps.c | 2 +- arch/sh/include/asm/atomic.h | 1 - arch/sh/include/asm/hw_irq.h | 2 +- arch/sh/include/asm/smp.h | 2 +- arch/sh/kernel/idle.c | 2 +- arch/sh/kernel/smp.c | 2 +- arch/sh/kernel/traps_64.c | 2 +- arch/sh/kernel/unwinder.c | 2 +- arch/sparc/include/asm/atomic_32.h | 1 - arch/sparc/include/asm/atomic_64.h | 1 - arch/sparc/include/asm/prom.h | 2 +- arch/sparc/include/asm/smp_32.h | 2 +- arch/sparc/include/asm/smp_64.h | 2 +- arch/sparc/kernel/irq_64.c | 2 +- arch/sparc/kernel/leon_smp.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/sparc/kernel/smp_32.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/lib/atomic32.c | 2 +- arch/tile/include/asm/atomic.h | 9 --------- arch/tile/include/asm/atomic_32.h | 4 ++-- arch/tile/include/asm/atomic_64.h | 2 +- arch/tile/include/asm/bitops_32.h | 2 +- arch/tile/include/asm/bitops_64.h | 2 +- 
arch/tile/include/asm/spinlock_32.h | 2 +- arch/tile/kernel/intvec_32.S | 2 +- arch/tile/lib/atomic_32.c | 2 +- arch/tile/lib/atomic_asm_32.S | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/atomic.h | 1 - arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/local.h | 2 +- arch/x86/include/asm/mce.h | 2 +- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/prom.h | 2 +- arch/x86/include/asm/spinlock.h | 2 +- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/i8259.c | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/lib/atomic64_32.c | 2 +- arch/x86/mm/mmio-mod.c | 2 +- arch/xtensa/include/asm/atomic.h | 1 - arch/xtensa/kernel/process.c | 2 +- crypto/af_alg.c | 2 +- crypto/proc.c | 2 +- crypto/rng.c | 2 +- drivers/atm/ambassador.c | 2 +- drivers/atm/atmtcp.c | 2 +- drivers/atm/eni.c | 2 +- drivers/atm/eni.h | 2 +- drivers/atm/firestream.c | 2 +- drivers/atm/fore200e.c | 2 +- drivers/atm/horizon.c | 2 +- drivers/atm/idt77252.c | 2 +- drivers/atm/iphase.c | 2 +- drivers/atm/nicstar.c | 2 +- drivers/atm/suni.c | 2 +- drivers/atm/uPD98402.c | 2 +- drivers/atm/zatm.c | 2 +- drivers/base/memory.c | 2 +- drivers/base/power/sysfs.c | 2 +- drivers/block/cciss_scsi.c | 2 +- drivers/char/ipmi/ipmi_watchdog.c | 2 +- drivers/char/mspec.c | 2 +- drivers/connector/cn_proc.c | 3 ++- drivers/edac/edac_stub.c | 2 +- drivers/firewire/core-card.c | 2 +- drivers/firewire/core-device.c | 2 +- drivers/firewire/core-topology.c | 2 +- drivers/firewire/core.h | 2 +- drivers/firewire/nosy.c | 2 +- drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_fence.c | 2 +- drivers/gpu/drm/ttm/ttm_bo.c | 2 +- drivers/gpu/drm/ttm/ttm_lock.c | 2 +- drivers/gpu/drm/ttm/ttm_object.c | 2 +- drivers/gpu/drm/ttm/ttm_page_alloc.c | 2 +- drivers/hwmon/sht15.c | 2 +- drivers/infiniband/hw/cxgb4/mem.c | 2 +- drivers/infiniband/hw/ehca/ehca_tools.h | 2 +- drivers/infiniband/hw/nes/nes_cm.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib.h | 2 +- drivers/infiniband/ulp/srp/ib_srp.c | 2 +- drivers/isdn/gigaset/gigaset.h | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-queue-length.c | 2 +- drivers/md/dm-table.c | 2 +- drivers/media/video/hdpvr/hdpvr-core.c | 2 +- drivers/media/video/tlg2300/pd-dvb.c | 2 +- drivers/media/video/uvc/uvc_ctrl.c | 2 +- drivers/media/video/uvc/uvc_queue.c | 2 +- drivers/media/video/uvc/uvc_v4l2.c | 2 +- drivers/media/video/uvc/uvc_video.c | 2 +- drivers/message/i2o/i2o_scsi.c | 2 +- drivers/misc/phantom.c | 2 +- drivers/net/atlx/atl1.c | 2 +- drivers/net/atlx/atl2.c | 2 +- drivers/net/atlx/atl2.h | 2 +- drivers/net/cassini.c | 2 +- drivers/net/cpmac.c | 2 +- drivers/net/cxgb3/cxgb3_offload.c | 2 +- drivers/net/cxgb3/l2t.h | 2 +- drivers/net/cxgb3/t3cdev.h | 2 +- drivers/net/cxgb4/cxgb4_uld.h | 2 +- drivers/net/cxgb4/l2t.h | 2 +- drivers/net/hamradio/6pack.c | 2 +- drivers/net/hamradio/dmascc.c | 2 +- drivers/net/ibmveth.c | 2 +- drivers/net/phy/phy.c | 2 +- drivers/net/ppp_generic.c | 2 +- drivers/net/wimax/i2400m/i2400m.h | 2 +- drivers/net/wireless/b43legacy/b43legacy.h | 2 +- drivers/net/wireless/b43legacy/dma.h | 2 +- drivers/oprofile/oprofile_stats.h | 2 +- drivers/pci/hotplug/cpci_hotplug_core.c | 2 +- drivers/pci/xen-pcifront.c | 2 +- 
drivers/s390/block/dasd_eer.c | 2 +- drivers/s390/char/sclp_quiesce.c | 2 +- drivers/s390/char/vmlogrdr.c | 2 +- drivers/s390/cio/device.h | 2 +- drivers/s390/cio/qdio_main.c | 2 +- drivers/s390/cio/qdio_thinint.c | 2 +- drivers/s390/crypto/ap_bus.c | 2 +- drivers/s390/crypto/zcrypt_api.c | 2 +- drivers/s390/crypto/zcrypt_cex2a.c | 2 +- drivers/s390/crypto/zcrypt_mono.c | 2 +- drivers/s390/crypto/zcrypt_pcica.c | 2 +- drivers/s390/crypto/zcrypt_pcicc.c | 2 +- drivers/s390/crypto/zcrypt_pcixcc.c | 2 +- drivers/s390/net/fsm.h | 2 +- drivers/s390/scsi/zfcp_scsi.c | 2 +- drivers/sbus/char/display7seg.c | 2 +- drivers/scsi/dpt/dpti_i2o.h | 2 +- drivers/scsi/hpsa.c | 2 +- drivers/scsi/pm8001/pm8001_sas.h | 2 +- drivers/staging/octeon/ethernet-rx.c | 2 +- drivers/staging/octeon/ethernet-tx.c | 2 +- drivers/staging/solo6x10/solo6x10.h | 2 +- drivers/staging/tidspbridge/include/dspbridge/host_os.h | 2 +- drivers/staging/winbond/mds_s.h | 2 +- drivers/staging/winbond/wb35reg_s.h | 2 +- drivers/tty/bfin_jtag_comm.c | 2 +- drivers/tty/rocket.c | 2 +- drivers/tty/serial/dz.c | 2 +- drivers/tty/serial/sb1250-duart.c | 2 +- drivers/tty/serial/zs.c | 2 +- drivers/usb/gadget/f_audio.c | 2 +- drivers/usb/gadget/f_rndis.c | 2 +- drivers/usb/gadget/uvc_queue.c | 2 +- drivers/usb/image/microtek.c | 2 +- drivers/usb/misc/appledisplay.c | 2 +- drivers/usb/serial/garmin_gps.c | 2 +- drivers/usb/wusbcore/wa-rpipe.c | 2 +- drivers/vhost/vhost.h | 2 +- drivers/video/sh_mobile_lcdcfb.c | 2 +- drivers/video/vermilion/vermilion.h | 2 +- drivers/w1/masters/matrox_w1.c | 2 +- drivers/w1/w1.c | 2 +- drivers/w1/w1_family.h | 2 +- drivers/watchdog/intel_scu_watchdog.c | 2 +- drivers/watchdog/sbc7240_wdt.c | 2 +- fs/btrfs/delayed-inode.h | 2 +- fs/direct-io.c | 2 +- fs/eventpoll.c | 2 +- fs/file_table.c | 2 +- fs/gfs2/main.c | 2 +- fs/nfs/cache_lib.h | 2 +- fs/nfs/direct.c | 2 +- fs/notify/group.c | 2 +- fs/notify/inode_mark.c | 2 +- fs/notify/mark.c | 2 +- fs/notify/notification.c | 2 +- fs/notify/vfsmount_mark.c | 2 +- fs/ntfs/inode.h | 2 +- fs/posix_acl.c | 2 +- fs/proc/meminfo.c | 2 +- include/acpi/platform/aclinux.h | 2 +- include/asm-generic/atomic.h | 2 -- include/asm-generic/local.h | 2 +- include/asm-generic/local64.h | 2 +- include/drm/ttm/ttm_lock.h | 2 +- include/linux/aio.h | 2 +- include/linux/atmdev.h | 2 +- include/linux/atomic.h | 9 +++++++++ include/linux/backing-dev.h | 2 +- include/linux/bit_spinlock.h | 2 +- include/linux/buffer_head.h | 2 +- include/linux/configfs.h | 2 +- include/linux/connector.h | 2 +- include/linux/cred.h | 2 +- include/linux/crypto.h | 2 +- include/linux/dcache.h | 2 +- include/linux/debug_locks.h | 2 +- include/linux/device.h | 2 +- include/linux/edac.h | 2 +- include/linux/fault-inject.h | 2 +- include/linux/fdtable.h | 2 +- include/linux/filter.h | 2 +- include/linux/firewire.h | 2 +- include/linux/fsnotify_backend.h | 2 +- include/linux/interrupt.h | 2 +- include/linux/jump_label.h | 2 +- include/linux/kdb.h | 2 +- include/linux/key.h | 2 +- include/linux/kgdb.h | 2 +- include/linux/kobject.h | 2 +- include/linux/mlx4/device.h | 2 +- include/linux/mman.h | 2 +- include/linux/mmzone.h | 2 +- include/linux/mount.h | 2 +- include/linux/mutex.h | 2 +- include/linux/netdevice.h | 2 +- include/linux/nfs_fs_sb.h | 2 +- include/linux/oprofile.h | 2 +- include/linux/pci.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/phy.h | 2 +- include/linux/proc_fs.h | 2 +- include/linux/quota.h | 2 +- include/linux/rwsem.h | 2 +- include/linux/sem.h | 2 +- include/linux/skbuff.h | 2 +- 
include/linux/sonet.h | 2 +-
include/linux/spinlock.h | 2 +-
include/linux/sunrpc/auth.h | 2 +-
include/linux/sunrpc/cache.h | 2 +-
include/linux/sunrpc/timer.h | 2 +-
include/linux/swap.h | 2 +-
include/linux/sysfs.h | 2 +-
include/linux/vmstat.h | 2 +-
include/linux/workqueue.h | 2 +-
include/net/ax25.h | 2 +-
include/net/cipso_ipv4.h | 2 +-
include/net/flow.h | 2 +-
include/net/inet_hashtables.h | 2 +-
include/net/inet_timewait_sock.h | 2 +-
include/net/inetpeer.h | 2 +-
include/net/ip_vs.h | 2 +-
include/net/lib80211.h | 2 +-
include/net/llc.h | 2 +-
include/net/neighbour.h | 2 +-
include/net/net_namespace.h | 2 +-
include/net/netfilter/nf_conntrack.h | 2 +-
include/net/netlabel.h | 2 +-
include/net/netns/conntrack.h | 2 +-
include/net/sctp/structs.h | 2 +-
include/pcmcia/ds.h | 2 +-
include/rdma/ib_sa.h | 2 +-
include/rdma/ib_verbs.h | 2 +-
include/rxrpc/types.h | 2 +-
include/scsi/scsi_device.h | 2 +-
kernel/audit.c | 2 +-
kernel/auditsc.c | 2 +-
kernel/cgroup.c | 2 +-
kernel/cpuset.c | 2 +-
kernel/debug/debug_core.c | 2 +-
kernel/rcupdate.c | 2 +-
kernel/rcutorture.c | 2 +-
kernel/rcutree_trace.c | 2 +-
kernel/rwsem.c | 2 +-
kernel/stop_machine.c | 2 +-
kernel/taskstats.c | 2 +-
kernel/trace/trace.h | 2 +-
kernel/trace/trace_mmiotrace.c | 2 +-
lib/atomic64.c | 2 +-
lib/atomic64_test.c | 2 +-
lib/crc32.c | 2 +-
lib/dec_and_lock.c | 2 +-
mm/init-mm.c | 2 +-
mm/kmemleak.c | 2 +-
mm/slob.c | 2 +-
mm/vmalloc.c | 2 +-
net/atm/atm_misc.c | 2 +-
net/atm/clip.c | 2 +-
net/atm/common.c | 2 +-
net/atm/lec.c | 2 +-
net/atm/proc.c | 2 +-
net/bridge/br_fdb.c | 2 +-
net/core/flow.c | 2 +-
net/decnet/dn_fib.c | 2 +-
net/decnet/dn_neigh.c | 2 +-
net/decnet/dn_table.c | 2 +-
net/decnet/dn_timer.c | 2 +-
net/ipv4/cipso_ipv4.c | 2 +-
net/ipv4/raw.c | 2 +-
net/ipv6/ip6_tunnel.c | 2 +-
net/iucv/iucv.c | 2 +-
net/l2tp/l2tp_core.c | 2 +-
net/l2tp/l2tp_ppp.c | 2 +-
net/netfilter/nfnetlink_log.c | 2 +-
net/netfilter/nfnetlink_queue.c | 2 +-
net/netlabel/netlabel_cipso_v4.c | 2 +-
net/netlabel/netlabel_kapi.c | 2 +-
net/netlabel/netlabel_mgmt.c | 2 +-
net/netlabel/netlabel_mgmt.h | 2 +-
net/netlabel/netlabel_unlabeled.c | 2 +-
net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
net/tipc/core.h | 2 +-
security/selinux/hooks.c | 2 +-
security/selinux/xfrm.c | 2 +-
sound/pci/echoaudio/darla20.c | 2 +-
sound/pci/echoaudio/darla24.c | 2 +-
sound/pci/echoaudio/echo3g.c | 2 +-
sound/pci/echoaudio/gina20.c | 2 +-
sound/pci/echoaudio/gina24.c | 2 +-
sound/pci/echoaudio/indigo.c | 2 +-
sound/pci/echoaudio/indigodj.c | 2 +-
sound/pci/echoaudio/indigodjx.c | 2 +-
sound/pci/echoaudio/indigoio.c | 2 +-
sound/pci/echoaudio/indigoiox.c | 2 +-
sound/pci/echoaudio/layla20.c | 2 +-
sound/pci/echoaudio/layla24.c | 2 +-
sound/pci/echoaudio/mia.c | 2 +-
sound/pci/echoaudio/mona.c | 2 +-
sound/pci/lx6464es/lx6464es.h | 2 +-
sound/sparc/dbri.c | 2 +-
439 files changed, 427 insertions(+), 448 deletions(-)
(limited to 'kernel')
diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index e756d04b6cd..88b7491490b 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -199,7 +199,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
 	return c != (u);
 }
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 /**
  * atomic64_add_unless - add unless the number is a given value
diff --git a/arch/alpha/include/asm/local.h b/arch/alpha/include/asm/local.h
index b9e3e331837..9c94b845604 100644
--- a/arch/alpha/include/asm/local.h
+++ b/arch/alpha/include/asm/local.h
@@ -2,7 +2,7 @@
 #define _ALPHA_LOCAL_H
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 typedef struct {
diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 8e47709160f..8143cd7cdbf 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -17,7 +17,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index d739703608f..4087a569b43 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -31,7 +31,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
diff --git a/arch/alpha/lib/dec_and_lock.c b/arch/alpha/lib/dec_and_lock.c
index 0f5520d2f45..f9f5fe830e9 100644
--- a/arch/alpha/lib/dec_and_lock.c
+++ b/arch/alpha/lib/dec_and_lock.c
@@ -6,7 +6,7 @@
  */
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 asm (".text					\n\
	.global _atomic_dec_and_lock		\n\
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 7e79503ab89..4d501f1bdc9 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -217,7 +217,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 		c = old;
 	return c != u;
 }
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 #define atomic_inc(v)		atomic_add(1, v)
 #define atomic_dec(v)		atomic_sub(1, v)
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 167e3cbe1f2..d88ff0230e8 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -27,7 +27,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
 #include
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 2d3436e9f71..bc9f9da782c 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -25,7 +25,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
 #include
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index ea53f4d9b28..4159eca7894 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -20,7 +20,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c
index 9f2a948e0e7..0ca00050666 100644
--- a/arch/arm/mach-bcmring/dma.c
+++ b/arch/arm/mach-bcmring/dma.c
@@ -34,7 +34,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 /* I don't quite understand why dc4 fails when this is set to 1 and DMA is enabled */
diff --git a/arch/arm/mach-cns3xxx/include/mach/pm.h b/arch/arm/mach-cns3xxx/include/mach/pm.h
index 6eae7f764d1..c2588cc991d 100644
--- a/arch/arm/mach-cns3xxx/include/mach/pm.h
+++ b/arch/arm/mach-cns3xxx/include/mach/pm.h
@@ -11,7 +11,7 @@
 #ifndef __CNS3XXX_PM_H
 #define __CNS3XXX_PM_H
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 void cns3xxx_pwr_clk_en(unsigned int block);
 void cns3xxx_pwr_clk_dis(unsigned int block);
diff --git a/arch/arm/mach-cns3xxx/pm.c b/arch/arm/mach-cns3xxx/pm.c
index 5e579552aa5..0c04678615c 100644
--- a/arch/arm/mach-cns3xxx/pm.c
+++ b/arch/arm/mach-cns3xxx/pm.c
@@ -10,7 +10,7 @@
 #include
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
 #include
diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c
index 98ba9784aa1..495b3987d46 100644
--- a/arch/arm/mach-omap1/pm.c
+++ b/arch/arm/mach-omap1/pm.c
@@ -44,7 +44,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
diff --git a/arch/arm/mach-s3c2440/clock.c b/arch/arm/mach-s3c2440/clock.c
index 554e0d3ec70..f9e6bdaf41d 100644
--- a/arch/arm/mach-s3c2440/clock.c
+++ b/arch/arm/mach-s3c2440/clock.c
@@ -36,7 +36,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
diff --git a/arch/arm/mach-s3c2440/s3c2442.c
b/arch/arm/mach-s3c2440/s3c2442.c index 6224bad4d60..9ad99f8016a 100644 --- a/arch/arm/mach-s3c2440/s3c2442.c +++ b/arch/arm/mach-s3c2440/s3c2442.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c2440/s3c244x-clock.c b/arch/arm/mach-s3c2440/s3c244x-clock.c index f8d96130d1d..7f5ea0a169a 100644 --- a/arch/arm/mach-s3c2440/s3c244x-clock.c +++ b/arch/arm/mach-s3c2440/s3c244x-clock.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h index bbce6a1c6bb..f229c3849f0 100644 --- a/arch/avr32/include/asm/atomic.h +++ b/arch/avr32/include/asm/atomic.h @@ -188,7 +188,6 @@ static inline int atomic_sub_if_positive(int i, atomic_t *v) #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #define atomic_add_negative(i, v) (atomic_add_return(i, v) < 0) -#define atomic_inc_not_zero(v) atomic_add_unless(v, 1, 0) #define atomic_dec_if_positive(v) atomic_sub_if_positive(1, v) #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/blackfin/include/asm/atomic.h b/arch/blackfin/include/asm/atomic.h index 4c707dbe1ff..f2cf5b714ea 100644 --- a/arch/blackfin/include/asm/atomic.h +++ b/arch/blackfin/include/asm/atomic.h @@ -97,7 +97,6 @@ static inline void atomic_set_mask(int mask, atomic_t *v) c = old; \ c != (u); \ }) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* * atomic_inc_and_test - increment and test diff --git a/arch/blackfin/include/asm/dma.h b/arch/blackfin/include/asm/dma.h index d9dbc1a5353..dac0c97242b 100644 --- a/arch/blackfin/include/asm/dma.h +++ b/arch/blackfin/include/asm/dma.h @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/blackfin/include/asm/ipipe.h b/arch/blackfin/include/asm/ipipe.h index 9e0cc0e2534..17b5e92e3bc 100644 --- a/arch/blackfin/include/asm/ipipe.h +++ b/arch/blackfin/include/asm/ipipe.h @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index 2336093fca2..490c7caa02d 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -11,7 +11,7 @@ # include #else -#include +#include asmlinkage int __raw_spin_is_locked_asm(volatile int *ptr); asmlinkage void __raw_spin_lock_asm(volatile int *ptr); diff --git a/arch/blackfin/kernel/ftrace.c b/arch/blackfin/kernel/ftrace.c index 48808a12b42..9277905b82c 100644 --- a/arch/blackfin/kernel/ftrace.c +++ b/arch/blackfin/kernel/ftrace.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/blackfin/kernel/ipipe.c b/arch/blackfin/kernel/ipipe.c index 486426f8a0d..dbe11220cc5 100644 --- a/arch/blackfin/kernel/ipipe.c +++ b/arch/blackfin/kernel/ipipe.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include DEFINE_PER_CPU(struct pt_regs, __ipipe_tick_regs); diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 679d0db3525..9919d29287d 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 1c143a4de5f..107622aacf6 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include 
#include #include diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index c03bc3bc30c..642c6fed43d 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index a0843a71aae..0b99df72d2a 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h index 88dc9b9c4ba..ce9f67e4d97 100644 --- a/arch/cris/include/asm/atomic.h +++ b/arch/cris/include/asm/atomic.h @@ -150,7 +150,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) cris_atomic_restore(v, flags); return ret != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* Atomic operations are already serializing */ #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/cris/include/asm/bitops.h b/arch/cris/include/asm/bitops.h index c0092fc7d84..a78a2d70cd8 100644 --- a/arch/cris/include/asm/bitops.h +++ b/arch/cris/include/asm/bitops.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include /* diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index c99aeab7cef..aa585e4e979 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -12,7 +12,7 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include +#include #include #include #include diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h index fae32c7fdcb..b07b75f411f 100644 --- a/arch/frv/include/asm/atomic.h +++ b/arch/frv/include/asm/atomic.h @@ -256,7 +256,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #include #endif /* _ASM_ATOMIC_H */ diff --git a/arch/frv/include/asm/hardirq.h b/arch/frv/include/asm/hardirq.h index 5fc8b6f5bc5..c62833d6ebb 100644 --- a/arch/frv/include/asm/hardirq.h +++ b/arch/frv/include/asm/hardirq.h @@ -12,7 +12,7 @@ #ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H -#include +#include extern atomic_t irq_err_count; static inline void ack_bad_irq(int irq) diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c index a5f624a9f55..3facbc28cbb 100644 --- a/arch/frv/kernel/irq.c +++ b/arch/frv/kernel/irq.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index 984221abb66..b641714774e 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -116,7 +116,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) local_irq_restore(flags); return ret != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ void atomic_clear_mask(unsigned long mask, unsigned long *v) { diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 44688143967..fdb887005df 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -105,7 +105,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u) { diff --git a/arch/ia64/include/asm/processor.h 
b/arch/ia64/include/asm/processor.h index 03afe797074..d9f397fae03 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -75,7 +75,7 @@ #include #include #include -#include +#include #ifdef CONFIG_NUMA #include #endif diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 1a91c9121d1..b77768d35f9 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index be450a3e987..0bd537b4ea6 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 14ec641003d..55909798667 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index c4696d217ce..6a867dc45c0 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h index d44a51e5271..d64d894dc54 100644 --- a/arch/m32r/include/asm/atomic.h +++ b/arch/m32r/include/asm/atomic.h @@ -262,7 +262,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ void atomic_clear_mask(unsigned long mask, atomic_t *addr) { diff --git a/arch/m32r/include/asm/mmu_context.h b/arch/m32r/include/asm/mmu_context.h index a70a3df3363..a979a419816 100644 --- a/arch/m32r/include/asm/mmu_context.h +++ b/arch/m32r/include/asm/mmu_context.h @@ -11,7 +11,7 @@ #ifndef __ASSEMBLY__ -#include +#include #include #include #include diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 179a06489b1..b0ea2f26da3 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -10,7 +10,7 @@ */ #include -#include +#include #include /* diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 092d40a6708..ce7aea34fdf 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index fbd109031df..ee6a9199561 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h index 307a573881a..e844a2d2ba2 100644 --- a/arch/m68k/include/asm/atomic.h +++ b/arch/m68k/include/asm/atomic.h @@ -198,7 +198,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* Atomic operations are already serializing */ #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h index 3e5c254e8d1..d6864774644 100644 --- a/arch/microblaze/include/asm/mmu_context_mm.h +++ b/arch/microblaze/include/asm/mmu_context_mm.h @@ -11,7 +11,7 @@ #ifndef _ASM_MICROBLAZE_MMU_CONTEXT_H #define _ASM_MICROBLAZE_MMU_CONTEXT_H -#include 
+#include #include #include #include diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 9bd01ecb00d..9ad567e2d42 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -21,7 +21,7 @@ #include #include -#include +#include #define HAVE_ARCH_DEVTREE_FIXUPS diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 4a02fe891ab..833a4023648 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -325,7 +325,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) } return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_dec_return(v) atomic_sub_return(1, (v)) #define atomic_inc_return(v) atomic_add_return(1, (v)) diff --git a/arch/mips/include/asm/hw_irq.h b/arch/mips/include/asm/hw_irq.h index 77adda297ad..9e8ef5994c9 100644 --- a/arch/mips/include/asm/hw_irq.h +++ b/arch/mips/include/asm/hw_irq.h @@ -8,7 +8,7 @@ #ifndef __ASM_HW_IRQ_H #define __ASM_HW_IRQ_H -#include +#include extern atomic_t irq_err_count; diff --git a/arch/mips/include/asm/local.h b/arch/mips/include/asm/local.h index fffc8307a80..94fde8d0fac 100644 --- a/arch/mips/include/asm/local.h +++ b/arch/mips/include/asm/local.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index af42385245d..d4fb4d852a6 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include extern int smp_num_siblings; diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c index 9b734d74ae8..b53970d8099 100644 --- a/arch/mips/kernel/irq.c +++ b/arch/mips/kernel/irq.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index b2259e7cd82..594ca69cb86 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c index 557ef72472e..7a80b7cda7c 100644 --- a/arch/mips/kernel/rtlx.c +++ b/arch/mips/kernel/rtlx.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c index cc81771b882..fe309516065 100644 --- a/arch/mips/kernel/smp-cmp.c +++ b/arch/mips/kernel/smp-cmp.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c index 1ec56e635d0..ce9e286f0a7 100644 --- a/arch/mips/kernel/smp-mt.c +++ b/arch/mips/kernel/smp-mt.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 32a25610108..32c1e954cd3 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smtc-proc.c b/arch/mips/kernel/smtc-proc.c index fe256559c99..928a5a61e1a 100644 --- a/arch/mips/kernel/smtc-proc.c +++ b/arch/mips/kernel/smtc-proc.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index cedac463374..f0895e70e28 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -30,7 
+30,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/sync-r4k.c b/arch/mips/kernel/sync-r4k.c index 05dd170a83f..99f913c8d7a 100644 --- a/arch/mips/kernel/sync-r4k.c +++ b/arch/mips/kernel/sync-r4k.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index dbb6b408f00..2cd50ad0d5c 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/mipssim/sim_smtc.c b/arch/mips/mipssim/sim_smtc.c index 30df47258c2..915063991f6 100644 --- a/arch/mips/mipssim/sim_smtc.c +++ b/arch/mips/mipssim/sim_smtc.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/sgi-ip27/ip27-nmi.c b/arch/mips/sgi-ip27/ip27-nmi.c index bc4fa8dd67f..005c29ed419 100644 --- a/arch/mips/sgi-ip27/ip27-nmi.c +++ b/arch/mips/sgi-ip27/ip27-nmi.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h index 9d773a63951..041b9d69d86 100644 --- a/arch/mn10300/include/asm/atomic.h +++ b/arch/mn10300/include/asm/atomic.h @@ -269,7 +269,6 @@ static inline void atomic_dec(atomic_t *v) c != (u); \ }) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /** * atomic_clear_mask - Atomically clear bits in memory diff --git a/arch/mn10300/include/asm/mmu_context.h b/arch/mn10300/include/asm/mmu_context.h index c8f6c82672a..c67c2b5365a 100644 --- a/arch/mn10300/include/asm/mmu_context.h +++ b/arch/mn10300/include/asm/mmu_context.h @@ -22,7 +22,7 @@ #ifndef _ASM_MMU_CONTEXT_H #define _ASM_MMU_CONTEXT_H -#include +#include #include #include #include diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 93429154e89..1ae580f3893 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -11,7 +11,7 @@ #ifndef _ASM_SPINLOCK_H #define _ASM_SPINLOCK_H -#include +#include #include #include diff --git a/arch/mn10300/include/asm/system.h b/arch/mn10300/include/asm/system.h index 8ff3e5aaca4..94b4c5e1491 100644 --- a/arch/mn10300/include/asm/system.h +++ b/arch/mn10300/include/asm/system.h @@ -19,7 +19,7 @@ #include #include -#include +#include #if !defined(CONFIG_LAZY_SAVE_FPU) struct fpu_state_struct; diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c index c5e12bfd9fc..a45f0c7549a 100644 --- a/arch/mn10300/kernel/mn10300-watchdog.c +++ b/arch/mn10300/kernel/mn10300-watchdog.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index bd3e5e73826..9220a75a7b4 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/mm/misalignment.c b/arch/mn10300/mm/misalignment.c index eef989c1d0c..f9bb8cb1c14 100644 --- a/arch/mn10300/mm/misalignment.c +++ b/arch/mn10300/mm/misalignment.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/proc-mn2ws0050/proc-init.c b/arch/mn10300/proc-mn2ws0050/proc-init.c index c58249b9525..fe6e24906ff 100644 --- a/arch/mn10300/proc-mn2ws0050/proc-init.c +++ 
b/arch/mn10300/proc-mn2ws0050/proc-init.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index f81955934ae..192488999b6 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -220,7 +220,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_add(i,v) ((void)(__atomic_add_return( (i),(v)))) #define atomic_sub(i,v) ((void)(__atomic_add_return(-(i),(v)))) diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 4e833aa05a4..8c9b631d2a7 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -8,7 +8,7 @@ #include #include /* for BITS_PER_LONG/SHIFT_PER_LONG */ #include -#include +#include /* * HP-PARISC specific bit operations diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index 354b2aca990..59be2576443 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index df653663d3d..a7bb757a549 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -31,7 +31,7 @@ #include EXPORT_SYMBOL(memset); -#include +#include EXPORT_SYMBOL(__xchg8); EXPORT_SYMBOL(__xchg32); EXPORT_SYMBOL(__cmpxchg_u32); diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 828305f19cf..32d588488f0 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 8b58bf0b7d5..f19e6604026 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/parisc/lib/bitops.c b/arch/parisc/lib/bitops.c index 353963d4205..a8bffd8af77 100644 --- a/arch/parisc/lib/bitops.c +++ b/arch/parisc/lib/bitops.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #ifdef CONFIG_SMP arch_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned = { diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index b8f152ece02..b2bcbee622e 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -212,7 +212,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return t != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_sub_and_test(a, v) (atomic_sub_return((a), (v)) == 0) #define atomic_dec_and_test(v) (atomic_dec_return((v)) == 0) diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h index 2cc41c715d2..63f2a22e995 100644 --- a/arch/powerpc/include/asm/emulated_ops.h +++ b/arch/powerpc/include/asm/emulated_ops.h @@ -18,7 +18,7 @@ #ifndef _ASM_POWERPC_EMULATED_OPS_H #define _ASM_POWERPC_EMULATED_OPS_H -#include +#include #include diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index c57a28e52b6..c0e1bc319e3 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -14,7 +14,7 @@ #include #include -#include +#include /* Define a way to iterate 
across irqs. */ diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h index c2410af6bfd..b8da9136386 100644 --- a/arch/powerpc/include/asm/local.h +++ b/arch/powerpc/include/asm/local.h @@ -2,7 +2,7 @@ #define _ARCH_POWERPC_LOCAL_H #include -#include +#include typedef struct { diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index b823536375d..b5c91901e38 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -18,7 +18,7 @@ */ #include #include -#include +#include #define HAVE_ARCH_DEVTREE_FIXUPS diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 24582181b6e..59dbf6abaaf 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #ifdef CONFIG_PPC_OF_PLATFORM_PCI diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 7d28f540200..f5ae872a2ef 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 0e0ea941156..d5ca8236315 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 67f6c3b5135..481ef064c8f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c index 03e45c4a9ef..640de836e46 100644 --- a/arch/powerpc/kernel/smp-tbsync.c +++ b/arch/powerpc/kernel/smp-tbsync.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f932f8a0cf0..7bf2187dfd9 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c index f8fa2fc3129..c55129f5760 100644 --- a/arch/powerpc/platforms/83xx/km83xx.c +++ b/arch/powerpc/platforms/83xx/km83xx.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc832x_mds.c b/arch/powerpc/platforms/83xx/mpc832x_mds.c index 93e60f1f21a..32a52896822 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc832x_mds.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc834x_itx.c b/arch/powerpc/platforms/83xx/mpc834x_itx.c index 81e44fa1c64..6b45969567d 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_itx.c +++ b/arch/powerpc/platforms/83xx/mpc834x_itx.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc834x_mds.c b/arch/powerpc/platforms/83xx/mpc834x_mds.c index c1b1dc50b32..041c5177e73 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc834x_mds.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc836x_mds.c 
b/arch/powerpc/platforms/83xx/mpc836x_mds.c index 81c052b1353..934cc8c46bb 100644 --- a/arch/powerpc/platforms/83xx/mpc836x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc836x_mds.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/sbc834x.c b/arch/powerpc/platforms/83xx/sbc834x.c index 49023dbe157..af41d8c810a 100644 --- a/arch/powerpc/platforms/83xx/sbc834x.c +++ b/arch/powerpc/platforms/83xx/sbc834x.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c index 6299a2a51ae..2bf99786d24 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index 747d1ee661f..973b3f4a4b4 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/sbc8548.c b/arch/powerpc/platforms/85xx/sbc8548.c index ecdd8c09e4e..d07dcb7f4ee 100644 --- a/arch/powerpc/platforms/85xx/sbc8548.c +++ b/arch/powerpc/platforms/85xx/sbc8548.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index d809836bcf5..7f92096fe96 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index dbb641ea90d..f2e1dfe4bf3 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c index 0c87bcd2452..bf4d41d8fa1 100644 --- a/arch/powerpc/platforms/cell/spufs/context.c +++ b/arch/powerpc/platforms/cell/spufs/context.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include "spufs.h" diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c index a800122e4dd..feab30bbae2 100644 --- a/arch/powerpc/platforms/chrp/smp.c +++ b/arch/powerpc/platforms/chrp/smp.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/iseries/smp.c b/arch/powerpc/platforms/iseries/smp.c index 2df48c2287b..8bda9be06fa 100644 --- a/arch/powerpc/platforms/iseries/smp.c +++ b/arch/powerpc/platforms/iseries/smp.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/backlight.c b/arch/powerpc/platforms/powermac/backlight.c index d679964ae2a..c2f3e861f5e 100644 --- a/arch/powerpc/platforms/powermac/backlight.c +++ b/arch/powerpc/platforms/powermac/backlight.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index d15fca32297..9a521dc8e48 100644 --- a/arch/powerpc/platforms/powermac/smp.c 
+++ b/arch/powerpc/platforms/powermac/smp.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 46b55cf563e..ada6e07532e 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c index 8ed0d2d0e1b..fc5ae767989 100644 --- a/arch/powerpc/platforms/pseries/eeh_cache.c +++ b/arch/powerpc/platforms/pseries/eeh_cache.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 1672db2d1b0..4e44c4dcd11 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index 265313e8396..2d66275e489 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index ee056807b52..9f51f97abb5 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index d9db13810d1..29d75632922 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -108,7 +108,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #undef __CS_LOOP diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 1ca3d1d6a86..45df6d456aa 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index e9372c77cce..ffabcd9d336 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index c7983124d99..8ddb2635cf9 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -30,7 +30,6 @@ #define atomic_inc_and_test(v) (atomic_inc_return(v) == 0) #define atomic_sub_and_test(i,v) (atomic_sub_return((i), (v)) == 0) #define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_inc(v) atomic_add(1, (v)) #define atomic_dec(v) atomic_sub(1, (v)) diff --git a/arch/sh/include/asm/hw_irq.h b/arch/sh/include/asm/hw_irq.h index 603cdde813d..693d4418405 100644 --- a/arch/sh/include/asm/hw_irq.h +++ b/arch/sh/include/asm/hw_irq.h @@ -3,7 +3,7 @@ #include #include -#include +#include extern atomic_t irq_err_count; diff --git a/arch/sh/include/asm/smp.h b/arch/sh/include/asm/smp.h index 9070d943ddd..78b0d0f4b24 100644 --- a/arch/sh/include/asm/smp.h +++ b/arch/sh/include/asm/smp.h @@ -8,7 +8,7 @@ #ifdef CONFIG_SMP #include -#include +#include #include #include diff --git a/arch/sh/kernel/idle.c 
b/arch/sh/kernel/idle.c index 425d604e3a2..84db0d6ccd0 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include void (*pm_idle)(void) = NULL; diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 6207561ea34..3147a9a6fb8 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 67110be83fd..cd3a4048329 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/unwinder.c b/arch/sh/kernel/unwinder.c index 468889d958f..521b5432471 100644 --- a/arch/sh/kernel/unwinder.c +++ b/arch/sh/kernel/unwinder.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include /* * This is the most basic stack unwinder an architecture can diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 7ae128b19d3..7646f2cef5d 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -52,7 +52,6 @@ extern void atomic_set(atomic_t *, int); #define atomic_dec_and_test(v) (atomic_dec_return(v) == 0) #define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* This is the old 24-bit implementation. It's still used internally * by some sparc-specific code, notably the semaphore implementation. diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index bdb2ff880bd..337139ef91b 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -85,7 +85,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic64_cmpxchg(v, o, n) \ ((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n))) diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h index 56bbaadef64..edd3d3cde46 100644 --- a/arch/sparc/include/asm/prom.h +++ b/arch/sparc/include/asm/prom.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #define OF_ROOT_NODE_ADDR_CELLS_DEFAULT 2 #define OF_ROOT_NODE_SIZE_CELLS_DEFAULT 1 diff --git a/arch/sparc/include/asm/smp_32.h b/arch/sparc/include/asm/smp_32.h index 093f10843ff..01c51c70434 100644 --- a/arch/sparc/include/asm/smp_32.h +++ b/arch/sparc/include/asm/smp_32.h @@ -22,7 +22,7 @@ #include #include -#include +#include /* * Private routines/data diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h index 20bca895071..29862a9e906 100644 --- a/arch/sparc/include/asm/smp_64.h +++ b/arch/sparc/include/asm/smp_64.h @@ -27,7 +27,7 @@ */ #include -#include +#include #include DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 4e78862d12f..0dd8422a469 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c index fe8fb44c609..1210fde1874 100644 --- a/arch/sparc/kernel/leon_smp.c +++ b/arch/sparc/kernel/leon_smp.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include diff --git a/arch/sparc/kernel/perf_event.c 
b/arch/sparc/kernel/perf_event.c
index 62a034318b1..171e8d84dc3 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 21b125341bf..f671e7fd6dd 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -22,7 +22,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 99cb17251bb..4a442c32e11 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -28,7 +28,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
 #include
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index d3c7a12ad87..1a371f8ae0b 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -7,7 +7,7 @@
  * Based on asm-parisc/atomic.h Copyright (C) 2000 Philipp Rumpf
  */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
diff --git a/arch/tile/include/asm/atomic.h b/arch/tile/include/asm/atomic.h
index 739cfe0499d..e3272715c3c 100644
--- a/arch/tile/include/asm/atomic.h
+++ b/arch/tile/include/asm/atomic.h
@@ -121,15 +121,6 @@ static inline int atomic_read(const atomic_t *v)
  */
 #define atomic_add_negative(i, v)	(atomic_add_return((i), (v)) < 0)
-/**
- * atomic_inc_not_zero - increment unless the number is zero
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1, so long as @v is non-zero.
- * Returns non-zero if @v was non-zero, and zero otherwise.
- */
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
-
 /* Nonexistent functions intended to cause link errors. */
 extern unsigned long __xchg_called_with_bad_pointer(void);
 extern unsigned long __cmpxchg_called_with_bad_pointer(void);
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index 92a8bee3231..246feed4794 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -11,7 +11,7 @@
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
- * Do not include directly; use <asm/atomic.h>.
+ * Do not include directly; use <linux/atomic.h>.
 */
 #ifndef _ASM_TILE_ATOMIC_32_H
@@ -21,7 +21,7 @@
 #ifndef __ASSEMBLY__
-/* Tile-specific routines to support <asm/atomic.h>. */
+/* Tile-specific routines to support <linux/atomic.h>. */
 int _atomic_xchg(atomic_t *v, int n);
 int _atomic_xchg_add(atomic_t *v, int i);
 int _atomic_xchg_add_unless(atomic_t *v, int a, int u);
diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index 1c1e60d8ccb..a48dda30cbc 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -11,7 +11,7 @@
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
- * Do not include directly; use <asm/atomic.h>.
+ * Do not include directly; use <linux/atomic.h>.
 */
 #ifndef _ASM_TILE_ATOMIC_64_H
diff --git a/arch/tile/include/asm/bitops_32.h b/arch/tile/include/asm/bitops_32.h
index d31ab905cfa..571b118bfd9 100644
--- a/arch/tile/include/asm/bitops_32.h
+++ b/arch/tile/include/asm/bitops_32.h
@@ -16,7 +16,7 @@
 #define _ASM_TILE_BITOPS_32_H
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 /* Tile-specific routines to support . */
diff --git a/arch/tile/include/asm/bitops_64.h b/arch/tile/include/asm/bitops_64.h
index 68f8c5bc067..e9c8e381ee0 100644
--- a/arch/tile/include/asm/bitops_64.h
+++ b/arch/tile/include/asm/bitops_64.h
@@ -16,7 +16,7 @@
 #define _ASM_TILE_BITOPS_64_H
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 /* See for API comments.
*/ diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index a8f2c6e31a8..a5e4208d34f 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -17,7 +17,7 @@ #ifndef _ASM_TILE_SPINLOCK_32_H #define _ASM_TILE_SPINLOCK_32_H -#include +#include #include #include #include diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S index 72ade79b621..fc94607f0bd 100644 --- a/arch/tile/kernel/intvec_32.S +++ b/arch/tile/kernel/intvec_32.S @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c index 46570211df5..771b251b409 100644 --- a/arch/tile/lib/atomic_32.c +++ b/arch/tile/lib/atomic_32.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 24448734f6f..1f75a2a5610 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -70,7 +70,7 @@ */ #include -#include +#include #include #include diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 5852519b2d0..f6f5c53dc90 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4a0b7c7e2cc..7b3ca8324b6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 952a826ac4e..897969bdd4e 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -244,7 +244,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* * atomic_dec_if_positive - decrement by 1 if old value positive diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 13f5504c76c..09199052060 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 2e9972468a5..9cdae5d47e8 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include typedef struct { diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 716b48af786..c9321f34e55 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -124,7 +124,7 @@ extern struct atomic_notifier_head x86_mce_decoder_chain; #include #include -#include +#include extern int mce_disabled; extern int mce_p5_enabled; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8b5393ec108..69021528b43 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -2,7 +2,7 @@ #define _ASM_X86_MMU_CONTEXT_H #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index df1287019e6..644dd885f05 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 
e9e51f710e6..ee67edf86fd 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H -#include +#include #include #include #include diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1f2e61e2898..a1fe5c127b5 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -21,7 +21,7 @@ struct task_struct; struct exec_domain; #include #include -#include +#include struct thread_info { struct task_struct *task; /* main task structure */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b117efd24f7..8a439d364b9 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b24be38c8cf..52fa56399a5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 9536b3fe43f..5d513bc47b6 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -48,7 +48,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22a073d7fbf..62184390a60 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 65b8f5c2eeb..610485223bd 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f09d4bbe2d2..b3300e6bace 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fbc097a085c..9682ec50180 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2b2255b1f04..57dcbd4308f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index abd86e865be..ae432ea1cd8 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "kvm_timer.h" static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 540179e8e9f..042f6826bf5 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -4,7 +4,7 @@ #include #include -#include +#include long long atomic64_read_cx8(long long, const atomic64_t *v); EXPORT_SYMBOL(atomic64_read_cx8); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3adff7dcc14..67421f38a21 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -34,7 +34,7 @@ #include #include 
#include /* for ISA_START_ADDRESS */ -#include +#include #include #include diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index a96a0619d0b..7cca2fb18ba 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -248,7 +248,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static inline void atomic_clear_mask(unsigned int mask, atomic_t *v) { diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index e3558b9a58b..47041e7c088 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 940d70cb5c2..ac33d5f3077 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -12,7 +12,7 @@ * */ -#include +#include #include #include #include diff --git a/crypto/proc.c b/crypto/proc.c index 58fef67d4f4..3808697814d 100644 --- a/crypto/proc.c +++ b/crypto/proc.c @@ -13,7 +13,7 @@ * */ -#include +#include #include #include #include diff --git a/crypto/rng.c b/crypto/rng.c index f93cb531118..45229ae782b 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -12,7 +12,7 @@ * */ -#include +#include #include #include #include diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c index bb3b016b6ce..f8f41e0e8a8 100644 --- a/drivers/atm/ambassador.c +++ b/drivers/atm/ambassador.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index 0b0625054a8..b22d71cac54 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include extern int atm_init_aal5(struct atm_vcc *vcc); /* "raw" AAL5 transport */ diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 3230ea0df83..93071417315 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/atm/eni.h b/drivers/atm/eni.h index 493a6932507..dc9a62cc260 100644 --- a/drivers/atm/eni.h +++ b/drivers/atm/eni.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "midway.h" diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 7c7b571647f..5072f8ac16f 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index bc9e702186d..361f5aee3be 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #ifdef CONFIG_SBUS #include diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 28750618389..b81210330ac 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index be0dbfeb541..db06f34419c 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -46,7 +46,7 @@ #include #include -#include +#include #include #ifdef CONFIG_ATM_IDT77252_USE_SUNI diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index 957106f636e..cb90f7a3e07 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -58,7 +58,7 @@ #include #include #include -#include +#include #include #include 
diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
index 6b313ee9231..1c70c45fa04 100644
--- a/drivers/atm/nicstar.c
+++ b/drivers/atm/nicstar.c
@@ -51,7 +51,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "nicstar.h"
 #ifdef CONFIG_ATM_NICSTAR_USE_SUNI
 #include "suni.h"
diff --git a/drivers/atm/suni.c b/drivers/atm/suni.c
index 41c56eae4c8..90f1ccca9e5 100644
--- a/drivers/atm/suni.c
+++ b/drivers/atm/suni.c
@@ -25,7 +25,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "suni.h"
diff --git a/drivers/atm/uPD98402.c b/drivers/atm/uPD98402.c
index c45ae0573bb..5120a96b3a8 100644
--- a/drivers/atm/uPD98402.c
+++ b/drivers/atm/uPD98402.c
@@ -11,7 +11,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "uPD98402.h"
diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c
index 7f8c5132ff3..d889f56e8d8 100644
--- a/drivers/atm/zatm.c
+++ b/drivers/atm/zatm.c
@@ -27,7 +27,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "uPD98401.h"
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 45d7c8fc73b..2840ed4668c 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -24,7 +24,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 static DEFINE_MUTEX(mem_sysfs_mutex);
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index 942d6a7c9ae..17b7934f31c 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -5,7 +5,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "power.h"
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 696100241a6..951a4e33b92 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -33,7 +33,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 320668f4c3a..3302586655c 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -52,7 +52,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #ifdef CONFIG_X86
 /*
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index 25d139c9dbe..5c0d96a820f 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -46,7 +46,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 0debc17c8e2..3ee1fdb31ea 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -29,7 +29,8 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
+
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
index aab970760b7..86ad2eee120 100644
--- a/drivers/edac/edac_stub.c
+++ b/drivers/edac/edac_stub.c
@@ -14,7 +14,7 @@
 */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 int edac_op_state = EDAC_OPSTATE_INVAL;
diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 29d2423fae6..85661b060ed 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -32,7 +32,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "core.h"
diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 95a47140189..8ba7f7928f1 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -38,7 +38,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index 193ed923314..94d3b494ddf 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -29,7 +29,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 0fe4e4e6eda..b45be576752 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -9,7 +9,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 struct device;
 struct fw_card;
diff --git a/drivers/firewire/nosy.c b/drivers/firewire/nosy.c
index 0618145376a..763626b739d 100644
--- a/drivers/firewire/nosy.c
+++ b/drivers/firewire/nosy.c
@@ -37,7 +37,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "nosy.h"
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index ef37a9b5a3c..32807baf55e 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -60,7 +60,7 @@
 * are considered as fatal)
 */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 021d2b6b556..7fd4e3e5ad5 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -29,7 +29,7 @@
 * Dave Airlie
 */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 2e618b5ac46..56619f64b6b 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -37,7 +37,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define TTM_ASSERT_LOCKED(param)
 #define TTM_DEBUG(fmt, arg...)
diff --git a/drivers/gpu/drm/ttm/ttm_lock.c b/drivers/gpu/drm/ttm/ttm_lock.c
index de41e55a944..075daf44bce 100644
--- a/drivers/gpu/drm/ttm/ttm_lock.c
+++ b/drivers/gpu/drm/ttm/ttm_lock.c
@@ -30,7 +30,7 @@
 #include "ttm/ttm_lock.h"
 #include "ttm/ttm_module.h"
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c
index ebddd443d91..93577f2e295 100644
--- a/drivers/gpu/drm/ttm/ttm_object.c
+++ b/drivers/gpu/drm/ttm/ttm_object.c
@@ -55,7 +55,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 struct ttm_object_file {
 	struct ttm_object_device *tdev;
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 170e751c283..727e93daac3 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -40,7 +40,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "ttm/ttm_bo_driver.h"
 #include "ttm/ttm_page_alloc.h"
diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c
index 7d231cf5d2c..fe4104c6b76 100644
--- a/drivers/hwmon/sht15.c
+++ b/drivers/hwmon/sht15.c
@@ -32,7 +32,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 /* Commands */
 #define SHT15_MEASURE_TEMP 0x03
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 0347eed4a16..40c835309e4 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -31,7 +31,7 @@
 */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "iw_cxgb4.h"
diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h
index f09914cccf5..54c0d23bad9 100644
--- a/drivers/infiniband/hw/ehca/ehca_tools.h
+++ b/drivers/infiniband/hw/ehca/ehca_tools.h
@@ -58,7 +58,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 73bc18465c9..c118663e443 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -34,7 +34,7 @@
 #define TCPOPT_TIMESTAMP 8
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 7b6985a2e65..b3cc1e062b1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -45,7 +45,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 7d5109bbd1a..0bfa545675b 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -39,7 +39,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/isdn/gigaset/gigaset.h b/drivers/isdn/gigaset/gigaset.h
index 6dd360734cf..212efaf9a4e 100644
--- a/drivers/isdn/gigaset/gigaset.h
+++ b/drivers/isdn/gigaset/gigaset.h
@@ -34,7 +34,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define GIG_VERSION {0, 5, 0, 0}
 #define GIG_COMPAT {0, 4, 0, 0}
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c8827ffd85b..bae6c4e23d3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -19,7 +19,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 819e37eaaeb..320401dec10 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -10,7 +10,7 @@
 */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index aa4e570c2cb..c3547016f0f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -19,7 +19,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define DM_MSG_PREFIX "multipath"
 #define MESG_STR(x) x, sizeof(x)
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index f92b6cea9d9..03a837aa5ce 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -20,7 +20,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define DM_MSG_PREFIX "multipath queue-length"
 #define QL_MIN_IO 128
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 451c3bb176d..bfe9c2333ce 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -17,7 +17,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define DM_MSG_PREFIX "table"
diff --git a/drivers/media/video/hdpvr/hdpvr-core.c b/drivers/media/video/hdpvr/hdpvr-core.c
index a27d93b503a..5f1db46beb4 100644
--- a/drivers/media/video/hdpvr/hdpvr-core.c
+++ b/drivers/media/video/hdpvr/hdpvr-core.c
@@ -17,7 +17,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/media/video/tlg2300/pd-dvb.c b/drivers/media/video/tlg2300/pd-dvb.c
index edd78f8b1ba..d0da11ae19d 100644
--- a/drivers/media/video/tlg2300/pd-dvb.c
+++ b/drivers/media/video/tlg2300/pd-dvb.c
@@ -7,7 +7,7 @@
 #include "vendorcmds.h"
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 static void dvb_urb_cleanup(struct pd_dvb_adapter *pd_dvb);
diff --git a/drivers/media/video/uvc/uvc_ctrl.c b/drivers/media/video/uvc/uvc_ctrl.c
index a4db26fa2f5..2c8954ec685 100644
--- a/drivers/media/video/uvc/uvc_ctrl.c
+++ b/drivers/media/video/uvc/uvc_ctrl.c
@@ -20,7 +20,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "uvcvideo.h"
diff --git a/drivers/media/video/uvc/uvc_queue.c b/drivers/media/video/uvc/uvc_queue.c
index f90ce9fce53..677691c4450 100644
--- a/drivers/media/video/uvc/uvc_queue.c
+++ b/drivers/media/video/uvc/uvc_queue.c
@@ -19,7 +19,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "uvcvideo.h"
diff --git a/drivers/media/video/uvc/uvc_v4l2.c b/drivers/media/video/uvc/uvc_v4l2.c
index 543a80395b7..dde6533e8e6 100644
--- a/drivers/media/video/uvc/uvc_v4l2.c
+++ b/drivers/media/video/uvc/uvc_v4l2.c
@@ -21,7 +21,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/media/video/uvc/uvc_video.c b/drivers/media/video/uvc/uvc_video.c
index 49994793cc7..8244167c891 100644
--- a/drivers/media/video/uvc/uvc_video.c
+++ b/drivers/media/video/uvc/uvc_video.c
@@ -19,7 +19,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 74fbe56321f..c8ed7b63fdf 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -59,7 +59,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c
index b05db55c8c8..21b28fc6d91 100644
--- a/drivers/misc/phantom.c
+++ b/drivers/misc/phantom.c
@@ -26,7 +26,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define PHANTOM_VERSION "n0.9.8"
diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c
index 6f0e9403004..97e6954304e 100644
--- a/drivers/net/atlx/atl1.c
+++ b/drivers/net/atlx/atl1.c
@@ -44,7 +44,7 @@
 * SMP torture testing
 */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/atlx/atl2.c b/drivers/net/atlx/atl2.c
index e0f87cf1e2b..d4f7dda3972 100644
--- a/drivers/net/atlx/atl2.c
+++ b/drivers/net/atlx/atl2.c
@@ -20,7 +20,7 @@
 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/atlx/atl2.h b/drivers/net/atlx/atl2.h
index 78344ddf4bf..bf9016ebdd9 100644
--- a/drivers/net/atlx/atl2.h
+++ b/drivers/net/atlx/atl2.h
@@ -25,7 +25,7 @@
 #ifndef _ATL2_H_
 #define _ATL2_H_
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #ifndef _ATL2_HW_H_
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index b414f5ae0da..646c86bcc54 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -98,7 +98,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c
index 086ce0418b2..e0638cb4b07 100644
--- a/drivers/net/cpmac.c
+++ b/drivers/net/cpmac.c
@@ -40,7 +40,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 MODULE_AUTHOR("Eugene Konev ");
 MODULE_DESCRIPTION("TI AR7 ethernet driver (CPMAC)");
diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c
index 32636a1d62a..805076c54f1 100644
--- a/drivers/net/cxgb3/cxgb3_offload.c
+++ b/drivers/net/cxgb3/cxgb3_offload.c
@@ -34,7 +34,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/cxgb3/l2t.h b/drivers/net/cxgb3/l2t.h
index fd3eb07e3f4..7a12d52ed4f 100644
--- a/drivers/net/cxgb3/l2t.h
+++ b/drivers/net/cxgb3/l2t.h
@@ -34,7 +34,7 @@
 #include "t3cdev.h"
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 enum {
 	L2T_STATE_VALID, /* entry is up to date */
diff --git a/drivers/net/cxgb3/t3cdev.h b/drivers/net/cxgb3/t3cdev.h
index be55e9ae74d..705713b5663 100644
--- a/drivers/net/cxgb3/t3cdev.h
+++ b/drivers/net/cxgb3/t3cdev.h
@@ -33,7 +33,7 @@
 #define _T3CDEV_H_
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/cxgb4/cxgb4_uld.h b/drivers/net/cxgb4/cxgb4_uld.h
index 1b48c017014..b1d39b8d141 100644
--- a/drivers/net/cxgb4/cxgb4_uld.h
+++ b/drivers/net/cxgb4/cxgb4_uld.h
@@ -38,7 +38,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 /* CPL message priority levels */
 enum {
diff --git a/drivers/net/cxgb4/l2t.h b/drivers/net/cxgb4/l2t.h
index 7bd8f42378f..02b31d0c641 100644
--- a/drivers/net/cxgb4/l2t.h
+++ b/drivers/net/cxgb4/l2t.h
@@ -37,7 +37,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 struct adapter;
 struct l2t_data;
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 0d283781bc5..2a5a34d2d67 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -36,7 +36,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define SIXPACK_VERSION "Revision: 0.3.0"
diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c
index 52b14256e2c..ce555d9ac02 100644
--- a/drivers/net/hamradio/dmascc.c
+++ b/drivers/net/hamradio/dmascc.c
@@ -36,7 +36,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 838c5b67376..ba99af05bf6 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -43,7 +43,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a4759576075..3cbda0851f8 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -33,7 +33,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 4609bc0e2f5..10e5d985afa 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -48,7 +48,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h
index 5eacc653a94..c421a614185 100644
--- a/drivers/net/wimax/i2400m/i2400m.h
+++ b/drivers/net/wimax/i2400m/i2400m.h
@@ -155,7 +155,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/wireless/b43legacy/b43legacy.h b/drivers/net/wireless/b43legacy/b43legacy.h
index 17a130d18dc..a610a352102 100644
--- a/drivers/net/wireless/b43legacy/b43legacy.h
+++ b/drivers/net/wireless/b43legacy/b43legacy.h
@@ -8,7 +8,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/net/wireless/b43legacy/dma.h b/drivers/net/wireless/b43legacy/dma.h
index f89c3422628..686941c242f 100644
--- a/drivers/net/wireless/b43legacy/dma.h
+++ b/drivers/net/wireless/b43legacy/dma.h
@@ -5,7 +5,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "b43legacy.h"
diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h
index 0b54e46c3c1..38b6fc02898 100644
--- a/drivers/oprofile/oprofile_stats.h
+++ b/drivers/oprofile/oprofile_stats.h
@@ -10,7 +10,7 @@
 #ifndef OPROFILE_STATS_H
 #define OPROFILE_STATS_H
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 struct oprofile_stat_struct {
 	atomic_t sample_lost_no_mm;
diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c
index d703e73fffa..3fadf2f135e 100644
--- a/drivers/pci/hotplug/cpci_hotplug_core.c
+++ b/drivers/pci/hotplug/cpci_hotplug_core.c
@@ -32,7 +32,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "cpci_hotplug.h"
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 492b7d807fe..6fa215a3861 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -16,7 +16,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c
index 77f778b7b07..16c5208c3dc 100644
--- a/drivers/s390/block/dasd_eer.c
+++ b/drivers/s390/block/dasd_eer.c
@@ -21,7 +21,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "dasd_int.h"
diff --git a/drivers/s390/char/sclp_quiesce.c b/drivers/s390/char/sclp_quiesce.c
index 05909a7df8b..a90a02c28d6 100644
--- a/drivers/s390/char/sclp_quiesce.c
+++ b/drivers/s390/char/sclp_quiesce.c
@@ -13,7 +13,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c
index c837d7419a6..524d988d89d 100644
--- a/drivers/s390/char/vmlogrdr.c
+++ b/drivers/s390/char/vmlogrdr.c
@@ -21,7 +21,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h
index 7e297c7bb5f..0b7245c72d5 100644
--- a/drivers/s390/cio/device.h
+++ b/drivers/s390/cio/device.h
@@ -2,7 +2,7 @@
 #define S390_DEVICE_H
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "io_sch.h"
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 570d4da1069..e58169c3247 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -15,7 +15,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c
index 68be6e15712..2a1d4dfaf85 100644
--- a/drivers/s390/cio/qdio_thinint.c
+++ b/drivers/s390/cio/qdio_thinint.c
@@ -9,7 +9,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index f8134a44cef..b77ae519d79 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -41,7 +41,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 8e65447f76b..88ad33ed5d3 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -36,7 +36,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/s390/crypto/zcrypt_cex2a.c b/drivers/s390/crypto/zcrypt_cex2a.c
index 2176d00b395..da171b5f399 100644
--- a/drivers/s390/crypto/zcrypt_cex2a.c
+++ b/drivers/s390/crypto/zcrypt_cex2a.c
@@ -30,7 +30,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "ap_bus.h"
diff --git a/drivers/s390/crypto/zcrypt_mono.c b/drivers/s390/crypto/zcrypt_mono.c
index 44253fdd413..eb313c3fb2d 100644
--- a/drivers/s390/crypto/zcrypt_mono.c
+++ b/drivers/s390/crypto/zcrypt_mono.c
@@ -32,7 +32,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "ap_bus.h"
diff --git a/drivers/s390/crypto/zcrypt_pcica.c b/drivers/s390/crypto/zcrypt_pcica.c
index 1afb69c75fe..d84816f144d 100644
--- a/drivers/s390/crypto/zcrypt_pcica.c
+++ b/drivers/s390/crypto/zcrypt_pcica.c
@@ -30,7 +30,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "ap_bus.h"
diff --git a/drivers/s390/crypto/zcrypt_pcicc.c b/drivers/s390/crypto/zcrypt_pcicc.c
index aa4c050a569..bdbdbe19299 100644
--- a/drivers/s390/crypto/zcrypt_pcicc.c
+++ b/drivers/s390/crypto/zcrypt_pcicc.c
@@ -30,7 +30,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "ap_bus.h"
diff --git a/drivers/s390/crypto/zcrypt_pcixcc.c b/drivers/s390/crypto/zcrypt_pcixcc.c
index 4f85eb725f4..dd4737808e0 100644
--- a/drivers/s390/crypto/zcrypt_pcixcc.c
+++ b/drivers/s390/crypto/zcrypt_pcixcc.c
@@ -31,7 +31,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "ap_bus.h"
diff --git a/drivers/s390/net/fsm.h b/drivers/s390/net/fsm.h
index 1e8b235d95b..a4510cf5903 100644
--- a/drivers/s390/net/fsm.h
+++ b/drivers/s390/net/fsm.h
@@ -8,7 +8,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 /**
 * Define this to get debugging messages.
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index 2a4991d6d4d..7cac873c738 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -13,7 +13,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "zfcp_ext.h"
 #include "zfcp_dbf.h"
 #include "zfcp_fc.h"
diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c
index 740da446544..965a1fccd66 100644
--- a/drivers/sbus/char/display7seg.c
+++ b/drivers/sbus/char/display7seg.c
@@ -16,7 +16,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h
index 179ad77f6cc..bd9e31e1624 100644
--- a/drivers/scsi/dpt/dpti_i2o.h
+++ b/drivers/scsi/dpt/dpti_i2o.h
@@ -22,7 +22,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 /*
diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 6bba23a2630..c6f99b1d238 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -46,7 +46,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "hpsa_cmd.h"
 #include "hpsa.h"
diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h
index aa05e661d11..b97c8ab0c20 100644
--- a/drivers/scsi/pm8001/pm8001_sas.h
+++ b/drivers/scsi/pm8001/pm8001_sas.h
@@ -54,7 +54,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "pm8001_defs.h"
 #define DRV_NAME "pm8001"
diff --git a/drivers/staging/octeon/ethernet-rx.c b/drivers/staging/octeon/ethernet-rx.c
index 0f22f0f4744..1a7c19ae766 100644
--- a/drivers/staging/octeon/ethernet-rx.c
+++ b/drivers/staging/octeon/ethernet-rx.c
@@ -42,7 +42,7 @@
 #endif /* CONFIG_XFRM */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c
index 6227571149f..b445cd63f90 100644
--- a/drivers/staging/octeon/ethernet-tx.c
+++ b/drivers/staging/octeon/ethernet-tx.c
@@ -38,7 +38,7 @@
 #endif /* CONFIG_XFRM */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/staging/solo6x10/solo6x10.h b/drivers/staging/solo6x10/solo6x10.h
index fd59b093dd4..17c06bd6cc9 100644
--- a/drivers/staging/solo6x10/solo6x10.h
+++ b/drivers/staging/solo6x10/solo6x10.h
@@ -29,7 +29,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/staging/tidspbridge/include/dspbridge/host_os.h b/drivers/staging/tidspbridge/include/dspbridge/host_os.h
index 1a38896f433..a2f31c69d12 100644
--- a/drivers/staging/tidspbridge/include/dspbridge/host_os.h
+++ b/drivers/staging/tidspbridge/include/dspbridge/host_os.h
@@ -18,7 +18,7 @@
 #define _HOST_OS_H_
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/staging/winbond/mds_s.h b/drivers/staging/winbond/mds_s.h
index eeedf018636..07d835b3b70 100644
--- a/drivers/staging/winbond/mds_s.h
+++ b/drivers/staging/winbond/mds_s.h
@@ -3,7 +3,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "localpara.h"
 #include "mac_structures.h"
diff --git a/drivers/staging/winbond/wb35reg_s.h b/drivers/staging/winbond/wb35reg_s.h
index eb274ffdd1b..dc79faa4029 100644
--- a/drivers/staging/winbond/wb35reg_s.h
+++ b/drivers/staging/winbond/wb35reg_s.h
@@ -3,7 +3,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 struct hw_data;
diff --git a/drivers/tty/bfin_jtag_comm.c b/drivers/tty/bfin_jtag_comm.c
index 03c285bb2f1..3a997760ec3 100644
--- a/drivers/tty/bfin_jtag_comm.c
+++ b/drivers/tty/bfin_jtag_comm.c
@@ -25,7 +25,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define pr_init(fmt, args...) ({ static const __initconst char __fmt[] = fmt; printk(__fmt, ## args); })
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
index 13043e8d37f..6a1241c7f84 100644
--- a/drivers/tty/rocket.c
+++ b/drivers/tty/rocket.c
@@ -83,7 +83,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/tty/serial/dz.c b/drivers/tty/serial/dz.c
index 57421d77632..ddc487a2d42 100644
--- a/drivers/tty/serial/dz.c
+++ b/drivers/tty/serial/dz.c
@@ -48,7 +48,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c
index ea2340b814e..6bc2e3f876f 100644
--- a/drivers/tty/serial/sb1250-duart.c
+++ b/drivers/tty/serial/sb1250-duart.c
@@ -39,7 +39,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/tty/serial/zs.c b/drivers/tty/serial/zs.c
index 1a7fd3e7031..0aebd7121b5 100644
--- a/drivers/tty/serial/zs.c
+++ b/drivers/tty/serial/zs.c
@@ -65,7 +65,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/usb/gadget/f_audio.c b/drivers/usb/gadget/f_audio.c
index 02a02700b51..a9a4eade7e8 100644
--- a/drivers/usb/gadget/f_audio.c
+++ b/drivers/usb/gadget/f_audio.c
@@ -12,7 +12,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "u_audio.h"
diff --git a/drivers/usb/gadget/f_rndis.c b/drivers/usb/gadget/f_rndis.c
index 8f3eae90919..3ea4666be3d 100644
--- a/drivers/usb/gadget/f_rndis.c
+++ b/drivers/usb/gadget/f_rndis.c
@@ -29,7 +29,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "u_ether.h"
 #include "rndis.h"
diff --git a/drivers/usb/gadget/uvc_queue.c b/drivers/usb/gadget/uvc_queue.c
index f7395ac5dc1..aa0ad34e0f1 100644
--- a/drivers/usb/gadget/uvc_queue.c
+++ b/drivers/usb/gadget/uvc_queue.c
@@ -19,7 +19,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "uvc.h"
diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c
index a0037961e5b..27e209a7222 100644
--- a/drivers/usb/image/microtek.c
+++ b/drivers/usb/image/microtek.c
@@ -131,7 +131,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "../../scsi/scsi.h"
diff --git a/drivers/usb/misc/appledisplay.c b/drivers/usb/misc/appledisplay.c
index 68ab460a735..ac0d75a9005 100644
--- a/drivers/usb/misc/appledisplay.c
+++ b/drivers/usb/misc/appledisplay.c
@@ -29,7 +29,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #define APPLE_VENDOR_ID 0x05AC
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index b0a7a9e909a..1a49ca9c8ea 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -34,7 +34,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/usb/wusbcore/wa-rpipe.c b/drivers/usb/wusbcore/wa-rpipe.c
index ca80171f42c..2acc7f504c5 100644
--- a/drivers/usb/wusbcore/wa-rpipe.c
+++ b/drivers/usb/wusbcore/wa-rpipe.c
@@ -58,7 +58,7 @@
 * destination address.
 */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 14c9abf0d80..a801e2821d0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -11,7 +11,7 @@
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 /* This is for zerocopy, used buffer len is set to 1 when lower device DMA
 * done */
diff --git a/drivers/video/sh_mobile_lcdcfb.c b/drivers/video/sh_mobile_lcdcfb.c
index 019dbd3f12b..b048417247e 100644
--- a/drivers/video/sh_mobile_lcdcfb.c
+++ b/drivers/video/sh_mobile_lcdcfb.c
@@ -24,7 +24,7 @@