summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/module.c7
-rw-r--r--kernel/perf_event.c351
-rw-r--r--kernel/sched.c24
-rw-r--r--kernel/sched_fair.c22
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/trace_event_perf.c15
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_syscalls.c4
10 files changed, 263 insertions, 170 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8b92539b475..97d1b426a4a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,7 +34,7 @@ void cpu_maps_update_done(void)
mutex_unlock(&cpu_add_remove_lock);
}
-static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
+static RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
diff --git a/kernel/module.c b/kernel/module.c
index 333fbcc9697..0129769301e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -403,7 +403,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
const char *secstrings)
{
- return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+ return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
}
static void percpu_modcopy(struct module *mod,
@@ -2014,6 +2014,7 @@ static noinline struct module *load_module(void __user *umod,
long err = 0;
void *ptr = NULL; /* Stops spurious gcc warning */
unsigned long symoffs, stroffs, *strmap;
+ void __percpu *percpu;
mm_segment_t old_fs;
@@ -2158,6 +2159,8 @@ static noinline struct module *load_module(void __user *umod,
goto free_mod;
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
}
+ /* Keep this around for failure path. */
+ percpu = mod_percpu(mod);
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
@@ -2463,7 +2466,7 @@ static noinline struct module *load_module(void __user *umod,
module_free(mod, mod->module_core);
/* mod will be freed with core. Don't access it beyond this line! */
free_percpu:
- percpu_modfree(mod);
+ free_percpu(percpu);
free_mod:
kfree(args);
kfree(strmap);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index bd7ce8ca5bb..31d6afe9259 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_event *group_leader = event->group_leader;
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ event->attach_state |= PERF_ATTACH_CONTEXT;
/*
- * Depending on whether it is a standalone or sibling event,
- * add it straight to the context's event list, or to the group
- * leader's sibling list:
+ * If we're a stand alone event or group leader, we go to the context
+ * list, group events are kept attached to the group so that
+ * perf_group_detach can, at all times, locate all siblings.
*/
- if (group_leader == event) {
+ if (event->group_leader == event) {
struct list_head *list;
if (is_software_event(event))
@@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list = ctx_group_list(event, ctx);
list_add_tail(&event->group_entry, list);
- } else {
- if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
- !is_software_event(event))
- group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
-
- list_add_tail(&event->group_entry, &group_leader->sibling_list);
- group_leader->nr_siblings++;
}
list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_stat++;
}
+static void perf_group_attach(struct perf_event *event)
+{
+ struct perf_event *group_leader = event->group_leader;
+
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+ event->attach_state |= PERF_ATTACH_GROUP;
+
+ if (group_leader == event)
+ return;
+
+ if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+ !is_software_event(event))
+ group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
+ list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+}
+
/*
* Remove a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- if (list_empty(&event->group_entry))
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_CONTEXT))
return;
+
+ event->attach_state &= ~PERF_ATTACH_CONTEXT;
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
- list_del_init(&event->group_entry);
list_del_rcu(&event->event_entry);
- if (event->group_leader != event)
- event->group_leader->nr_siblings--;
+ if (event->group_leader == event)
+ list_del_init(&event->group_entry);
update_group_times(event);
@@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->state = PERF_EVENT_STATE_OFF;
}
-static void
-perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
+static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
+ struct list_head *list = NULL;
+
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_GROUP))
+ return;
+
+ event->attach_state &= ~PERF_ATTACH_GROUP;
+
+ /*
+ * If this is a sibling, remove it from its group.
+ */
+ if (event->group_leader != event) {
+ list_del_init(&event->group_entry);
+ event->group_leader->nr_siblings--;
+ return;
+ }
+
+ if (!list_empty(&event->group_entry))
+ list = &event->group_entry;
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
- * to the context list directly:
+ * to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
- struct list_head *list;
-
- list = ctx_group_list(event, ctx);
- list_move_tail(&sibling->group_entry, list);
+ if (list)
+ list_move_tail(&sibling->group_entry, list);
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
@@ -652,8 +687,11 @@ group_sched_in(struct perf_event *group_event,
if (txn)
pmu->start_txn(pmu);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, cpuctx, ctx)) {
+ if (txn)
+ pmu->cancel_txn(pmu);
return -EAGAIN;
+ }
/*
* Schedule in siblings as one group (if any):
@@ -675,9 +713,6 @@ group_sched_in(struct perf_event *group_event,
}
group_error:
- if (txn)
- pmu->cancel_txn(pmu);
-
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
@@ -689,6 +724,9 @@ group_error:
}
event_sched_out(group_event, cpuctx, ctx);
+ if (txn)
+ pmu->cancel_txn(pmu);
+
return -EAGAIN;
}
@@ -727,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
list_add_event(event, ctx);
+ perf_group_attach(event);
event->tstamp_enabled = ctx->time;
event->tstamp_running = ctx->time;
event->tstamp_stopped = ctx->time;
@@ -1841,6 +1880,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void perf_pending_sync(struct perf_event *event);
+static void perf_mmap_data_put(struct perf_mmap_data *data);
static void free_event(struct perf_event *event)
{
@@ -1856,9 +1896,9 @@ static void free_event(struct perf_event *event)
atomic_dec(&nr_task_events);
}
- if (event->output) {
- fput(event->output->filp);
- event->output = NULL;
+ if (event->data) {
+ perf_mmap_data_put(event->data);
+ event->data = NULL;
}
if (event->destroy)
@@ -1893,8 +1933,8 @@ int perf_event_release_kernel(struct perf_event *event)
*/
mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
list_del_event(event, ctx);
- perf_destroy_group(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
mutex_unlock(&ctx->mutex);
@@ -2175,7 +2215,27 @@ unlock:
return ret;
}
-static int perf_event_set_output(struct perf_event *event, int output_fd);
+static const struct file_operations perf_fops;
+
+static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+{
+ struct file *file;
+
+ file = fget_light(fd, fput_needed);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &perf_fops) {
+ fput_light(file, *fput_needed);
+ *fput_needed = 0;
+ return ERR_PTR(-EBADF);
+ }
+
+ return file->private_data;
+}
+
+static int perf_event_set_output(struct perf_event *event,
+ struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2202,7 +2262,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return perf_event_period(event, (u64 __user *)arg);
case PERF_EVENT_IOC_SET_OUTPUT:
- return perf_event_set_output(event, arg);
+ {
+ struct perf_event *output_event = NULL;
+ int fput_needed = 0;
+ int ret;
+
+ if (arg != -1) {
+ output_event = perf_fget_light(arg, &fput_needed);
+ if (IS_ERR(output_event))
+ return PTR_ERR(output_event);
+ }
+
+ ret = perf_event_set_output(event, output_event);
+ if (output_event)
+ fput_light(output_event->filp, fput_needed);
+
+ return ret;
+ }
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
@@ -2335,8 +2411,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
unsigned long size;
int i;
- WARN_ON(atomic_read(&event->mmap_count));
-
size = sizeof(struct perf_mmap_data);
size += nr_pages * sizeof(void *);
@@ -2452,8 +2526,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
unsigned long size;
void *all_buf;
- WARN_ON(atomic_read(&event->mmap_count));
-
size = sizeof(struct perf_mmap_data);
size += sizeof(void *);
@@ -2536,7 +2608,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
if (!data->watermark)
data->watermark = max_size / 2;
-
+ atomic_set(&data->refcount, 1);
rcu_assign_pointer(event->data, data);
}
@@ -2548,13 +2620,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
perf_mmap_data_free(data);
}
-static void perf_mmap_data_release(struct perf_event *event)
+static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
{
- struct perf_mmap_data *data = event->data;
+ struct perf_mmap_data *data;
- WARN_ON(atomic_read(&event->mmap_count));
+ rcu_read_lock();
+ data = rcu_dereference(event->data);
+ if (data) {
+ if (!atomic_inc_not_zero(&data->refcount))
+ data = NULL;
+ }
+ rcu_read_unlock();
+
+ return data;
+}
+
+static void perf_mmap_data_put(struct perf_mmap_data *data)
+{
+ if (!atomic_dec_and_test(&data->refcount))
+ return;
- rcu_assign_pointer(event->data, NULL);
call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
}
@@ -2569,15 +2654,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- WARN_ON_ONCE(event->ctx->parent_ctx);
if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
unsigned long size = perf_data_size(event->data);
- struct user_struct *user = current_user();
+ struct user_struct *user = event->mmap_user;
+ struct perf_mmap_data *data = event->data;
atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->locked_vm -= event->data->nr_locked;
- perf_mmap_data_release(event);
+ vma->vm_mm->locked_vm -= event->mmap_locked;
+ rcu_assign_pointer(event->data, NULL);
mutex_unlock(&event->mmap_mutex);
+
+ perf_mmap_data_put(data);
+ free_uid(user);
}
}
@@ -2629,13 +2717,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->mmap_mutex);
- if (event->output) {
- ret = -EINVAL;
- goto unlock;
- }
-
- if (atomic_inc_not_zero(&event->mmap_count)) {
- if (nr_pages != event->data->nr_pages)
+ if (event->data) {
+ if (event->data->nr_pages == nr_pages)
+ atomic_inc(&event->data->refcount);
+ else
ret = -EINVAL;
goto unlock;
}
@@ -2667,21 +2752,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
WARN_ON(event->data);
data = perf_mmap_data_alloc(event, nr_pages);
- ret = -ENOMEM;
- if (!data)
+ if (!data) {
+ ret = -ENOMEM;
goto unlock;
+ }
- ret = 0;
perf_mmap_data_init(event, data);
-
- atomic_set(&event->mmap_count, 1);
- atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->locked_vm += extra;
- event->data->nr_locked = extra;
if (vma->vm_flags & VM_WRITE)
event->data->writable = 1;
+ atomic_long_add(user_extra, &user->locked_vm);
+ event->mmap_locked = extra;
+ event->mmap_user = get_current_user();
+ vma->vm_mm->locked_vm += event->mmap_locked;
+
unlock:
+ if (!ret)
+ atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
vma->vm_flags |= VM_RESERVED;
@@ -2977,6 +3064,7 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
len -= size;
handle->addr += size;
+ buf += size;
handle->size -= size;
if (!handle->size) {
struct perf_mmap_data *data = handle->data;
@@ -2993,7 +3081,6 @@ int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
int nmi, int sample)
{
- struct perf_event *output_event;
struct perf_mmap_data *data;
unsigned long tail, offset, head;
int have_lost;
@@ -3010,10 +3097,6 @@ int perf_output_begin(struct perf_output_handle *handle,
if (event->parent)
event = event->parent;
- output_event = rcu_dereference(event->output);
- if (output_event)
- event = output_event;
-
data = rcu_dereference(event->data);
if (!data)
goto out;
@@ -3972,13 +4055,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
}
}
-static void perf_swevent_unthrottle(struct perf_event *event)
-{
- /*
- * Nothing to do, we already reset hwc->interrupts.
- */
-}
-
static void perf_swevent_add(struct perf_event *event, u64 nr,
int nmi, struct perf_sample_data *data,
struct pt_regs *regs)
@@ -4193,11 +4269,22 @@ static void perf_swevent_disable(struct perf_event *event)
hlist_del_rcu(&event->hlist_entry);
}
+static void perf_swevent_void(struct perf_event *event)
+{
+}
+
+static int perf_swevent_int(struct perf_event *event)
+{
+ return 0;
+}
+
static const struct pmu perf_ops_generic = {
.enable = perf_swevent_enable,
.disable = perf_swevent_disable,
+ .start = perf_swevent_int,
+ .stop = perf_swevent_void,
.read = perf_swevent_read,
- .unthrottle = perf_swevent_unthrottle,
+ .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
};
/*
@@ -4478,8 +4565,10 @@ static int swevent_hlist_get(struct perf_event *event)
static const struct pmu perf_ops_tracepoint = {
.enable = perf_trace_enable,
.disable = perf_trace_disable,
+ .start = perf_swevent_int,
+ .stop = perf_swevent_void,
.read = perf_swevent_read,
- .unthrottle = perf_swevent_unthrottle,
+ .unthrottle = perf_swevent_void,
};
static int perf_tp_filter_match(struct perf_event *event,
@@ -4912,39 +5001,17 @@ err_size:
goto out;
}
-static int perf_event_set_output(struct perf_event *event, int output_fd)
+static int
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct perf_event *output_event = NULL;
- struct file *output_file = NULL;
- struct perf_event *old_output;
- int fput_needed = 0;
+ struct perf_mmap_data *data = NULL, *old_data = NULL;
int ret = -EINVAL;
- /*
- * Don't allow output of inherited per-task events. This would
- * create performance issues due to cross cpu access.
- */
- if (event->cpu == -1 && event->attr.inherit)
- return -EINVAL;
-
- if (!output_fd)
+ if (!output_event)
goto set;
- output_file = fget_light(output_fd, &fput_needed);
- if (!output_file)
- return -EBADF;
-
- if (output_file->f_op != &perf_fops)
- goto out;
-
- output_event = output_file->private_data;
-
- /* Don't chain output fds */
- if (output_event->output)
- goto out;
-
- /* Don't set an output fd when we already have an output channel */
- if (event->data)
+ /* don't allow circular references */
+ if (event == output_event)
goto out;
/*
@@ -4959,26 +5026,28 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
- atomic_long_inc(&output_file->f_count);
-
set:
mutex_lock(&event->mmap_mutex);
- old_output = event->output;
- rcu_assign_pointer(event->output, output_event);
- mutex_unlock(&event->mmap_mutex);
+ /* Can't redirect output if we've got an active mmap() */
+ if (atomic_read(&event->mmap_count))
+ goto unlock;
- if (old_output) {
- /*
- * we need to make sure no existing perf_output_*()
- * is still referencing this event.
- */
- synchronize_rcu();
- fput(old_output->filp);
+ if (output_event) {
+ /* get the buffer we want to redirect to */
+ data = perf_mmap_data_get(output_event);
+ if (!data)
+ goto unlock;
}
+ old_data = event->data;
+ rcu_assign_pointer(event->data, data);
ret = 0;
+unlock:
+ mutex_unlock(&event->mmap_mutex);
+
+ if (old_data)
+ perf_mmap_data_put(old_data);
out:
- fput_light(output_file, fput_needed);
return ret;
}
@@ -4994,7 +5063,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
- struct perf_event *event, *group_leader;
+ struct perf_event *event, *group_leader = NULL, *output_event = NULL;
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
@@ -5034,19 +5103,25 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_fd;
}
+ if (group_fd != -1) {
+ group_leader = perf_fget_light(group_fd, &fput_needed);
+ if (IS_ERR(group_leader)) {
+ err = PTR_ERR(group_leader);
+ goto err_put_context;
+ }
+ group_file = group_leader->filp;
+ if (flags & PERF_FLAG_FD_OUTPUT)
+ output_event = group_leader;
+ if (flags & PERF_FLAG_FD_NO_GROUP)
+ group_leader = NULL;
+ }
+
/*
* Look up the group leader (we will attach this event to it):
*/
- group_leader = NULL;
- if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+ if (group_leader) {
err = -EINVAL;
- group_file = fget_light(group_fd, &fput_needed);
- if (!group_file)
- goto err_put_context;
- if (group_file->f_op != &perf_fops)
- goto err_put_context;
- group_leader = group_file->private_data;
/*
* Do not allow a recursive hierarchy (this new sibling
* becoming part of another group-sibling):
@@ -5068,9 +5143,16 @@ SYSCALL_DEFINE5(perf_event_open,
event = perf_event_alloc(&attr, cpu, ctx, group_leader,
NULL, NULL, GFP_KERNEL);
- err = PTR_ERR(event);
- if (IS_ERR(event))
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
goto err_put_context;
+ }
+
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_free_put_context;
+ }
event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
if (IS_ERR(event_file)) {
@@ -5078,12 +5160,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_free_put_context;
}
- if (flags & PERF_FLAG_FD_OUTPUT) {
- err = perf_event_set_output(event, group_fd);
- if (err)
- goto err_fput_free_put_context;
- }
-
event->filp = event_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
@@ -5097,12 +5173,16 @@ SYSCALL_DEFINE5(perf_event_open,
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
+ /*
+ * Drop the reference on the group_event after placing the
+ * new event on the sibling_list. This ensures destruction
+ * of the group leader will find the pointer to itself in
+ * perf_group_detach().
+ */
fput_light(group_file, fput_needed);
fd_install(event_fd, event_file);
return event_fd;
-err_fput_free_put_context:
- fput(event_file);
err_free_put_context:
free_event(event);
err_put_context:
@@ -5420,6 +5500,7 @@ static void perf_free_event(struct perf_event *event,
fput(parent->filp);
+ perf_group_detach(event);
list_del_event(event, ctx);
free_event(event);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index d4840814250..f8b8996228d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -544,6 +544,8 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
+ unsigned long cpu_power;
+
unsigned char idle_at_tick;
/* For active balancing */
int post_schedule;
@@ -1499,24 +1501,9 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static struct sched_group *group_of(int cpu)
-{
- struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
-
- if (!sd)
- return NULL;
-
- return sd->groups;
-}
-
static unsigned long power_of(int cpu)
{
- struct sched_group *group = group_of(cpu);
-
- if (!group)
- return SCHED_LOAD_SCALE;
-
- return group->cpu_power;
+ return cpu_rq(cpu)->cpu_power;
}
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1854,8 +1841,8 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
if (task_has_rt_policy(p)) {
- p->se.load.weight = prio_to_weight[0] * 2;
- p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+ p->se.load.weight = 0;
+ p->se.load.inv_weight = WMULT_CONST;
return;
}
@@ -7605,6 +7592,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
+ rq->cpu_power = SCHED_LOAD_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 217e4a9393e..eed35eded60 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
- unsigned int imbalance;
struct task_group *tg;
unsigned long weight;
int balanced;
@@ -1252,8 +1251,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
tg = task_group(p);
weight = p->se.load.weight;
- imbalance = 100 + (sd->imbalance_pct - 100) / 2;
-
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped this_load to 0, we'll
@@ -1263,9 +1260,21 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- balanced = !this_load ||
- 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
- imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
+ if (this_load) {
+ unsigned long this_eff_load, prev_eff_load;
+
+ this_eff_load = 100;
+ this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= this_load +
+ effective_load(tg, this_cpu, weight, weight);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+
+ balanced = this_eff_load <= prev_eff_load;
+ } else
+ balanced = true;
/*
* If the currently running task will sleep within
@@ -2298,6 +2307,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
if (!power)
power = 1;
+ cpu_rq(cpu)->cpu_power = power;
sdg->cpu_power = power;
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4e7431e7c7..70f8d90331e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
- case CPU_DEAD:
+ case CPU_POST_DEAD:
{
struct cpu_stop_work *work;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 36ea2b65dcd..638711c1750 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -842,6 +842,7 @@ static void blk_add_trace_split(void *ignore,
/**
* blk_add_trace_remap - Add a trace for a remap operation
+ * @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @bio: the source bio
* @dev: target device
@@ -873,6 +874,7 @@ static void blk_add_trace_remap(void *ignore,
/**
* blk_add_trace_rq_remap - Add a trace for a request-remap operation
+ * @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @rq: the source request
* @dev: target device
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cb6f365016e..e6f65887842 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -116,7 +116,7 @@ int perf_trace_enable(struct perf_event *p_event)
if (WARN_ON_ONCE(!list))
return -EINVAL;
- list = per_cpu_ptr(list, smp_processor_id());
+ list = this_cpu_ptr(list);
hlist_add_head_rcu(&p_event->hlist_entry, list);
return 0;
@@ -132,8 +132,9 @@ void perf_trace_destroy(struct perf_event *p_event)
struct ftrace_event_call *tp_event = p_event->tp_event;
int i;
+ mutex_lock(&event_mutex);
if (--tp_event->perf_refcount > 0)
- return;
+ goto out;
if (tp_event->class->reg)
tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
@@ -142,6 +143,12 @@ void perf_trace_destroy(struct perf_event *p_event)
tp_event->class->perf_probe,
tp_event);
+ /*
+ * Ensure our callback won't be called anymore. See
+ * tracepoint_probe_unregister() and __DO_TRACE().
+ */
+ synchronize_sched();
+
free_percpu(tp_event->perf_events);
tp_event->perf_events = NULL;
@@ -151,6 +158,8 @@ void perf_trace_destroy(struct perf_event *p_event)
perf_trace_buf[i] = NULL;
}
}
+out:
+ mutex_unlock(&event_mutex);
}
__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
@@ -169,7 +178,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
if (*rctxp < 0)
return NULL;
- raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id());
+ raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
/* zero the dead bytes from align to not leak stack to user */
memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index faf7cefd15d..f52b5f50299 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1359,7 +1359,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
for (i = 0; i < tp->nr_args; i++)
call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
- head = per_cpu_ptr(call->perf_events, smp_processor_id());
+ head = this_cpu_ptr(call->perf_events);
perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
}
@@ -1392,7 +1392,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
for (i = 0; i < tp->nr_args; i++)
call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
- head = per_cpu_ptr(call->perf_events, smp_processor_id());
+ head = this_cpu_ptr(call->perf_events);
perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d2c859cec9e..34e35804304 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -519,7 +519,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id());
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}
@@ -595,7 +595,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id());
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}