From 3d9a854c2dac3e888393b23ba7adafcce4d6d4b9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 20 Feb 2010 01:03:43 +0100 Subject: Rename .data[.percpu][.XXX] to .data[..percpu][..XXX]. Signed-off-by: Denys Vlasenko Signed-off-by: Michal Marek --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f82386bd9ee..5daf0abd63c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -397,7 +397,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) { - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); + return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); } static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -- cgit v1.2.3-70-g09d2 From 54e88fad223c4e1d94289611a90c7fe3ebe5631b Mon Sep 17 00:00:00 2001 From: "Amit K. Arora" Date: Tue, 25 May 2010 18:53:46 +0530 Subject: sched: Make sure timers have migrated before killing the migration_thread Problem: In a stress test where some heavy tests were running along with regular CPU offlining and onlining, a hang was observed. The system seems to be hung at a point where migration_call() tries to kill the migration_thread of the dying CPU, which just got moved to the current CPU. This migration thread does not get a chance to run (and die) since rt_throttled is set to 1 on current, and it doesn't get cleared as the hrtimer which is supposed to reset the rt bandwidth (sched_rt_period_timer) is tied to the CPU which we just marked dead! Solution: This patch pushes the killing of migration thread to "CPU_POST_DEAD" event. By then all the timers (including sched_rt_period_timer) should have got migrated (along with other callbacks). Signed-off-by: Amit Arora Signed-off-by: Gautham R Shenoy Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Cc: Thomas Gleixner LKML-Reference: <20100525132346.GA14986@amitarora.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4e7431e7c7..70f8d90331e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: - case CPU_DEAD: + case CPU_POST_DEAD: { struct cpu_stop_work *work; -- cgit v1.2.3-70-g09d2 From ac9721f3f54b27a16c7e1afb2481e7ee95a70318 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 May 2010 12:54:41 +0200 Subject: perf_events: Fix races and clean up perf_event and perf_mmap_data interaction In order to move toward separate buffer objects, rework the whole perf_mmap_data construct to be a more self-sufficient entity, one with its own lifetime rules. This greatly sanitizes the whole output redirection code, which was riddled with bugs and races. 
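In short, the patch turns the mmap buffer into a refcounted object with RCU-protected lookup: every user takes a reference, and the last put frees the buffer after a grace period. A condensed form of the get/put pair this patch adds (the full version appears in the diff below):

static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
{
        struct perf_mmap_data *data;

        rcu_read_lock();
        data = rcu_dereference(event->data);
        if (data && !atomic_inc_not_zero(&data->refcount))
                data = NULL;            /* buffer is already being torn down */
        rcu_read_unlock();

        return data;
}

static void perf_mmap_data_put(struct perf_mmap_data *data)
{
        if (atomic_dec_and_test(&data->refcount))
                call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
}
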
Signed-off-by: Peter Zijlstra Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 +- kernel/perf_event.c | 224 +++++++++++++++++++++++++-------------------- 2 files changed, 129 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fb6c91eac7e..490698590d6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -585,6 +585,7 @@ enum perf_event_active_state { struct file; struct perf_mmap_data { + atomic_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; @@ -592,7 +593,6 @@ struct perf_mmap_data { #endif int nr_pages; /* nr of data pages */ int writable; /* are we writable */ - int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ @@ -643,7 +643,6 @@ struct perf_event { int nr_siblings; int group_flags; struct perf_event *group_leader; - struct perf_event *output; const struct pmu *pmu; enum perf_event_active_state state; @@ -704,6 +703,8 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; + int mmap_locked; + struct user_struct *mmap_user; struct perf_mmap_data *data; /* poll related */ diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bd7ce8ca5bb..848d49a043e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1841,6 +1841,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); +static void perf_mmap_data_put(struct perf_mmap_data *data); static void free_event(struct perf_event *event) { @@ -1856,9 +1857,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->output) { - fput(event->output->filp); - event->output = NULL; + if (event->data) { + perf_mmap_data_put(event->data); + event->data = NULL; } if (event->destroy) @@ -2175,7 +2176,27 @@ unlock: return ret; } -static int perf_event_set_output(struct perf_event *event, int output_fd); +static const struct file_operations perf_fops; + +static struct perf_event *perf_fget_light(int fd, int *fput_needed) +{ + struct file *file; + + file = fget_light(fd, fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &perf_fops) { + fput_light(file, *fput_needed); + *fput_needed = 0; + return ERR_PTR(-EBADF); + } + + return file->private_data; +} + +static int perf_event_set_output(struct perf_event *event, + struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -2202,7 +2223,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return perf_event_period(event, (u64 __user *)arg); case PERF_EVENT_IOC_SET_OUTPUT: - return perf_event_set_output(event, arg); + { + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { + output_event = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_event)) + return PTR_ERR(output_event); + } + + ret = perf_event_set_output(event, output_event); + if (output_event) + fput_light(output_event->filp, fput_needed); + + return ret; + } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); @@ -2335,8 +2372,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) unsigned long size; int i; - WARN_ON(atomic_read(&event->mmap_count)); - size = sizeof(struct perf_mmap_data); size += nr_pages * sizeof(void *); @@ 
-2452,8 +2487,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) unsigned long size; void *all_buf; - WARN_ON(atomic_read(&event->mmap_count)); - size = sizeof(struct perf_mmap_data); size += sizeof(void *); @@ -2536,7 +2569,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) if (!data->watermark) data->watermark = max_size / 2; - + atomic_set(&data->refcount, 1); rcu_assign_pointer(event->data, data); } @@ -2548,13 +2581,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) perf_mmap_data_free(data); } -static void perf_mmap_data_release(struct perf_event *event) +static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) { - struct perf_mmap_data *data = event->data; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) { + if (!atomic_inc_not_zero(&data->refcount)) + data = NULL; + } + rcu_read_unlock(); + + return data; +} - WARN_ON(atomic_read(&event->mmap_count)); +static void perf_mmap_data_put(struct perf_mmap_data *data) +{ + if (!atomic_dec_and_test(&data->refcount)) + return; - rcu_assign_pointer(event->data, NULL); call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } @@ -2569,15 +2615,18 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { unsigned long size = perf_data_size(event->data); - struct user_struct *user = current_user(); + struct user_struct *user = event->mmap_user; + struct perf_mmap_data *data = event->data; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_release(event); + vma->vm_mm->locked_vm -= event->mmap_locked; + rcu_assign_pointer(event->data, NULL); mutex_unlock(&event->mmap_mutex); + + perf_mmap_data_put(data); + free_uid(user); } } @@ -2629,13 +2678,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&event->mmap_count)) { - if (nr_pages != event->data->nr_pages) + if (event->data) { + if (event->data->nr_pages == nr_pages) + atomic_inc(&event->data->refcount); + else ret = -EINVAL; goto unlock; } @@ -2667,21 +2713,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON(event->data); data = perf_mmap_data_alloc(event, nr_pages); - ret = -ENOMEM; - if (!data) + if (!data) { + ret = -ENOMEM; goto unlock; + } - ret = 0; perf_mmap_data_init(event, data); - - atomic_set(&event->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - event->data->nr_locked = extra; if (vma->vm_flags & VM_WRITE) event->data->writable = 1; + atomic_long_add(user_extra, &user->locked_vm); + event->mmap_locked = extra; + event->mmap_user = get_current_user(); + vma->vm_mm->locked_vm += event->mmap_locked; + unlock: + if (!ret) + atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); vma->vm_flags |= VM_RESERVED; @@ -2993,7 +3041,6 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_event *output_event; struct perf_mmap_data *data; unsigned long tail, offset, head; int have_lost; @@ -3010,10 +3057,6 @@ int perf_output_begin(struct perf_output_handle *handle, if 
(event->parent) event = event->parent; - output_event = rcu_dereference(event->output); - if (output_event) - event = output_event; - data = rcu_dereference(event->data); if (!data) goto out; @@ -4912,39 +4955,17 @@ err_size: goto out; } -static int perf_event_set_output(struct perf_event *event, int output_fd) +static int +perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_event *output_event = NULL; - struct file *output_file = NULL; - struct perf_event *old_output; - int fput_needed = 0; + struct perf_mmap_data *data = NULL, *old_data = NULL; int ret = -EINVAL; - /* - * Don't allow output of inherited per-task events. This would - * create performance issues due to cross cpu access. - */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; - - if (!output_fd) + if (!output_event) goto set; - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) - goto out; - - output_event = output_file->private_data; - - /* Don't chain output fds */ - if (output_event->output) - goto out; - - /* Don't set an output fd when we already have an output channel */ - if (event->data) + /* don't allow circular references */ + if (event == output_event) goto out; /* @@ -4959,26 +4980,28 @@ static int perf_event_set_output(struct perf_event *event, int output_fd) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; - atomic_long_inc(&output_file->f_count); - set: mutex_lock(&event->mmap_mutex); - old_output = event->output; - rcu_assign_pointer(event->output, output_event); - mutex_unlock(&event->mmap_mutex); + /* Can't redirect output if we've got an active mmap() */ + if (atomic_read(&event->mmap_count)) + goto unlock; - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this event. 
- */ - synchronize_rcu(); - fput(old_output->filp); + if (output_event) { + /* get the buffer we want to redirect to */ + data = perf_mmap_data_get(output_event); + if (!data) + goto unlock; } + old_data = event->data; + rcu_assign_pointer(event->data, data); ret = 0; +unlock: + mutex_unlock(&event->mmap_mutex); + + if (old_data) + perf_mmap_data_put(old_data); out: - fput_light(output_file, fput_needed); return ret; } @@ -4994,7 +5017,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader; + struct perf_event *event, *group_leader = NULL, *output_event = NULL; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; @@ -5034,19 +5057,25 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } + if (group_fd != -1) { + group_leader = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_leader)) { + err = PTR_ERR(group_leader); + goto err_put_context; + } + group_file = group_leader->filp; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) + group_leader = NULL; + } + /* * Look up the group leader (we will attach this event to it): */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + if (group_leader) { err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - group_leader = group_file->private_data; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): @@ -5068,9 +5097,16 @@ SYSCALL_DEFINE5(perf_event_open, event = perf_event_alloc(&attr, cpu, ctx, group_leader, NULL, NULL, GFP_KERNEL); - err = PTR_ERR(event); - if (IS_ERR(event)) + if (IS_ERR(event)) { + err = PTR_ERR(event); goto err_put_context; + } + + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_free_put_context; + } event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); if (IS_ERR(event_file)) { @@ -5078,12 +5114,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_free_put_context; } - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_event_set_output(event, group_fd); - if (err) - goto err_fput_free_put_context; - } - event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -5101,8 +5131,6 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; -err_fput_free_put_context: - fput(event_file); err_free_put_context: free_event(event); err_put_context: -- cgit v1.2.3-70-g09d2 From 8a49542c0554af7d0073aac0ee73ee65b807ef34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 May 2010 15:47:49 +0200 Subject: perf_events: Fix races in group composition Group siblings don't pin each-other or the parent, so when we destroy events we must make sure to clean up all cross referencing pointers. In particular, for destruction of a group leader we must be able to find all its siblings and remove their reference to it. This means that detaching an event from its context must not detach it from the group, otherwise we can end up failing to clear all pointers. Solve this by clearly separating the attachment to a context and attachment to a group, and keep the group composed until we destroy the events. 
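The key idea: attachment to the context and attachment to the group become two independently tracked states, and teardown always drops the group linkage before the context linkage. Roughly, using the flags defined in the patch:

#define PERF_ATTACH_CONTEXT     0x01    /* linked into ctx->event_list */
#define PERF_ATTACH_GROUP       0x02    /* linked into its group's sibling list */

        /* destruction path after this patch: group first, then context */
        perf_group_detach(event);       /* unlink from the leader, upgrade any siblings */
        list_del_event(event, ctx);
        free_event(event);
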
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++ kernel/perf_event.c | 91 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 490698590d6..5d0266d9498 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -631,6 +631,9 @@ struct swevent_hlist { struct rcu_head rcu_head; }; +#define PERF_ATTACH_CONTEXT 0x01 +#define PERF_ATTACH_GROUP 0x02 + /** * struct perf_event - performance event kernel representation: */ @@ -646,6 +649,7 @@ struct perf_event { const struct pmu *pmu; enum perf_event_active_state state; + unsigned int attach_state; atomic64_t count; /* diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 848d49a043e..10a1aee2309 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *group_leader = event->group_leader; + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; /* - * Depending on whether it is a standalone or sibling event, - * add it straight to the context's event list, or to the group - * leader's sibling list: + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. */ - if (group_leader == event) { + if (event->group_leader == event) { struct list_head *list; if (is_software_event(event)) @@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); - } else { - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; - - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; } list_add_rcu(&event->event_entry, &ctx->event_list); @@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader; + + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; +} + /* * Remove a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - if (list_empty(&event->group_entry)) + /* + * We can have double detach due to exit/hot-unplug + close. 
+ */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; - list_del_init(&event->group_entry); list_del_rcu(&event->event_entry); - if (event->group_leader != event) - event->group_leader->nr_siblings--; + if (event->group_leader == event) + list_del_init(&event->group_entry); update_group_times(event); @@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->state = PERF_EVENT_STATE_OFF; } -static void -perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + return; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them - * to the context list directly: + * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - struct list_head *list; - - list = ctx_group_list(event, ctx); - list_move_tail(&sibling->group_entry, list); + if (list) + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ @@ -727,6 +762,7 @@ static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); + perf_group_attach(event); event->tstamp_enabled = ctx->time; event->tstamp_running = ctx->time; event->tstamp_stopped = ctx->time; @@ -1894,8 +1930,8 @@ int perf_event_release_kernel(struct perf_event *event) */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); raw_spin_lock_irq(&ctx->lock); + perf_group_detach(event); list_del_event(event, ctx); - perf_destroy_group(event, ctx); raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); @@ -5127,6 +5163,12 @@ SYSCALL_DEFINE5(perf_event_open, list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ fput_light(group_file, fput_needed); fd_install(event_fd, event_file); return event_fd; @@ -5448,6 +5490,7 @@ static void perf_free_event(struct perf_event *event, fput(parent->filp); + perf_group_detach(event); list_del_event(event, ctx); free_event(event); } -- cgit v1.2.3-70-g09d2 From 3771f0771154675d4a0ca780be2411f3cc357208 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 12:31:09 +0200 Subject: perf_events, trace: Fix probe unregister race tracepoint_probe_unregister() does not synchronize against the probe callbacks, so do that explicitly. This properly serializes the callbacks and the free of the data used therein. Also, use this_cpu_ptr() where possible. 
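The essence of the fix is to wait out any in-flight probe callbacks before freeing the per-cpu data they dereference, roughly:

        /* unregister the probe (tracepoint_probe_unregister() or the class ->reg() hook),
         * then wait for callers still inside __DO_TRACE() to finish */
        synchronize_sched();
        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;
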
Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1274438476.1674.1702.camel@laptop> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 2 +- kernel/trace/trace_event_perf.c | 10 ++++++++-- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3d685d1f2a0..5a64905d727 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -725,7 +725,7 @@ perf_trace_##call(void *__data, proto) \ \ { assign; } \ \ - head = per_cpu_ptr(event_call->perf_events, smp_processor_id());\ + head = this_cpu_ptr(event_call->perf_events); \ perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ __count, &__regs, head); \ } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cb6f365016e..49c7abf2ba5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -116,7 +116,7 @@ int perf_trace_enable(struct perf_event *p_event) if (WARN_ON_ONCE(!list)) return -EINVAL; - list = per_cpu_ptr(list, smp_processor_id()); + list = this_cpu_ptr(list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; @@ -142,6 +142,12 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->class->perf_probe, tp_event); + /* + * Ensure our callback won't be called anymore. See + * tracepoint_probe_unregister() and __DO_TRACE(). + */ + synchronize_sched(); + free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; @@ -169,7 +175,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, if (*rctxp < 0) return NULL; - raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id()); + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index faf7cefd15d..f52b5f50299 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1359,7 +1359,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - head = per_cpu_ptr(call->perf_events, smp_processor_id()); + head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); } @@ -1392,7 +1392,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - head = per_cpu_ptr(call->perf_events, smp_processor_id()); + head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index d2c859cec9e..34e35804304 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -519,7 +519,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id()); + head = this_cpu_ptr(sys_data->enter_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } @@ -595,7 +595,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = 
syscall_get_return_value(current, regs); - head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id()); + head = this_cpu_ptr(sys_data->exit_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } -- cgit v1.2.3-70-g09d2 From 2e97942fe57864588774f173cf4cd7bb68968b76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 16:22:33 +0200 Subject: perf_events, trace: Fix perf_trace_destroy(), mutex went missing Steve spotted I forgot to do the destroy under event_mutex. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <1274451913.1674.1707.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 49c7abf2ba5..e6f65887842 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -132,8 +132,9 @@ void perf_trace_destroy(struct perf_event *p_event) struct ftrace_event_call *tp_event = p_event->tp_event; int i; + mutex_lock(&event_mutex); if (--tp_event->perf_refcount > 0) - return; + goto out; if (tp_event->class->reg) tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); @@ -157,6 +158,8 @@ void perf_trace_destroy(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } +out: + mutex_unlock(&event_mutex); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, -- cgit v1.2.3-70-g09d2 From 90151c35b19633e0cab5a6c80f1ba4a51e7c913b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 25 May 2010 16:23:10 +0200 Subject: perf_events: Fix event scheduling issues introduced by transactional API The transactional API patch between the generic and model-specific code introduced several important bugs with event scheduling, at least on X86. If you had pinned events, e.g., watchdog, and were over-committing the PMU, you would get bogus counts. The bug was showing up on Intel CPU because events would move around more often that on AMD. But the problem also existed on AMD, though harder to expose. The issues were: - group_sched_in() was missing a cancel_txn() in the error path - cpuc->n_added was not properly maintained, leading to missing actions in hw_perf_enable(), i.e., n_running being 0. You cannot update n_added until you know the transaction has succeeded. In case of failed transaction n_added was not adjusted back. - in case of failed transactions, event_sched_out() was called and eventually invoked x86_disable_event() to touch the HW reg. But with transactions, on X86, event_sched_in() does not touch HW registers, it simply collects events into a list. Thus, you could end up calling x86_disable_event() on a counter which did not correspond to the current event when idx != -1. The patch modifies the generic and X86 code to avoid all those problems. First, we keep track of the number of events added last. In case the transaction fails, we substract them from n_added. This approach is necessary (as opposed to delaying updates to n_added) because not all event updates use the transaction API, e.g., single events. Second, we encapsulate the event_sched_in() and event_sched_out() in group_sched_in() inside the transaction. That makes the operations symmetrical and you can also detect that you are inside a transaction and skip the HW reg access by checking cpuc->group_flag. With this patch, you can now overcommit the PMU even with pinned system-wide events present and still get valid counts. 
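On the generic side the point is to keep the whole group, including the leader's undo path, inside one transaction so that ->cancel_txn() always sees a consistent state. The resulting shape of group_sched_in(), condensed from the diff below:

        if (txn)
                pmu->start_txn(pmu);

        if (event_sched_in(group_event, cpuctx, ctx)) {
                if (txn)
                        pmu->cancel_txn(pmu);   /* leader failed: back out immediately */
                return -EAGAIN;
        }

        /* ... siblings scheduled here; pmu->commit_txn() on full success ... */

group_error:
        /* undo the partial group, then cancel the transaction last */
        event_sched_out(group_event, cpuctx, ctx);
        if (txn)
                pmu->cancel_txn(pmu);
        return -EAGAIN;
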
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1274796225.5882.1389.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 22 ++++++++++++++++++++++ kernel/perf_event.c | 11 +++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c77586061bc..5db5b7d65a1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -106,6 +106,7 @@ struct cpu_hw_events { int n_events; int n_added; + int n_txn; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ @@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event) out: cpuc->n_events = n; cpuc->n_added += n - n0; + cpuc->n_txn += n - n0; return 0; } @@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; + /* + * If we're called during a txn, we don't need to do anything. + * The events never got scheduled and ->cancel_txn will truncate + * the event_list. + */ + if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + return; + x86_pmu_stop(event); for (i = 0; i < cpuc->n_events; i++) { @@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); cpuc->group_flag |= PERF_EVENT_TXN_STARTED; + cpuc->n_txn = 0; } /* @@ -1391,6 +1402,11 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; + /* + * Truncate the collected events. + */ + cpuc->n_added -= cpuc->n_txn; + cpuc->n_events -= cpuc->n_txn; } /* @@ -1419,6 +1435,12 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); + /* + * Clear out the txn count so that ->cancel_txn() which gets + * run after ->commit_txn() doesn't undo things. + */ + cpuc->n_txn = 0; + return 0; } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 10a1aee2309..42a0e9191af 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -687,8 +687,11 @@ group_sched_in(struct perf_event *group_event, if (txn) pmu->start_txn(pmu); - if (event_sched_in(group_event, cpuctx, ctx)) + if (event_sched_in(group_event, cpuctx, ctx)) { + if (txn) + pmu->cancel_txn(pmu); return -EAGAIN; + } /* * Schedule in siblings as one group (if any): @@ -710,9 +713,6 @@ group_sched_in(struct perf_event *group_event, } group_error: - if (txn) - pmu->cancel_txn(pmu); - /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: @@ -724,6 +724,9 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); + if (txn) + pmu->cancel_txn(pmu); + return -EAGAIN; } -- cgit v1.2.3-70-g09d2 From 74048f895fa8cbf8119b4999f1f44881a825f954 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 May 2010 21:34:58 +0200 Subject: perf_events: Fix unincremented buffer base on partial copy If a sample size crosses to the next page boundary, the copy will be made in more than one step. However we forget to advance the source offset for the next copy, leading to unexpected double copies that completely mess up the traces. This fixes various kinds of bad traces that have irrelevant data inside, as an example: geany-4979 [001] 5758.077775: sched_switch: prev_comm=! 
prev_pid=121 prev_prio=0 prev_state=S|D|Z|X|x ==> next_comm= next_pid=7497072 next_prio=0 Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1274988898-5639-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 42a0e9191af..858f56fa243 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3064,6 +3064,7 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, len -= size; handle->addr += size; + buf += size; handle->size -= size; if (!handle->size) { struct perf_mmap_data *data = handle->data; -- cgit v1.2.3-70-g09d2 From 546cf44a1b507c1cbb5cf42bbe6169780567f36f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 29 May 2010 11:45:07 -0700 Subject: blktrace: Fix new kernel-doc warnings Fix blktrace.c kernel-doc warnings: Warning(kernel/trace/blktrace.c:858): No description found for parameter 'ignore' Warning(kernel/trace/blktrace.c:890): No description found for parameter 'ignore' Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20100529114507.c466fc1e.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 36ea2b65dcd..638711c1750 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -842,6 +842,7 @@ static void blk_add_trace_split(void *ignore, /** * blk_add_trace_remap - Add a trace for a remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio * @dev: target device @@ -873,6 +874,7 @@ static void blk_add_trace_remap(void *ignore, /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @rq: the source request * @dev: target device -- cgit v1.2.3-70-g09d2 From 293a7cfeedc2b2380a7c7274902323c3cf5f7575 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 31 May 2010 19:53:50 +0930 Subject: module: fix reference to mod->percpu after freeing module. Rafael sees a sometimes crash at precpu_modfree from kernel/module.c; it only occurred with another (since-reverted) patch, but that patch simply changed timing to uncover this bug, it was otherwise unrelated. The comment about the mod being freed is self-explanatory, but neither Tejun nor I read it. This bug was introduced in 259354deaa, after it had previously been fixed in 6e2b75740b. How embarrassing. Reported-by: "Rafael J. Wysocki" Signed-off-by: Rusty Russell Embarrassingly-Acked-by: Tejun Heo Cc: Masami Hiramatsu Tested-by: "Rafael J. 
Wysocki" Signed-off-by: Linus Torvalds --- kernel/module.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 333fbcc9697..d806e00e445 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2014,6 +2014,7 @@ static noinline struct module *load_module(void __user *umod, long err = 0; void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; + void __percpu *percpu; mm_segment_t old_fs; @@ -2158,6 +2159,8 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; } + /* Keep this around for failure path. */ + percpu = mod_percpu(mod); /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2463,7 +2466,7 @@ static noinline struct module *load_module(void __user *umod, module_free(mod, mod->module_core); /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: - percpu_modfree(mod); + free_percpu(percpu); free_mod: kfree(args); kfree(strmap); -- cgit v1.2.3-70-g09d2 From e51fd5e22e12b39f49b1bb60b37b300b17378a43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 May 2010 12:37:30 +0200 Subject: sched: Fix wake_affine() vs RT tasks Mike reports that since e9e9250b (sched: Scale down cpu_power due to RT tasks), wake_affine() goes funny on RT tasks due to them still having a !0 weight and wake_affine() still subtracts that from the rq weight. Since nobody should be using se->weight for RT tasks, set the value to zero. Also, since we now use ->cpu_power to normalize rq weights to account for RT cpu usage, add that factor into the imbalance computation. Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1275316109.27810.22969.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 ++++++------------------ kernel/sched_fair.c | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d4840814250..f8b8996228d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -544,6 +544,8 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned long cpu_power; + unsigned char idle_at_tick; /* For active balancing */ int post_schedule; @@ -1499,24 +1501,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - static unsigned long power_of(int cpu) { - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; + return cpu_rq(cpu)->cpu_power; } static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); @@ -1854,8 +1841,8 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; + p->se.load.weight = 0; + p->se.load.inv_weight = WMULT_CONST; return; } @@ -7605,6 +7592,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->cpu_power = SCHED_LOAD_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c 
index 217e4a9393e..eed35eded60 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; - unsigned int imbalance; struct task_group *tg; unsigned long weight; int balanced; @@ -1252,8 +1251,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) tg = task_group(p); weight = p->se.load.weight; - imbalance = 100 + (sd->imbalance_pct - 100) / 2; - /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll @@ -1263,9 +1260,21 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !this_load || - 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= - imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); + if (this_load) { + unsigned long this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; /* * If the currently running task will sleep within @@ -2298,6 +2307,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) if (!power) power = 1; + cpu_rq(cpu)->cpu_power = power; sdg->cpu_power = power; } -- cgit v1.2.3-70-g09d2 From 5c113fbeed7a5a192d8431a768965f8a45c16475 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 1 Jun 2010 12:15:11 +0100 Subject: fix cpu_chain section mismatch... In commit e9fb7631ebcd ("cpu-hotplug: introduce cpu_notify(), __cpu_notify(), cpu_notify_nofail()") the new helper functions access cpu_chain. As a result, it shouldn't be marked __cpuinitdata (via section mismatch warning). Alternatively, the helper functions should be forced inline, or marked __ref or __cpuinit. In the meantime, this patch silences the warning the trivial way. Signed-off-by: Daniel J Blueman Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8b92539b475..97d1b426a4a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,7 +34,7 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); +static RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock -- cgit v1.2.3-70-g09d2 From ff9da691c0498ff81fdd014e7a0731dab2337dac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jun 2010 14:54:39 +0200 Subject: pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface This changes the interface to be based on bytes instead. The API matches that of F_SETPIPE_SZ in that it rounds up the passed in size so that the resulting page array is a power-of-2 in size. The proc file is renamed to /proc/sys/fs/pipe-max-size to reflect this change. 
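From user space the change means F_SETPIPE_SZ now takes and reports bytes, with the kernel rounding the backing page array up to a power of two. A minimal illustration (hypothetical test program; assumes 4 KiB pages and a libc that exposes F_SETPIPE_SZ):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
        int fds[2], sz;

        if (pipe(fds))
                return 1;
        /* ask for 100000 bytes; 25 pages get rounded up to 32 pages */
        sz = fcntl(fds[1], F_SETPIPE_SZ, 100000);
        printf("requested 100000 bytes, got %d\n", sz); /* expect 131072 */
        return 0;
}
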
Signed-off-by: Jens Axboe --- fs/pipe.c | 54 ++++++++++++++++++++++++++++++++++++----------- include/linux/pipe_fs_i.h | 4 +++- kernel/sysctl.c | 8 +++---- 3 files changed, 49 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/fs/pipe.c b/fs/pipe.c index f98fae3e36b..69c4c7c13ea 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,9 +26,14 @@ /* * The max size that a non-root user is allowed to grow the pipe. Can - * be set by root in /proc/sys/fs/pipe-max-pages + * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; /* * We use a start+len construction, which provides full use of the @@ -1156,6 +1161,35 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) return nr_pages * PAGE_SIZE; } +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; @@ -1169,23 +1203,19 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: { - unsigned long nr_pages; + unsigned int size, nr_pages; - /* - * Currently the array must be a power-of-2 size, so adjust - * upwards if needed. 
- */ - nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = roundup_pow_of_two(nr_pages); + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; - if (!capable(CAP_SYS_RESOURCE) && nr_pages > pipe_max_pages) { + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; - } else if (nr_pages < 1) { + } else if (nr_pages < PAGE_SIZE) { ret = -EINVAL; goto out; } - ret = pipe_set_size(pipe, arg); + ret = pipe_set_size(pipe, nr_pages); break; } case F_GETPIPE_SZ: diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 16de3933c45..445796945ac 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -139,7 +139,9 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_pages; +extern unsigned int pipe_max_size, pipe_min_size; +int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 997080f00e0..d24f761f487 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = { }, #endif { - .procname = "pipe-max-pages", - .data = &pipe_max_pages, + .procname = "pipe-max-size", + .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &two, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, }, /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3-70-g09d2 From c6df8d5ab87a246942d138321e1721edbb69f6e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Jun 2010 11:21:20 +0200 Subject: perf: Fix crash in swevents Frederic reported that because swevents handling doesn't disable IRQs anymore, we can get a recursion of perf_adjust_period(), once from overflow handling and once from the tick. If both call ->disable, we get a double hlist_del_rcu() and trigger a LIST_POISON2 dereference. Since we don't actually need to stop/start a swevent to re-programm the hardware (lack of hardware to program), simply nop out these callbacks for the swevent pmu. Reported-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1275557609.27810.35218.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 858f56fa243..31d6afe9259 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4055,13 +4055,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_unthrottle(struct perf_event *event) -{ - /* - * Nothing to do, we already reset hwc->interrupts. 
- */ -} - static void perf_swevent_add(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) @@ -4276,11 +4269,22 @@ static void perf_swevent_disable(struct perf_event *event) hlist_del_rcu(&event->hlist_entry); } +static void perf_swevent_void(struct perf_event *event) +{ +} + +static int perf_swevent_int(struct perf_event *event) +{ + return 0; +} + static const struct pmu perf_ops_generic = { .enable = perf_swevent_enable, .disable = perf_swevent_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, .read = perf_swevent_read, - .unthrottle = perf_swevent_unthrottle, + .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ }; /* @@ -4561,8 +4565,10 @@ static int swevent_hlist_get(struct perf_event *event) static const struct pmu perf_ops_tracepoint = { .enable = perf_trace_enable, .disable = perf_trace_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, .read = perf_swevent_read, - .unthrottle = perf_swevent_unthrottle, + .unthrottle = perf_swevent_void, }; static int perf_tp_filter_match(struct perf_event *event, -- cgit v1.2.3-70-g09d2 From 485d527686850d68a0e9006dd9904f19f122485e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Jun 2010 14:14:58 -0700 Subject: sys_personality: change sys_personality() to accept "unsigned int" instead of u_long task_struct->pesonality is "unsigned int", but sys_personality() paths use "unsigned long pesonality". This means that every assignment or comparison is not right. In particular, if this argument does not fit into "unsigned int" __set_personality() changes the caller's personality and then sys_personality() returns -EINVAL. Turn this argument into "unsigned int" and avoid overflows. Obviously, this is the user-visible change, we just ignore the upper bits. But this can't break the sane application. There is another thing which can confuse the poorly written applications. User-space thinks that this syscall returns int, not long. This means that the returned value can be negative and look like the error code. But note that libc won't be confused and thus errno won't be set, and with this patch the user-space can never get -1 unless sys_personality() really fails. And, most importantly, the negative RET != -1 is only possible if that app previously called personality(RET). Pointed-out-by: Wenming Zhang Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/personality.h | 2 +- include/linux/syscalls.h | 2 +- kernel/exec_domain.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/personality.h b/include/linux/personality.h index 126120819a0..eec3bae164d 100644 --- a/include/linux/personality.h +++ b/include/linux/personality.h @@ -12,7 +12,7 @@ struct pt_regs; extern int register_exec_domain(struct exec_domain *); extern int unregister_exec_domain(struct exec_domain *); -extern int __set_personality(unsigned long); +extern int __set_personality(unsigned int); #endif /* __KERNEL__ */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a1a86a53bc7..7f614ce274a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -289,7 +289,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr); asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data); -asmlinkage long sys_personality(u_long personality); +asmlinkage long sys_personality(unsigned int personality); asmlinkage long sys_sigpending(old_sigset_t __user *set); asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set, diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c35452cadde..dd62f8e714c 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain; static DEFINE_RWLOCK(exec_domains_lock); -static u_long ident_map[32] = { +static unsigned long ident_map[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp) } static struct exec_domain * -lookup_exec_domain(u_long personality) +lookup_exec_domain(unsigned int personality) { - struct exec_domain * ep; - u_long pers = personality(personality); + unsigned int pers = personality(personality); + struct exec_domain *ep; read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality) #ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); - request_module("personality-%ld", pers); + request_module("personality-%d", pers); read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -135,7 +135,7 @@ unregister: } int -__set_personality(u_long personality) +__set_personality(unsigned int personality) { struct exec_domain *ep, *oep; @@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -SYSCALL_DEFINE1(personality, u_long, personality) +SYSCALL_DEFINE1(personality, unsigned int, personality) { - u_long old = current->personality; + unsigned int old = current->personality; if (personality != 0xffffffff) { set_personality(personality); @@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality) return -EINVAL; } - return (long)old; + return old; } -- cgit v1.2.3-70-g09d2 From 94b3dd0f7bb393d93e84a173b1df9b8b64c83ac4 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 4 Jun 2010 14:15:03 -0700 Subject: cgroups: alloc_css_id() increments hierarchy depth Child groups should have a greater depth than their parents. Prior to this change, the parent would incorrectly report zero memory usage for child cgroups when use_hierarchy is enabled. 
test script: mount -t cgroup none /cgroups -o memory cd /cgroups mkdir cg1 echo 1 > cg1/memory.use_hierarchy mkdir cg1/cg11 echo $$ > cg1/cg11/tasks dd if=/dev/zero of=/tmp/foo bs=1M count=1 echo echo CHILD grep cache cg1/cg11/memory.stat echo echo PARENT grep cache cg1/memory.stat echo $$ > tasks rmdir cg1/cg11 cg1 cd / umount /cgroups Using fae9c79, a recent patch that changed alloc_css_id() depth computation, the parent incorrectly reports zero usage: root@ubuntu:~# ./test 1+0 records in 1+0 records out 1048576 bytes (1.0 MB) copied, 0.0151844 s, 69.1 MB/s CHILD cache 1048576 total_cache 1048576 PARENT cache 0 total_cache 0 With this patch, the parent correctly includes child usage: root@ubuntu:~# ./test 1+0 records in 1+0 records out 1048576 bytes (1.0 MB) copied, 0.0136827 s, 76.6 MB/s CHILD cache 1052672 total_cache 1052672 PARENT cache 0 total_cache 1052672 Signed-off-by: Greg Thelen Acked-by: Paul Menage Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: [2.6.34.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 422cb19f156..3ac6f5b0a64 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4598,7 +4598,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; parent_id = parent_css->id; - depth = parent_id->depth; + depth = parent_id->depth + 1; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) -- cgit v1.2.3-70-g09d2 From 9e506f7adce8e6165a104d3d78fddd8ff0cdccf8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 4 Jun 2010 14:15:04 -0700 Subject: kernel/: fix BUG_ON checks for cpu notifier callbacks direct call The commit 80b5184cc537718122e036afe7e62d202b70d077 ("kernel/: convert cpu notifier to return encapsulate errno value") changed the return value of cpu notifier callbacks. Those callbacks don't return NOTIFY_BAD on failures anymore. But there are a few callbacks which are called directly at init time and checking the return value. I forgot to change BUG_ON checking by the direct callers in the commit. 
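Concretely, the direct init-time callers now have to treat anything other than NOTIFY_OK as fatal, because failures arrive as encapsulated errno values rather than NOTIFY_BAD:

        err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
        BUG_ON(err != NOTIFY_OK);       /* the old "err == NOTIFY_BAD" test could never trigger */
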
Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- kernel/timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 825e1126008..07b4f1b1a73 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void) void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/timer.c b/kernel/timer.c index 2454172a80d..ee305c8d4e1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1717,7 +1717,7 @@ void __init init_timers(void) init_timer_stats(); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.2.3-70-g09d2 From 2c02dfe7fe3fba97a5665d329d039d2415ea5607 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 31 May 2010 12:19:37 -0700 Subject: module: Make the 'usage' lists be two-way When adding a module that depends on another one, we used to create a one-way list of "modules_which_use_me", so that module unloading could see who needs a module. It's actually quite simple to make that list go both ways: so that we not only can see "who uses me", but also see a list of modules that are "used by me". In fact, we always wanted that list in "module_unload_free()": when we unload a module, we want to also release all the other modules that are used by that module. But because we didn't have that list, we used to first iterate over all modules, and then iterate over each "used by me" list of that module. By making the list two-way, we simplify module_unload_free(), and it allows for some trivial fixes later too. Signed-off-by: Linus Torvalds Signed-off-by: Rusty Russell (cleaned & rebased) --- include/linux/module.h | 4 ++- kernel/module.c | 79 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 6914fcad467..680db9e2ac3 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -359,7 +359,9 @@ struct module #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ - struct list_head modules_which_use_me; + struct list_head source_list; + /* What modules do I depend on? */ + struct list_head target_list; /* Who is waiting for us to be unloaded */ struct task_struct *waiter; diff --git a/kernel/module.c b/kernel/module.c index 0129769301e..be18c3e3468 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -523,7 +523,8 @@ static void module_unload_init(struct module *mod) { int cpu; - INIT_LIST_HEAD(&mod->modules_which_use_me); + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); for_each_possible_cpu(cpu) { per_cpu_ptr(mod->refptr, cpu)->incs = 0; per_cpu_ptr(mod->refptr, cpu)->decs = 0; @@ -538,8 +539,9 @@ static void module_unload_init(struct module *mod) /* modules using other modules */ struct module_use { - struct list_head list; - struct module *module_which_uses; + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; }; /* Does a already use b? 
*/ @@ -547,8 +549,8 @@ static int already_uses(struct module *a, struct module *b) { struct module_use *use; - list_for_each_entry(use, &b->modules_which_use_me, list) { - if (use->module_which_uses == a) { + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { DEBUGP("%s uses %s!\n", a->name, b->name); return 1; } @@ -557,6 +559,33 @@ static int already_uses(struct module *a, struct module *b) return 0; } +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). + */ +static int add_module_usage(struct module *a, struct module *b) +{ + int no_warn; + struct module_use *use; + + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk(KERN_WARNING "%s: out of memory loading\n", a->name); + return -ENOMEM; + } + + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); + return 0; +} + /* Module a uses b */ int use_module(struct module *a, struct module *b) { @@ -578,17 +607,11 @@ int use_module(struct module *a, struct module *b) if (err) return 0; - DEBUGP("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk("%s: out of memory loading\n", a->name); + err = add_module_usage(a, b); + if (err) { module_put(b); return 0; } - - use->module_which_uses = a; - list_add(&use->list, &b->modules_which_use_me); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } EXPORT_SYMBOL_GPL(use_module); @@ -596,22 +619,16 @@ EXPORT_SYMBOL_GPL(use_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) { - struct module *i; - - list_for_each_entry(i, &modules, list) { - struct module_use *use; + struct module_use *use, *tmp; - list_for_each_entry(use, &i->modules_which_use_me, list) { - if (use->module_which_uses == mod) { - DEBUGP("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->list); - kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); - /* There can be at most one match. */ - break; - } - } + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); + sysfs_remove_link(i->holders_dir, mod->name); } } @@ -735,7 +752,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, goto out; } - if (!list_empty(&mod->modules_which_use_me)) { + if (!list_empty(&mod->source_list)) { /* Other modules depend on us: get rid of them first. */ ret = -EWOULDBLOCK; goto out; @@ -799,9 +816,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. 
*/ - list_for_each_entry(use, &mod->modules_which_use_me, list) { + list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; - seq_printf(m, "%s,", use->module_which_uses->name); + seq_printf(m, "%s,", use->source->name); } if (mod->init != NULL && mod->exit == NULL) { -- cgit v1.2.3-70-g09d2 From c8e21ced08b39ef8dfe7236fb2a923a95f645262 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:35 -0600 Subject: module: fix kdb's illicit use of struct module_use. Linus changed the structure, and luckily this didn't compile any more. Reported-by: Stephen Rothwell Signed-off-by: Rusty Russell Cc: Jason Wessel Cc: Martin Hicks --- include/linux/module.h | 7 +++++++ kernel/debug/kdb/kdb_main.c | 12 +++--------- kernel/module.c | 11 +---------- 3 files changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 680db9e2ac3..5d8fca5dcff 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -181,6 +181,13 @@ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); #define symbol_get(x) ((typeof(&x))(__symbol_get(MODULE_SYMBOL_PREFIX #x))) +/* modules using other modules: kdb wants to see this. */ +struct module_use { + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; +}; + #ifndef __GENKSYMS__ #ifdef CONFIG_MODVERSIONS /* Mark the CRC weak since genksyms apparently decides not to diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b724c791b6d..184cd8209c3 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1857,12 +1857,6 @@ static int kdb_ef(int argc, const char **argv) } #if defined(CONFIG_MODULES) -/* modules using other modules */ -struct module_use { - struct list_head list; - struct module *module_which_uses; -}; - /* * kdb_lsmod - This function implements the 'lsmod' command. Lists * currently loaded kernel modules. @@ -1894,9 +1888,9 @@ static int kdb_lsmod(int argc, const char **argv) { struct module_use *use; kdb_printf(" [ "); - list_for_each_entry(use, &mod->modules_which_use_me, - list) - kdb_printf("%s ", use->module_which_uses->name); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); kdb_printf("]\n"); } #endif diff --git a/kernel/module.c b/kernel/module.c index be18c3e3468..bbb1d812c79 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -536,14 +536,6 @@ static void module_unload_init(struct module *mod) mod->waiter = current; } -/* modules using other modules */ -struct module_use -{ - struct list_head source_list; - struct list_head target_list; - struct module *source, *target; -}; - /* Does a already use b? */ static int already_uses(struct module *a, struct module *b) { @@ -589,8 +581,7 @@ static int add_module_usage(struct module *a, struct module *b) /* Module a uses b */ int use_module(struct module *a, struct module *b) { - struct module_use *use; - int no_warn, err; + int err; if (b == NULL || already_uses(a, b)) return 1; -- cgit v1.2.3-70-g09d2 From 80a3d1bb410e000e176931a076cdf19a1e89a955 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: move sysfs exposure to end of load_module This means a little extra work, but is more logical: we don't put anything in sysfs until we're about to put the module into the global list an parse its parameters. 
This also gives us a logical place to put duplicate module detection in the next patch. Signed-off-by: Rusty Russell --- kernel/module.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index bbb1d812c79..c690d988579 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -560,7 +560,6 @@ static int already_uses(struct module *a, struct module *b) */ static int add_module_usage(struct module *a, struct module *b) { - int no_warn; struct module_use *use; DEBUGP("Allocating new usage for %s.\n", a->name); @@ -574,7 +573,6 @@ static int add_module_usage(struct module *a, struct module *b) use->target = b; list_add(&use->source_list, &b->source_list); list_add(&use->target_list, &a->target_list); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 0; } @@ -619,7 +617,6 @@ static void module_unload_free(struct module *mod) list_del(&use->source_list); list_del(&use->target_list); kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); } } @@ -1303,6 +1300,29 @@ static inline void remove_notes_attrs(struct module *mod) #endif #ifdef CONFIG_SYSFS +static void add_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + int nowarn; + + list_for_each_entry(use, &mod->target_list, target_list) { + nowarn = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + } +#endif +} + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); +#endif +} + int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; @@ -1385,6 +1405,10 @@ int mod_sysfs_setup(struct module *mod, { int err; + err = mod_sysfs_init(mod); + if (err) + goto out; + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; @@ -1399,6 +1423,8 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; + add_usage_links(mod); + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1408,6 +1434,7 @@ out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: kobject_put(&mod->mkobj.kobj); +out: return err; } @@ -1422,10 +1449,15 @@ static void mod_sysfs_fini(struct module *mod) { } +static void del_usage_links(struct module *mod) +{ +} + #endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { + del_usage_links(mod); module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); @@ -2242,11 +2274,6 @@ static noinline struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); - /* add kobject, so we can reference it. 
*/ - err = mod_sysfs_init(mod); - if (err) - goto free_unload; - /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2443,6 +2470,7 @@ static noinline struct module *load_module(void __user *umod, err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; + add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); @@ -2461,9 +2489,6 @@ static noinline struct module *load_module(void __user *umod, module_arch_cleanup(mod); cleanup: free_modinfo(mod); - kobject_del(&mod->mkobj.kobj); - kobject_put(&mod->mkobj.kobj); - free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) free_percpu(mod->refptr); -- cgit v1.2.3-70-g09d2 From 6407ebb271fc34440b306f305e1efb7685eece26 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: Make module sysfs functions private. These were placed in the header in ef665c1a06 to get the various SYSFS/MODULE config combintations to compile. That may have been necessary then, but it's not now. These functions are all local to module.c. Signed-off-by: Rusty Russell Cc: Randy Dunlap --- include/linux/module.h | 33 --------------------------------- kernel/module.c | 29 +++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 5d8fca5dcff..8a6b9fdc7ff 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -672,43 +672,10 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) #endif /* CONFIG_MODULES */ -struct device_driver; #ifdef CONFIG_SYSFS -struct module; - extern struct kset *module_kset; extern struct kobj_type module_ktype; extern int module_sysfs_initialized; - -int mod_sysfs_init(struct module *mod); -int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params); -int module_add_modinfo_attrs(struct module *mod); -void module_remove_modinfo_attrs(struct module *mod); - -#else /* !CONFIG_SYSFS */ - -static inline int mod_sysfs_init(struct module *mod) -{ - return 0; -} - -static inline int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params) -{ - return 0; -} - -static inline int module_add_modinfo_attrs(struct module *mod) -{ - return 0; -} - -static inline void module_remove_modinfo_attrs(struct module *mod) -{ } - #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) diff --git a/kernel/module.c b/kernel/module.c index c690d988579..808aa18dd66 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1323,7 +1323,7 @@ static void del_usage_links(struct module *mod) #endif } -int module_add_modinfo_attrs(struct module *mod) +static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; struct module_attribute *temp_attr; @@ -1349,7 +1349,7 @@ int module_add_modinfo_attrs(struct module *mod) return error; } -void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { struct module_attribute *attr; int i; @@ -1365,7 +1365,7 @@ void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } -int mod_sysfs_init(struct module *mod) +static int mod_sysfs_init(struct module *mod) { int err; struct kobject *kobj; @@ -1399,7 +1399,7 @@ out: return err; } -int mod_sysfs_setup(struct module *mod, +static int 
mod_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { @@ -1445,6 +1445,27 @@ static void mod_sysfs_fini(struct module *mod) #else /* CONFIG_SYSFS */ +static inline int mod_sysfs_init(struct module *mod) +{ + return 0; +} + +static inline int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline int module_add_modinfo_attrs(struct module *mod) +{ + return 0; +} + +static inline void module_remove_modinfo_attrs(struct module *mod) +{ +} + static void mod_sysfs_fini(struct module *mod) { } -- cgit v1.2.3-70-g09d2 From 75676500f8298f0ee89db12db97294883c4b768e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: make locking more fine-grained. Kay Sievers reports that we still have some contention over module loading which is slowing boot. Linus also disliked a previous "drop lock and regrab" patch to fix the bne2 "gave up waiting for init of module libcrc32c" message. This is more ambitious: we only grab the lock where we need it. Signed-off-by: Rusty Russell Cc: Brandon Philips Cc: Kay Sievers Cc: Linus Torvalds --- kernel/module.c | 65 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 808aa18dd66..d293c213c22 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -72,7 +72,11 @@ /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* List of modules, protected by module_mutex or preempt_disable +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. * (delete uses stop_machine/add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); @@ -90,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address */ +/* Bounds of module allocation, for speeding __module_address. + * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } /* Find a symbol and return it, along with, (optional) crc and - * (optional) module which owns it */ + * (optional) module which owns it. Needs preempt disabled or module_mutex. 
*/ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const unsigned long **crc, @@ -576,7 +581,7 @@ static int add_module_usage(struct module *a, struct module *b) return 0; } -/* Module a uses b */ +/* Module a uses b: caller needs module_mutex() */ int use_module(struct module *a, struct module *b) { int err; @@ -610,6 +615,7 @@ static void module_unload_free(struct module *mod) { struct module_use *use, *tmp; + mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; DEBUGP("%s unusing %s\n", mod->name, i->name); @@ -618,6 +624,7 @@ static void module_unload_free(struct module *mod) list_del(&use->target_list); kfree(use); } + mutex_unlock(&module_mutex); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -784,13 +791,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); async_synchronize_full(); - mutex_lock(&module_mutex); + /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); ddebug_remove_module(mod->name); - free_module(mod); - out: + free_module(mod); + return 0; +out: mutex_unlock(&module_mutex); return ret; } @@ -1006,6 +1014,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; + /* Since this should be found in kernel (which can't be removed), + * no locking is necessary. */ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); @@ -1048,8 +1058,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ -/* Resolve a symbol for this module. I.e. if we find one, record usage. - Must be holding module_mutex. */ +/* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, @@ -1059,6 +1068,7 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, const struct kernel_symbol *sym; const unsigned long *crc; + mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); /* use_module can fail due to OOM, @@ -1068,6 +1078,7 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, || !use_module(mod, owner)) sym = NULL; } + mutex_unlock(&module_mutex); return sym; } @@ -1306,10 +1317,12 @@ static void add_usage_links(struct module *mod) struct module_use *use; int nowarn; + mutex_lock(&module_mutex); list_for_each_entry(use, &mod->target_list, target_list) { nowarn = sysfs_create_link(use->target->holders_dir, &mod->mkobj.kobj, mod->name); } + mutex_unlock(&module_mutex); #endif } @@ -1318,8 +1331,10 @@ static void del_usage_links(struct module *mod) #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; + mutex_lock(&module_mutex); list_for_each_entry(use, &mod->target_list, target_list) sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); #endif } @@ -1497,13 +1512,15 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module_mutex). */ +/* Free a module, remove from lists, etc. 
*/ static void free_module(struct module *mod) { trace_module_free(mod); /* Delete from various lists */ + mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -1575,7 +1592,14 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + const struct kernel_symbol *sym; + + /* Stopping preemption makes find_symbol safe. */ + preempt_disable(); + sym = find_symbol(s->name, &owner, NULL, true, false); + preempt_enable(); + + if (sym) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2021,11 +2045,13 @@ static void *module_alloc_update_bounds(unsigned long size) void *ret = module_alloc(size); if (ret) { + mutex_lock(&module_mutex); /* Update module bounds. */ if ((unsigned long)ret < module_addr_min) module_addr_min = (unsigned long)ret; if ((unsigned long)ret + size > module_addr_max) module_addr_max = (unsigned long)ret + size; + mutex_unlock(&module_mutex); } return ret; } @@ -2482,7 +2508,9 @@ static noinline struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ + mutex_lock(&module_mutex); list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) @@ -2504,8 +2532,10 @@ static noinline struct module *load_module(void __user *umod, return mod; unlink: + mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); cleanup: @@ -2556,19 +2586,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - /* Only one module load at a time, please */ - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - /* Do all the hard work */ mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) { - mutex_unlock(&module_mutex); + if (IS_ERR(mod)) return PTR_ERR(mod); - } - - /* Drop lock so they can recurse */ - mutex_unlock(&module_mutex); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -2585,9 +2606,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); free_module(mod); - mutex_unlock(&module_mutex); wake_up(&module_wq); return ret; } -- cgit v1.2.3-70-g09d2 From 3bafeb6247042dcbb72b0141ec7c7107de9f0b99 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: move find_module check to end I think Rusty may have made the lock a bit _too_ finegrained there, and didn't add it to some places that needed it. It looks, for example, like PATCH 1/2 actually drops the lock in places where it's needed ("find_module()" is documented to need it, but now load_module() didn't hold it at all when it did the find_module()). Rather than adding a new "module_loading" list, I think we should be able to just use the existing "modules" list, and just fix up the locking a bit. 
In fact, maybe we could just move the "look up existing module" a bit later - optimistically assuming that the module doesn't exist, and then just undoing the work if it turns out that we were wrong, just before adding ourselves to the list. Signed-off-by: Rusty Russell --- kernel/module.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d293c213c22..28450047852 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2226,11 +2226,6 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; } - if (find_module(mod->name)) { - err = -EEXIST; - goto free_mod; - } - mod->state = MODULE_STATE_COMING; /* Allow arches to frob section contents and sizes. */ @@ -2509,6 +2504,12 @@ static noinline struct module *load_module(void __user *umod, * The mutex protects against concurrent writers. */ mutex_lock(&module_mutex); + if (find_module(mod->name)) { + err = -EEXIST; + /* This will also unlock the mutex */ + goto already_exists; + } + list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2535,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + already_exists: mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); -- cgit v1.2.3-70-g09d2 From be593f4ce4eb1bd40e38fdc403371f149f6f12eb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:37 -0600 Subject: module: verify_export_symbols under the lock It disabled preempt so it was "safe", but nothing stops another module slipping in before this module is added to the global list now we don't hold the lock the whole time. So we check this just after we check for duplicate modules, and just before we put the module in the global list. (find_symbol finds symbols in coming and going modules, too). Signed-off-by: Rusty Russell --- kernel/module.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 28450047852..f99558e1945 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1571,6 +1571,8 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. */ static int verify_export_symbols(struct module *mod) { @@ -1592,14 +1594,7 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - const struct kernel_symbol *sym; - - /* Stopping preemption makes find_symbol safe. 
*/ - preempt_disable(); - sym = find_symbol(s->name, &owner, NULL, true, false); - preempt_enable(); - - if (sym) { + if (find_symbol(s->name, &owner, NULL, true, false)) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2440,11 +2435,6 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto cleanup; - /* Set up and sort exception table */ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -2506,10 +2496,14 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); if (find_module(mod->name)) { err = -EEXIST; - /* This will also unlock the mutex */ - goto already_exists; + goto unlock; } + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + if (err < 0) + goto unlock; + list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2536,7 +2530,7 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); - already_exists: + unlock: mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); -- cgit v1.2.3-70-g09d2 From 9bea7f23952d5948f8e5dfdff4de09bb9981fb5f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:37 -0600 Subject: module: fix bne2 "gave up waiting for init of module libcrc32c" Problem: it's hard to avoid an init routine stumbling over a request_module these days. And it's not clear it's always a bad idea: for example, a module like kvm with dynamic dependencies on kvm-intel or kvm-amd would be neater if it could simply request_module the right one. In this particular case, it's libcrc32c: libcrc32c_mod_init crypto_alloc_shash crypto_alloc_tfm crypto_find_alg crypto_alg_mod_lookup crypto_larval_lookup request_module If another module is waiting inside resolve_symbol() for libcrc32c to finish initializing (ie. bne2 depends on libcrc32c) then it does so holding the module lock, and our request_module() can't make progress until that is released. Waiting inside resolve_symbol() without the lock isn't all that hard: we just need to pass the -EBUSY up the call chain so we can sleep where we don't hold the lock. Error reporting is a bit trickier: we need to copy the name of the unfinished module before releasing the lock. Other notes: 1) This also fixes a theoretical issue where a weak dependency would allow symbol version mismatches to be ignored. 2) We rename use_module to ref_module to make life easier for the only external user (the out-of-tree ksplice patches). Signed-off-by: Rusty Russell Cc: Linus Torvalds Cc: Tim Abbot Tested-by: Brandon Philips --- kernel/module.c | 91 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f99558e1945..8c6b42840dd 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -582,33 +582,26 @@ static int add_module_usage(struct module *a, struct module *b) } /* Module a uses b: caller needs module_mutex() */ -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { int err; - if (b == NULL || already_uses(a, b)) return 1; - - /* If we're interrupted or time out, we fail. 
*/ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); + if (b == NULL || already_uses(a, b)) return 0; - } - /* If strong_try_module_get() returned a different error, we fail. */ + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); if (err) - return 0; + return err; err = add_module_usage(a, b); if (err) { module_put(b); - return 0; + return err; } - return 1; + return 0; } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -893,11 +886,11 @@ static inline void module_unload_free(struct module *mod) { } -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { - return strong_try_module_get(b) == 0; + return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); static inline void module_unload_init(struct module *mod) { @@ -1062,26 +1055,58 @@ static inline int same_magic(const char *amagic, const char *bmagic, static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, - struct module *mod) + struct module *mod, + char ownername[]) { struct module *owner; const struct kernel_symbol *sym; const unsigned long *crc; + int err; mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc, owner) - || !use_module(mod, owner)) - sym = NULL; + if (!sym) + goto unlock; + + if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + sym = ERR_PTR(-EINVAL); + goto getname; } + + err = ref_module(mod, owner); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + +getname: + /* We must make copy under the lock if we failed to get ref. */ + strncpy(ownername, module_name(owner), MODULE_NAME_LEN); +unlock: mutex_unlock(&module_mutex); return sym; } +static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *name, + struct module *mod) +{ + const struct kernel_symbol *ksym; + char ownername[MODULE_NAME_LEN]; + + if (wait_event_interruptible_timeout(module_wq, + !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, + mod, ownername)) || + PTR_ERR(ksym) != -EBUSY, + 30 * HZ) <= 0) { + printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", + mod->name, ownername); + } + return ksym; +} + /* * /sys/module/foo/sections stuff * J. Corbet @@ -1638,21 +1663,23 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); + ksym = resolve_symbol_wait(sechdrs, versindex, + strtab + sym[i].st_name, + mod); /* Ok if resolved. */ - if (ksym) { + if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; break; } /* Ok if weak. 
*/ - if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s\n", - mod->name, strtab + sym[i].st_name); - ret = -ENOENT; + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, strtab + sym[i].st_name, + PTR_ERR(ksym)); + ret = PTR_ERR(ksym) ?: -ENOENT; break; default: -- cgit v1.2.3-70-g09d2 From f6ab91add6355e231e1c47897027b2a6ee4fa268 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Jun 2010 15:18:01 +0200 Subject: perf: Fix signed comparison in perf_adjust_period() Frederic reported that frequency driven swevents didn't work properly and even caused a division-by-zero error. It turns out there are two bugs, the division-by-zero comes from a failure to deal with that in perf_calculate_period(). The other was more interesting and turned out to be a wrong comparison in perf_adjust_period(). The comparison was between an s64 and u64 and got implicitly converted to an unsigned comparison. The problem is that period_left is typically < 0, so it ended up being always true. Cure this by making the local period variables s64. Reported-by: Frederic Weisbecker Tested-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 31d6afe9259..ff86c558af4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1507,6 +1507,9 @@ do { \ divisor = nsec * frequency; } + if (!divisor) + return dividend; + return div64_u64(dividend, divisor); } @@ -1529,7 +1532,7 @@ static int perf_event_start(struct perf_event *event) static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; - u64 period, sample_period; + s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); -- cgit v1.2.3-70-g09d2
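[Aside, not part of the patch above: the signed/unsigned pitfall fixed in perf_adjust_period() is easy to reproduce in isolation. The standalone sketch below assumes nothing beyond standard C integer-conversion rules; variable names echo the changelog, not the kernel source.]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t period_left = -100;	/* typically negative, per the changelog */
	uint64_t sample_period = 50;

	/* With one u64 operand, period_left is converted to unsigned, a
	 * negative value becomes huge, and the test is always true. */
	if (period_left > sample_period)
		printf("unsigned comparison: taken even though period_left < 0\n");

	/* Keeping both operands signed (the s64 locals in the patch)
	 * gives the intended result. */
	if (period_left > (int64_t)sample_period)
		printf("signed comparison: taken\n");
	else
		printf("signed comparison: not taken\n");

	return 0;
}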