summaryrefslogtreecommitdiffstats
path: root/kernel/events
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events')
-rw-r--r--kernel/events/core.c190
-rw-r--r--kernel/events/internal.h35
-rw-r--r--kernel/events/ring_buffer.c126
-rw-r--r--kernel/events/uprobes.c223
4 files changed, 351 insertions, 223 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cb4238e85b3..72348dc192c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
-static atomic_t perf_sample_allowed_ns __read_mostly =
- ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+ DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
void update_perf_cpu_limits(void)
{
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
tmp *= sysctl_perf_cpu_time_max_percent;
do_div(tmp, 100);
- atomic_set(&perf_sample_allowed_ns, tmp);
+ ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
}
static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
* we detect that events are taking too long.
*/
#define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
+static DEFINE_PER_CPU(u64, running_sample_length);
void perf_sample_event_took(u64 sample_len_ns)
{
u64 avg_local_sample_len;
u64 local_samples_len;
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- if (atomic_read(&perf_sample_allowed_ns) == 0)
+ if (allowed_ns == 0)
return;
/* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
*/
avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
- if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+ if (avg_local_sample_len <= allowed_ns)
return;
if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
printk_ratelimited(KERN_WARNING
- "perf samples too long (%lld > %d), lowering "
+ "perf samples too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len,
- atomic_read(&perf_sample_allowed_ns),
+ avg_local_sample_len, allowed_ns,
sysctl_perf_event_sample_rate);
update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
ctx->parent_ctx = NULL;
}
+ ctx->generation++;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+
+ ctx->generation++;
}
/*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_DATA_SRC)
size += sizeof(data->data_src.val);
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ size += sizeof(data->txn);
+
event->header_size = size;
}
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
+
+ ctx->generation++;
}
static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
}
/*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
*/
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
- return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
- && ctx1->parent_gen == ctx2->parent_gen
- && !ctx1->pin_count && !ctx2->pin_count;
+ /* Pinning disables the swap optimization */
+ if (ctx1->pin_count || ctx2->pin_count)
+ return 0;
+
+ /* If ctx1 is the parent of ctx2 */
+ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+ return 1;
+
+ /* If ctx2 is the parent of ctx1 */
+ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+ return 1;
+
+ /*
+ * If ctx1 and ctx2 have the same parent; we flatten the parent
+ * hierarchy, see perf_event_init_context().
+ */
+ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+ ctx1->parent_gen == ctx2->parent_gen)
+ return 1;
+
+ /* Unmatched */
+ return 0;
}
static void __perf_event_sync_stat(struct perf_event *event,
@@ -2210,9 +2234,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
perf_event_update_userpage(next_event);
}
-#define list_next_entry(pos, member) \
- list_entry(pos->member.next, typeof(*pos), member)
-
static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
@@ -2244,7 +2265,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
- struct perf_event_context *parent;
+ struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
@@ -2256,10 +2277,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
return;
rcu_read_lock();
- parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_event_ctxp[ctxn];
- if (parent && next_ctx &&
- rcu_dereference(next_ctx->parent_ctx) == parent) {
+ if (!next_ctx)
+ goto unlock;
+
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+ /* If neither context have a parent context; they cannot be clones. */
+ if (!parent && !next_parent)
+ goto unlock;
+
+ if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
/*
* Looks like the two contexts are clones, so we might be
* able to optimize the context switch. We lock both
@@ -2287,6 +2316,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);
}
+unlock:
rcu_read_unlock();
if (do_switch) {
@@ -4572,6 +4602,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ perf_output_put(handle, data->txn);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5100,27 +5133,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- const char *name;
-
- memset(tmp, 0, sizeof(tmp));
+ char *name;
if (file) {
struct inode *inode;
dev_t dev;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ name = "//enomem";
+ goto cpy_name;
+ }
/*
- * d_path works from the end of the rb backwards, so we
+ * d_path() works from the end of the rb backwards, so we
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
- if (!buf) {
- name = strncpy(tmp, "//enomem", sizeof(tmp));
- goto got_name;
- }
- name = d_path(&file->f_path, buf, PATH_MAX);
+ name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
- name = strncpy(tmp, "//toolong", sizeof(tmp));
- goto got_name;
+ name = "//toolong";
+ goto cpy_name;
}
inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
@@ -5128,34 +5160,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
-
+ goto got_name;
} else {
- if (arch_vma_name(mmap_event->vma)) {
- name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp) - 1);
- tmp[sizeof(tmp) - 1] = '\0';
- goto got_name;
- }
+ name = (char *)arch_vma_name(vma);
+ if (name)
+ goto cpy_name;
- if (!vma->vm_mm) {
- name = strncpy(tmp, "[vdso]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+ if (vma->vm_start <= vma->vm_mm->start_brk &&
vma->vm_end >= vma->vm_mm->brk) {
- name = strncpy(tmp, "[heap]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+ name = "[heap]";
+ goto cpy_name;
+ }
+ if (vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack) {
- name = strncpy(tmp, "[stack]", sizeof(tmp));
- goto got_name;
+ name = "[stack]";
+ goto cpy_name;
}
- name = strncpy(tmp, "//anon", sizeof(tmp));
- goto got_name;
+ name = "//anon";
+ goto cpy_name;
}
+cpy_name:
+ strlcpy(tmp, name, sizeof(tmp));
+ name = tmp;
got_name:
- size = ALIGN(strlen(name)+1, sizeof(u64));
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(name)+1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ name[size++] = '\0';
mmap_event->file_name = name;
mmap_event->file_size = size;
@@ -5643,11 +5680,6 @@ static void swevent_hlist_put(struct perf_event *event)
{
int cpu;
- if (event->cpu != -1) {
- swevent_hlist_put_cpu(event, event->cpu);
- return;
- }
-
for_each_possible_cpu(cpu)
swevent_hlist_put_cpu(event, cpu);
}
@@ -5681,9 +5713,6 @@ static int swevent_hlist_get(struct perf_event *event)
int err;
int cpu, failed_cpu;
- if (event->cpu != -1)
- return swevent_hlist_get_cpu(event, event->cpu);
-
get_online_cpus();
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(event, cpu);
@@ -6292,6 +6321,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
+static DEVICE_ATTR_RO(type);
static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
@@ -6336,17 +6366,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
return count;
}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
-static struct device_attribute pmu_dev_attrs[] = {
- __ATTR_RO(type),
- __ATTR_RW(perf_event_mux_interval_ms),
- __ATTR_NULL,
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
.name = "event_source",
- .dev_attrs = pmu_dev_attrs,
+ .dev_groups = pmu_dev_groups,
};
static void pmu_dev_release(struct device *dev)
@@ -6767,6 +6799,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
+ /* disabled for now */
+ if (attr->mmap2)
+ return -EINVAL;
+
if (attr->__reserved_1)
return -EINVAL;
@@ -7122,7 +7158,6 @@ SYSCALL_DEFINE5(perf_event_open,
}
perf_install_in_context(ctx, event, event->cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -7205,7 +7240,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -7234,15 +7268,15 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
perf_remove_from_context(event);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
- list_add(&event->event_entry, &events);
+ list_add(&event->migrate_entry, &events);
}
mutex_unlock(&src_ctx->mutex);
synchronize_rcu();
mutex_lock(&dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &events, event_entry) {
- list_del(&event->event_entry);
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_del(&event->migrate_entry);
if (event->state >= PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_INACTIVE;
account_event_cpu(event, dst_cpu);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be..569b218782a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
}
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned int \
+static inline unsigned long \
func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned int len) \
+ const void *buf, unsigned long len) \
{ \
unsigned long size, written; \
\
do { \
- size = min_t(unsigned long, handle->size, len); \
- \
+ size = min(handle->size, len); \
written = memcpy_func(handle->addr, buf, size); \
+ written = size - written; \
\
len -= written; \
handle->addr += written; \
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
-static inline int memcpy_common(void *dst, const void *src, size_t n)
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
{
memcpy(dst, src, n);
- return n;
+ return 0;
}
DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
-#define MEMCPY_SKIP(dst, src, n) (n)
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+ return 0;
+}
-DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
#ifndef arch_perf_out_copy_user
-#define arch_perf_out_copy_user __copy_from_user_inatomic
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
+{
+ unsigned long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, n);
+ pagefault_enable();
+
+ return ret;
+}
#endif
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b..e8b168af135 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/circ_buf.h>
#include "internal.h"
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
- unsigned long offset, unsigned long head)
-{
- unsigned long sz = perf_data_size(rb);
- unsigned long mask = sz - 1;
-
- /*
- * check if user-writable
- * overwrite : over-write its own tail
- * !overwrite: buffer possibly drops events.
- */
- if (rb->overwrite)
- return true;
-
- /*
- * verify that payload is not bigger than buffer
- * otherwise masking logic may fail to detect
- * the "not enough space" condition
- */
- if ((head - offset) > sz)
- return false;
-
- offset = (offset - tail) & mask;
- head = (head - tail) & mask;
-
- if ((int)(head - offset) < 0)
- return false;
-
- return true;
-}
-
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLL_IN);
@@ -87,15 +57,36 @@ again:
goto out;
/*
- * Publish the known good head. Rely on the full barrier implied
- * by atomic_dec_and_test() order the rb->head read and this
- * write.
+ * Since the mmap() consumer (userspace) can run on a different CPU:
+ *
+ * kernel user
+ *
+ * READ ->data_tail READ ->data_head
+ * smp_mb() (A) smp_rmb() (C)
+ * WRITE $data READ $data
+ * smp_wmb() (B) smp_mb() (D)
+ * STORE ->data_head WRITE ->data_tail
+ *
+ * Where A pairs with D, and B pairs with C.
+ *
+ * I don't think A needs to be a full barrier because we won't in fact
+ * write data until we see the store from userspace. So we simply don't
+ * issue the data WRITE until we observe it. Be conservative for now.
+ *
+ * OTOH, D needs to be a full barrier since it separates the data READ
+ * from the tail WRITE.
+ *
+ * For B a WMB is sufficient since it separates two WRITEs, and for C
+ * an RMB is sufficient since it separates two READs.
+ *
+ * See perf_output_begin().
*/
+ smp_wmb();
rb->user_page->data_head = head;
/*
- * Now check if we missed an update, rely on the (compiler)
- * barrier in atomic_dec_and_test() to re-read rb->head.
+ * Now check if we missed an update -- rely on previous implied
+ * compiler barriers to force a re-read.
*/
if (unlikely(head != local_read(&rb->head))) {
local_inc(&rb->nest);
@@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
- int have_lost;
- struct perf_sample_data sample_data;
+ int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
@@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
event = event->parent;
rb = rcu_dereference(event->rb);
- if (!rb)
+ if (unlikely(!rb))
goto out;
- handle->rb = rb;
- handle->event = event;
-
- if (!rb->nr_pages)
+ if (unlikely(!rb->nr_pages))
goto out;
+ handle->rb = rb;
+ handle->event = event;
+
have_lost = local_read(&rb->lost);
- if (have_lost) {
- lost_event.header.size = sizeof(lost_event);
- perf_event_header__init_id(&lost_event.header, &sample_data,
- event);
- size += lost_event.header.size;
+ if (unlikely(have_lost)) {
+ size += sizeof(lost_event);
+ if (event->attr.sample_id_all)
+ size += event->id_header_size;
}
perf_output_get_handle(handle);
do {
- /*
- * Userspace could choose to issue a mb() before updating the
- * tail pointer. So that all reads will be completed before the
- * write is issued.
- */
tail = ACCESS_ONCE(rb->user_page->data_tail);
- smp_rmb();
offset = head = local_read(&rb->head);
- head += size;
- if (unlikely(!perf_output_space(rb, tail, offset, head)))
+ if (!rb->overwrite &&
+ unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
goto fail;
+ head += size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
- if (head - local_read(&rb->wakeup) > rb->watermark)
+ /*
+ * Separate the userpage->tail read from the data stores below.
+ * Matches the MB userspace SHOULD issue after reading the data
+ * and before storing the new tail position.
+ *
+ * See perf_output_put_handle().
+ */
+ smp_mb();
+
+ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
- handle->page = offset >> (PAGE_SHIFT + page_order(rb));
- handle->page &= rb->nr_pages - 1;
- handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
- handle->addr = rb->data_pages[handle->page];
- handle->addr += handle->size;
- handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+ page_shift = PAGE_SHIFT + page_order(rb);
- if (have_lost) {
+ handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+ offset &= (1UL << page_shift) - 1;
+ handle->addr = rb->data_pages[handle->page] + offset;
+ handle->size = (1UL << page_shift) - offset;
+
+ if (unlikely(have_lost)) {
+ struct perf_sample_data sample_data;
+
+ lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
+ perf_event_header__init_id(&lost_event.header,
+ &sample_data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, &sample_data);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bdca70..24b7d6ca871 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
#include <linux/uprobes.h>
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
* supported by that architecture then we need to modify is_trap_at_addr and
- * write_opcode accordingly. This would never be a problem for archs that
- * have fixed length instructions.
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
*/
/*
- * write_opcode - write the opcode at a given virtual address.
+ * uprobe_write_opcode - write the opcode at a given virtual address.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* For mm @mm, write the opcode at @vaddr.
* Return 0 (success) or a negative errno.
*/
-static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
*/
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
}
/**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+ return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
}
static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
return ret;
}
-static int
-__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
- unsigned long nbytes, loff_t offset)
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+ void *insn, int nbytes, loff_t offset)
{
struct page *page;
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
- struct address_space *mapping;
- unsigned long nbytes;
- int bytes;
-
- nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
- mapping = uprobe->inode->i_mapping;
+ struct address_space *mapping = uprobe->inode->i_mapping;
+ loff_t offs = uprobe->offset;
+ void *insn = uprobe->arch.insn;
+ int size = MAX_UINSN_BYTES;
+ int len, err = -EIO;
- /* Instruction at end of binary; copy only available bytes */
- if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
- bytes = uprobe->inode->i_size - uprobe->offset;
- else
- bytes = MAX_UINSN_BYTES;
+ /* Copy only available bytes, -EIO if nothing was read */
+ do {
+ if (offs >= i_size_read(uprobe->inode))
+ break;
- /* Instruction at the page-boundary; copy bytes in second page */
- if (nbytes < bytes) {
- int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
- bytes - nbytes, uprobe->offset + nbytes);
+ len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+ err = __copy_insn(mapping, filp, insn, len, offs);
if (err)
- return err;
- bytes = nbytes;
- }
- return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
+ break;
+
+ insn += len;
+ offs += len;
+ size -= len;
+ } while (size);
+
+ return err;
}
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* write_opcode() assumes we don't cross page boundary */
+ /* uprobe_write_opcode() assumes we don't cross page boundary */
BUG_ON((uprobe->offset & ~PAGE_MASK) +
UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
}
/* Slot allocation for XOL */
-static int xol_add_vma(struct xol_area *area)
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
- struct mm_struct *mm = current->mm;
int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
- ret = -ENOMEM;
- /* Try to map as high as possible, this is only a hint. */
- area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
- ret = area->vaddr;
- goto fail;
+ if (!area->vaddr) {
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+ PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
+ }
}
ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
- ret = 0;
fail:
up_write(&mm->mmap_sem);
return ret;
}
-/*
- * get_xol_area - Allocate process's xol_area if necessary.
- * This area will be used for storing instructions for execution out of line.
- *
- * Returns the allocated area or NULL.
- */
-static struct xol_area *get_xol_area(void)
+static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
- struct xol_area *area;
uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct xol_area *area;
- area = mm->uprobes_state.xol_area;
- if (area)
- goto ret;
-
- area = kzalloc(sizeof(*area), GFP_KERNEL);
+ area = kmalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
if (!area->page)
goto free_bitmap;
- /* allocate first slot of task's xol_area for the return probes */
+ area->vaddr = vaddr;
+ init_waitqueue_head(&area->wq);
+ /* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
atomic_set(&area->slot_count, 1);
- init_waitqueue_head(&area->wq);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
- if (!xol_add_vma(area))
+ if (!xol_add_vma(mm, area))
return area;
__free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
free_area:
kfree(area);
out:
+ return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ if (!mm->uprobes_state.xol_area)
+ __create_xol_area(0);
+
area = mm->uprobes_state.xol_area;
- ret:
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
return area;
}
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
return 0;
/* Initialize the slot */
- copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
+ copy_to_page(area->page, xol_vaddr,
+ uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
/*
* We probably need flush_icache_user_range() but it needs vma.
* This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
}
/*
- * Called in context of a new clone/fork from copy_process.
- */
-void uprobe_copy_process(struct task_struct *t)
-{
- t->utask = NULL;
-}
-
-/*
* Allocate a uprobe_task object for the task if if necessary.
* Called when the thread hits a breakpoint.
*
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
return current->utask;
}
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+ struct uprobe_task *n_utask;
+ struct return_instance **p, *o, *n;
+
+ n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!n_utask)
+ return -ENOMEM;
+ t->utask = n_utask;
+
+ p = &n_utask->return_instances;
+ for (o = o_utask->return_instances; o; o = o->next) {
+ n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ *n = *o;
+ atomic_inc(&n->uprobe->ref);
+ n->next = NULL;
+
+ *p = n;
+ p = &n->next;
+ n_utask->depth++;
+ }
+
+ return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n",
+ current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+ kfree(work);
+
+ if (current->flags & PF_EXITING)
+ return;
+
+ if (!__create_xol_area(current->utask->vaddr))
+ uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+ struct uprobe_task *utask = current->utask;
+ struct mm_struct *mm = current->mm;
+ struct callback_head *work;
+ struct xol_area *area;
+
+ t->utask = NULL;
+
+ if (!utask || !utask->return_instances)
+ return;
+
+ if (mm == t->mm && !(flags & CLONE_VFORK))
+ return;
+
+ if (dup_utask(t, utask))
+ return uprobe_warn(t, "dup ret instances");
+
+ /* The task can fork() after dup_xol_work() fails */
+ area = mm->uprobes_state.xol_area;
+ if (!area)
+ return uprobe_warn(t, "dup xol area");
+
+ if (mm == t->mm)
+ return;
+
+ /* TODO: move it into the union in uprobe_task */
+ work = kmalloc(sizeof(*work), GFP_KERNEL);
+ if (!work)
+ return uprobe_warn(t, "dup xol area");
+
+ t->utask->vaddr = area->vaddr;
+ init_task_work(work, dup_xol_work);
+ task_work_add(t, work, true);
+}
+
/*
* Current area->vaddr notion assume the trampoline address is always
* equal area->vaddr.
@@ -1857,9 +1941,4 @@ static int __init init_uprobes(void)
return register_die_notifier(&uprobe_exception_nb);
}
-module_init(init_uprobes);
-
-static void __exit exit_uprobes(void)
-{
-}
-module_exit(exit_uprobes);
+__initcall(init_uprobes);