From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Dec 2008 20:12:29 +0100 Subject: performance counters: core code Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/syscalls.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 04fb47bfb92..6cce728a626 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu); #endif -- cgit v1.2.3-70-g09d2 From eab656ae04b9d3b83265e3db01c0d2c46b748ef7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:26:59 +0100 Subject: perf counters: clean up 'raw' type API Impact: cleanup Introduce a separate hw_event type. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ include/linux/syscalls.h | 8 +++----- kernel/perf_counter.c | 15 ++++++++------- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5031b5614f2..daedd7d87c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -38,6 +38,7 @@ enum hw_event_types { * If this bit is set in the type, then trigger NMI sampling: */ PERF_COUNT_NMI = (1 << 30), + PERF_COUNT_RAW = (1 << 31), }; /* @@ -49,6 +50,12 @@ enum perf_record_type { PERF_RECORD_GROUP, }; +struct perf_counter_event { + u32 hw_event_type; + u32 hw_event_period; + u64 hw_raw_ctrl; +}; + /** * struct hw_perf_counter - performance counter hardware details */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6cce728a626..3ecd73d03da 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct perf_counter_event; #include #include @@ -625,9 +626,6 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu); +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 20508f05365..96c333a5b0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -734,26 +734,27 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) * @pid: target pid */ asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu) 
+sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd) { struct perf_counter_context *ctx; + struct perf_counter_event event; struct perf_counter *counter; int ret; + if (copy_from_user(&event, uevent, sizeof(event)) != 0) + return -EFAULT; + ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(hw_event_period, cpu, record_type); + counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, hw_event_type); + ret = hw_perf_counter_init(counter, event.hw_event_type); if (ret) goto err_free_put_context; -- cgit v1.2.3-70-g09d2 From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. 
) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++----- include/linux/perf_counter.h | 98 ++++++++++++++++++++++++-------------- include/linux/syscalls.h | 12 +++-- kernel/perf_counter.c | 38 ++++++++------- 4 files changed, 106 insertions(+), 71 deletions(-) (limited to 'include/linux/syscalls.h') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 30e7ebf7827..ef1936a871a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); */ int hw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; - u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 0; if (capable(CAP_SYS_ADMIN)) { hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event_type & PERF_COUNT_NMI) + if (hw_event->nmi) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->event.hw_event_period; + hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter) if (!hwc->irq_period) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -((s32) hwc->irq_period); + hwc->next_count = -(s32)hwc->irq_period; /* * Raw event type provide the config in the event structure */ - hw_event_type &= ~PERF_COUNT_NMI; - if (hw_event_type == PERF_COUNT_RAW) { - hwc->config |= counter->event.hw_raw_ctrl; + if (hw_event->raw) { + hwc->config |= 
hw_event->type; } else { - if (hw_event_type >= max_intel_perfmon_events) + if (hw_event->type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event->type]; } counter->wakeup_pending = 0; @@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) int bit; list_for_each_entry(counter, &ctx->counters, list) { - if (counter->record_type != PERF_RECORD_SIMPLE || + if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || counter == leader) continue; @@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->event.hw_event_type); + perf_store_irq_data(leader, counter->hw_event.type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -410,7 +409,7 @@ again: perf_save_and_restart(counter); - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: continue; case PERF_RECORD_IRQ: @@ -418,7 +417,7 @@ again: break; case PERF_RECORD_GROUP: perf_store_irq_data(counter, - counter->event.hw_event_type); + counter->hw_event.type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1f0017673e7..a2b4852e2d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,65 +24,93 @@ struct task_struct; /* - * Generalized hardware event types, used by the hw_event_type parameter - * of the sys_perf_counter_open() syscall: + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: */ enum hw_event_types { - PERF_COUNT_CYCLES, - PERF_COUNT_INSTRUCTIONS, - PERF_COUNT_CACHE_REFERENCES, - 
PERF_COUNT_CACHE_MISSES, - PERF_COUNT_BRANCH_INSTRUCTIONS, - PERF_COUNT_BRANCH_MISSES, /* - * If this bit is set in the type, then trigger NMI sampling: + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_NMI = (1 << 30), - PERF_COUNT_RAW = (1 << 31), + PERF_COUNT_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* * IRQ-notification data record type: */ -enum perf_record_type { - PERF_RECORD_SIMPLE, - PERF_RECORD_IRQ, - PERF_RECORD_GROUP, +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; -struct perf_counter_event { - u32 hw_event_type; - u32 hw_event_period; - u64 hw_raw_ctrl; +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + u64 type; + + u64 irq_period; + u32 record_type; + + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + __reserved_1 : 29; + + u64 __reserved_2; }; +/* + * Kernel-internal data types: + */ + /** - * struct hw_perf_counter - performance counter hardware details + * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - u64 prev_count; - s32 next_count; - u64 irq_period; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + u64 irq_period; + s32 
next_count; }; /* * Hardcoded buffer length limit for now, for IRQ-fed events: */ -#define PERF_DATA_BUFLEN 2048 +#define PERF_DATA_BUFLEN 2048 /** * struct perf_data - performance counter IRQ data sampling ... */ struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; }; /** @@ -96,7 +124,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -110,8 +138,6 @@ struct perf_counter { int oncpu; int cpu; - enum perf_record_type record_type; - /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3ecd73d03da..a549678b7c3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,7 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; -struct perf_counter_event; +struct perf_counter_hw_event; #include #include @@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd); + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2557c670a3b..0d323ceda3a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: return 
perf_read_hw(counter, buf, count); @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) INIT_LIST_HEAD(&counter->list); init_waitqueue_head(&counter->waitq); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; - counter->cpu = cpu; - counter->record_type = record_type; - counter->event = *event; - counter->wakeup_pending = 0; + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->hw_event = *hw_event; + counter->wakeup_pending = 0; return counter; } /** - * sys_perf_task_open - open a performance counter associate it to a task - * @hw_event_type: event type for monitoring/sampling... 
+ * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * + * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd) +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd) + { struct perf_counter_context *ctx; - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct perf_counter *counter; int ret; - if (copy_from_user(&event, uevent, sizeof(event)) != 0) + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; ctx = find_get_context(pid, cpu); @@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&event, cpu, record_type); + counter = perf_counter_alloc(&hw_event, cpu); if (!counter) goto err_put_context; -- cgit v1.2.3-70-g09d2 From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Feb 2009 22:43:46 +1100 Subject: perfcounters: fix a few minor cleanliness issues This fixes three issues noticed by Arnd Bergmann: - Add #ifdef __KERNEL__ and move some things around in perf_counter.h to make sure only the bits that userspace needs are exported to userspace. - Use __u64, __s64, __u32 types in the structs exported to userspace rather than u64, s64, u32. - Make the sys_perf_counter_open syscall available to the SPUs on Cell platforms. And one issue that I noticed in looking at the code again: - Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get the proper handling of int arguments on ppc64 (and some other 64-bit architectures). 
Reported-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/systbl.h | 2 +- include/linux/perf_counter.h | 43 +++++++++++++++++++++------------------ include/linux/syscalls.h | 9 +++----- kernel/perf_counter.c | 6 +++--- 4 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/linux/syscalls.h') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 4c8095f6bec..d312eec8abb 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,4 +322,4 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) -SYSCALL(perf_counter_open) +SYSCALL_SPU(perf_counter_open) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 32cd1acb738..186efaf4966 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -13,20 +13,8 @@ #ifndef _LINUX_PERF_COUNTER_H #define _LINUX_PERF_COUNTER_H -#include -#include - -#ifdef CONFIG_PERF_COUNTERS -# include -#endif - -#include -#include -#include -#include -#include - -struct task_struct; +#include +#include /* * User-space ABI bits: @@ -78,12 +66,12 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - s64 type; + __s64 type; - u64 irq_period; - u32 record_type; + __u64 irq_period; + __u32 record_type; - u32 disabled : 1, /* off by default */ + __u32 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -95,7 +83,7 @@ struct perf_counter_hw_event { __reserved_1 : 23; - u64 __reserved_2; + __u64 __reserved_2; }; /* @@ -104,10 +92,24 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#ifdef __KERNEL__ /* - * Kernel-internal data types: + * Kernel-internal data types and definitions: */ +#ifdef CONFIG_PERF_COUNTERS +# include 
+#endif + +#include +#include +#include +#include +#include +#include + +struct task_struct; + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -293,4 +295,5 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif +#endif /* __KERNEL__ */ #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 88255d3261a..28ef2be839c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -696,10 +696,7 @@ asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd); +asmlinkage long sys_perf_counter_open( + const struct perf_counter_hw_event __user *hw_event_uptr, + pid_t pid, int cpu, int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ad62965828d..16b14ba99d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1690,9 +1690,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, int cpu, int group_fd) +SYSCALL_DEFINE4(perf_counter_open, + const struct perf_counter_hw_event __user *, hw_event_uptr, + pid_t, pid, int, cpu, int, group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; -- cgit v1.2.3-70-g09d2 From 2743a5b0fa6f309da904f2190a9cc25deee34dbd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 4 Mar 2009 20:36:51 +1100 Subject: perfcounters: provide expansion room in the ABI Impact: ABI change This expands several fields in the perf_counter_hw_event struct and adds a "flags" argument to the perf_counter_open system call, in order that 
features can be added in future without ABI changes. In particular the record_type field is expanded to 64 bits, and the space for flag bits has been expanded from 32 to 64 bits. This also adds some new fields: * read_format (64 bits) is intended to provide a way to specify what userspace wants to get back when it does a read() on a simple (non-interrupting) counter; * exclude_idle (1 bit) provides a way for userspace to ask that events that occur when the cpu is idle be excluded; * extra_config_len will provide a way for userspace to supply an arbitrary amount of extra machine-specific PMU configuration data immediately following the perf_counter_hw_event struct, to allow sophisticated users to program things such as instruction matching CAMs and address range registers; * __reserved_3 and __reserved_4 provide space for future expansion. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 12 +++++++++--- include/linux/syscalls.h | 2 +- kernel/perf_counter.c | 10 +++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 186efaf4966..c42455ab155 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -69,9 +69,10 @@ struct perf_counter_hw_event { __s64 type; __u64 irq_period; - __u32 record_type; + __u64 record_type; + __u64 read_format; - __u32 disabled : 1, /* off by default */ + __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -80,10 +81,15 @@ struct perf_counter_hw_event { exclude_user : 1, /* don't count user */ exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 23; + __reserved_1 : 55; + + __u32 extra_config_len; + __u32 __reserved_4; __u64 __reserved_2; + __u64 __reserved_3; }; /* diff --git a/include/linux/syscalls.h 
b/include/linux/syscalls.h index 28ef2be839c..ab1d7724739 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -698,5 +698,5 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( const struct perf_counter_hw_event __user *hw_event_uptr, - pid_t pid, int cpu, int group_fd); + pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 16b14ba99d3..b2e838959f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1683,16 +1683,16 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, } /** - * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ -SYSCALL_DEFINE4(perf_counter_open, +SYSCALL_DEFINE5(perf_counter_open, const struct perf_counter_hw_event __user *, hw_event_uptr, - pid_t, pid, int, cpu, int, group_fd) + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; @@ -1703,6 +1703,10 @@ SYSCALL_DEFINE4(perf_counter_open, int fput_needed2 = 0; int ret; + /* for future expandability... */ + if (flags) + return -EINVAL; + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; -- cgit v1.2.3-70-g09d2 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 38 ++++++------ arch/x86/kernel/cpu/perf_counter.c | 16 ++--- include/linux/perf_counter.h | 34 +++++------ include/linux/syscalls.h | 4 +- kernel/perf_counter.c | 116 ++++++++++++++++++------------------- 5 files changed, 104 insertions(+), 104 deletions(-) (limited to 'include/linux/syscalls.h') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c9633321e7a..ea54686cb78 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -262,13 +262,13 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], } counter = ctrs[i]; if (first) { - eu = counter->hw_event.exclude_user; - ek = counter->hw_event.exclude_kernel; - eh = counter->hw_event.exclude_hv; + eu = counter->attr.exclude_user; + ek = counter->attr.exclude_kernel; + eh = counter->attr.exclude_hv; first = 0; - } else if (counter->hw_event.exclude_user != eu || - counter->hw_event.exclude_kernel != ek || - counter->hw_event.exclude_hv != eh) { + } else if (counter->attr.exclude_user != eu || + counter->attr.exclude_kernel != ek || + counter->attr.exclude_hv != eh) { return -EAGAIN; } } @@ -483,16 +483,16 @@ void hw_perf_enable(void) /* * Add in MMCR0 freeze bits corresponding to the - * hw_event.exclude_* bits for the first counter. + * attr.exclude_* bits for the first counter. * We have already checked that all counters have the * same values for these bits as the first counter. 
*/ counter = cpuhw->counter[0]; - if (counter->hw_event.exclude_user) + if (counter->attr.exclude_user) cpuhw->mmcr[0] |= MMCR0_FCP; - if (counter->hw_event.exclude_kernel) + if (counter->attr.exclude_kernel) cpuhw->mmcr[0] |= freeze_counters_kernel; - if (counter->hw_event.exclude_hv) + if (counter->attr.exclude_hv) cpuhw->mmcr[0] |= MMCR0_FCHV; /* @@ -786,10 +786,10 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, int n; u64 alt[MAX_EVENT_ALTERNATIVES]; - if (counter->hw_event.exclude_user - || counter->hw_event.exclude_kernel - || counter->hw_event.exclude_hv - || counter->hw_event.sample_period) + if (counter->attr.exclude_user + || counter->attr.exclude_kernel + || counter->attr.exclude_hv + || counter->attr.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -855,13 +855,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->hw_event)) { - ev = perf_event_id(&counter->hw_event); + if (!perf_event_raw(&counter->attr)) { + ev = perf_event_id(&counter->attr); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->hw_event); + ev = perf_event_config(&counter->attr); } counter->hw.config_base = ev; counter->hw.idx = 0; @@ -872,7 +872,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) * the user set it to. */ if (!firmware_has_feature(FW_FEATURE_LPAR)) - counter->hw_event.exclude_hv = 0; + counter->attr.exclude_hv = 0; /* * If this is a per-task counter, then we can use @@ -990,7 +990,7 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { addr = 0; - if (counter->hw_event.record_type & PERF_RECORD_ADDR) { + if (counter->attr.record_type & PERF_RECORD_ADDR) { /* * The user wants a data address recorded. 
* If we're not doing instruction sampling, diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 904571bea71..e16e8c13132 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void) } /* - * Setup the hardware configuration for a given hw_event_type + * Setup the hardware configuration for a given attr_type */ static int __hw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; + struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; int err; @@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Count user and OS events unless requested not to. */ - if (!hw_event->exclude_user) + if (!attr->exclude_user) hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!hw_event->exclude_kernel) + if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; if (!hwc->sample_period) @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); + if (perf_event_raw(attr)) { + hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); } else { - if (perf_event_id(hw_event) >= x86_pmu.max_events) + if (perf_event_id(attr) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(attr)); } counter->destroy = hw_perf_counter_destroy; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 45bdd3b95d3..37d5541d74c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -22,7 +22,7 @@ */ /* - * hw_event.type + * attr.type */ enum perf_event_types { PERF_TYPE_HARDWARE = 0, @@ 
-37,10 +37,10 @@ enum perf_event_types { }; /* - * Generalized performance counter event types, used by the hw_event.event_id + * Generalized performance counter event types, used by the attr.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum hw_event_ids { +enum attr_ids { /* * Common hardware events, generalized by the kernel: */ @@ -94,7 +94,7 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.sample_type to request information + * Bits that can be set in attr.sample_type to request information * in the overflow packets. */ enum perf_counter_sample_format { @@ -109,7 +109,7 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in hw_event.read_format to request that + * Bits that can be set in attr.read_format to request that * reads on the counter should return the indicated quantities, * in increasing order of bit value, after the counter value. */ @@ -122,7 +122,7 @@ enum perf_counter_read_format { /* * Hardware event to monitor via a performance monitoring counter: */ -struct perf_counter_hw_event { +struct perf_counter_attr { /* * The MSB of the config word signifies if the rest contains cpu * specific (raw) counter configuration data, if unset, the next @@ -323,25 +323,25 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_raw(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_RAW_MASK; + return attr->config & PERF_COUNTER_RAW_MASK; } -static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_config(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_CONFIG_MASK; + return attr->config & PERF_COUNTER_CONFIG_MASK; } -static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_type(struct perf_counter_attr *attr) { - return 
(hw_event->config & PERF_COUNTER_TYPE_MASK) >> + return (attr->config & PERF_COUNTER_TYPE_MASK) >> PERF_COUNTER_TYPE_SHIFT; } -static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_id(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_EVENT_MASK; + return attr->config & PERF_COUNTER_EVENT_MASK; } /** @@ -457,7 +457,7 @@ struct perf_counter { u64 tstamp_running; u64 tstamp_stopped; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -605,8 +605,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->hw_event) && - perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->attr) && + perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 79faae950e2..c6c84ad8bd7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,7 +55,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; -struct perf_counter_hw_event; +struct perf_counter_attr; #include #include @@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( - const struct perf_counter_hw_event __user *hw_event_uptr, + const struct perf_counter_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index abe2f3b6c42..317cef78a38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -260,7 +260,7 @@ counter_sched_out(struct perf_counter *counter, if (!is_software_counter(counter)) cpuctx->active_oncpu--; ctx->nr_active--; - 
if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + if (counter->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -282,7 +282,7 @@ group_sched_out(struct perf_counter *group_counter, list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); - if (group_counter->hw_event.exclusive) + if (group_counter->attr.exclusive) cpuctx->exclusive = 0; } @@ -550,7 +550,7 @@ counter_sched_in(struct perf_counter *counter, cpuctx->active_oncpu++; ctx->nr_active++; - if (counter->hw_event.exclusive) + if (counter->attr.exclusive) cpuctx->exclusive = 1; return 0; @@ -642,7 +642,7 @@ static int group_can_go_on(struct perf_counter *counter, * If this group is exclusive and there are already * counters on the CPU, it can't go on. */ - if (counter->hw_event.exclusive && cpuctx->active_oncpu) + if (counter->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -725,7 +725,7 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -849,7 +849,7 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -927,7 +927,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) /* * not supported on inherited counters */ - if (counter->hw_event.inherit) + if (counter->attr.inherit) return -EINVAL; atomic_add(refresh, &counter->event_limit); @@ -1094,7 +1094,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state <= PERF_COUNTER_STATE_OFF || - 
!counter->hw_event.pinned) + !counter->attr.pinned) continue; if (counter->cpu != -1 && counter->cpu != cpu) continue; @@ -1122,7 +1122,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * ignore pinned counters since we did them already. */ if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->hw_event.pinned) + counter->attr.pinned) continue; /* @@ -1204,11 +1204,11 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || !counter->hw_event.sample_freq) + if (!counter->attr.freq || !counter->attr.sample_freq) continue; events = HZ * interrupts * counter->hw.sample_period; - period = div64_u64(events, counter->hw_event.sample_freq); + period = div64_u64(events, counter->attr.sample_freq); delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; @@ -1444,11 +1444,11 @@ static void free_counter(struct perf_counter *counter) perf_pending_sync(counter); atomic_dec(&nr_counters); - if (counter->hw_event.mmap) + if (counter->attr.mmap) atomic_dec(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_dec(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_dec(&nr_comm_tracking); if (counter->destroy) @@ -1504,13 +1504,13 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + atomic64_read(&counter->child_total_time_enabled); - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - if (counter->hw_event.read_format & PERF_FORMAT_ID) + if 
(counter->attr.read_format & PERF_FORMAT_ID) values[n++] = counter->id; mutex_unlock(&counter->child_mutex); @@ -1611,7 +1611,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) int ret = 0; u64 value; - if (!counter->hw_event.sample_period) + if (!counter->attr.sample_period) return -EINVAL; size = copy_from_user(&value, arg, sizeof(value)); @@ -1622,15 +1622,15 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) return -EINVAL; spin_lock_irq(&ctx->lock); - if (counter->hw_event.freq) { + if (counter->attr.freq) { if (value > sysctl_perf_counter_limit) { ret = -EINVAL; goto unlock; } - counter->hw_event.sample_freq = value; + counter->attr.sample_freq = value; } else { - counter->hw_event.sample_period = value; + counter->attr.sample_period = value; counter->hw.sample_period = value; perf_log_period(counter, value); @@ -2299,7 +2299,7 @@ static void perf_output_end(struct perf_output_handle *handle) struct perf_counter *counter = handle->counter; struct perf_mmap_data *data = handle->data; - int wakeup_events = counter->hw_event.wakeup_events; + int wakeup_events = counter->attr.wakeup_events; if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&data->events); @@ -2339,7 +2339,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 sample_type = counter->hw_event.sample_type; + u64 sample_type = counter->attr.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2441,7 +2441,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, addr); if (sample_type & PERF_SAMPLE_CONFIG) - perf_output_put(&handle, counter->hw_event.config); + perf_output_put(&handle, counter->attr.config); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); @@ -2512,7 +2512,7 @@ static void perf_counter_comm_output(struct perf_counter *counter, static int 
perf_counter_comm_match(struct perf_counter *counter, struct perf_comm_event *comm_event) { - if (counter->hw_event.comm && + if (counter->attr.comm && comm_event->event.header.type == PERF_EVENT_COMM) return 1; @@ -2623,11 +2623,11 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->hw_event.mmap && + if (counter->attr.mmap && mmap_event->event.header.type == PERF_EVENT_MMAP) return 1; - if (counter->hw_event.munmap && + if (counter->attr.munmap && mmap_event->event.header.type == PERF_EVENT_MUNMAP) return 1; @@ -2907,8 +2907,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. */ - if ((counter->hw_event.exclude_kernel || !regs) && - !counter->hw_event.exclude_user) + if ((counter->attr.exclude_kernel || !regs) && + !counter->attr.exclude_user) regs = task_pt_regs(current); if (regs) { @@ -2982,14 +2982,14 @@ static int perf_swcounter_match(struct perf_counter *counter, if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->hw_event.config != event_config) + if (counter->attr.config != event_config) return 0; if (regs) { - if (counter->hw_event.exclude_user && user_mode(regs)) + if (counter->attr.exclude_user && user_mode(regs)) return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) + if (counter->attr.exclude_kernel && !user_mode(regs)) return 0; } @@ -3252,12 +3252,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->hw_event)); + ftrace_profile_disable(perf_event_id(&counter->attr)); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->hw_event); + int event_id = 
perf_event_id(&counter->attr); int ret; ret = ftrace_profile_enable(event_id); @@ -3265,7 +3265,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->hw_event.sample_period; + counter->hw.sample_period = counter->attr.sample_period; return &perf_ops_generic; } @@ -3287,7 +3287,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->hw_event)) { + switch (perf_event_id(&counter->attr)) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3319,7 +3319,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, +perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, @@ -3352,36 +3352,36 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); counter->cpu = cpu; - counter->hw_event = *hw_event; + counter->attr = *attr; counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; counter->oncpu = -1; counter->state = PERF_COUNTER_STATE_INACTIVE; - if (hw_event->disabled) + if (attr->disabled) counter->state = PERF_COUNTER_STATE_OFF; pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); + if (attr->freq && attr->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); else - hwc->sample_period = hw_event->sample_period; + hwc->sample_period = attr->sample_period; /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) + if 
(attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(hw_event)) { + if (perf_event_raw(attr)) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(hw_event)) { + switch (perf_event_type(attr)) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; @@ -3409,11 +3409,11 @@ done: counter->pmu = pmu; atomic_inc(&nr_counters); - if (counter->hw_event.mmap) + if (counter->attr.mmap) atomic_inc(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_inc(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_inc(&nr_comm_tracking); return counter; @@ -3424,17 +3424,17 @@ static atomic64_t perf_counter_id; /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * - * @hw_event_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_hw_event __user *, hw_event_uptr, + const struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct perf_counter_context *ctx; struct file *counter_file = NULL; struct file *group_file = NULL; @@ -3446,7 +3446,7 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) + if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; /* @@ -3484,11 +3484,11 @@ SYSCALL_DEFINE5(perf_counter_open, /* * Only a group leader can be exclusive or pinned */ - if (hw_event.exclusive || hw_event.pinned) + if (attr.exclusive || attr.pinned) goto err_put_context; } - counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + counter = 
perf_counter_alloc(&attr, cpu, ctx, group_leader, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) @@ -3556,7 +3556,7 @@ inherit_counter(struct perf_counter *parent_counter, if (parent_counter->parent) parent_counter = parent_counter->parent; - child_counter = perf_counter_alloc(&parent_counter->hw_event, + child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) @@ -3565,7 +3565,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, + * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_counter_{en, dis}able_family. */ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) @@ -3582,7 +3582,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * inherit into child's child as well: */ - child_counter->hw_event.inherit = 1; + child_counter->attr.inherit = 1; /* * Get a reference to the parent filp - we will fput it @@ -3838,7 +3838,7 @@ int perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) { + if (!counter->attr.inherit) { inherited_all = 0; continue; } -- cgit v1.2.3-70-g09d2 From 974802eaa1afdc87e00821df7020a2b3c6fee623 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 Jun 2009 12:46:55 +0200 Subject: perf_counter: Add forward/backward attribute ABI compatibility Provide for means of extending the perf_counter_attr in a 'natural' way. We allow growing the structure by appending fields at the end by specifying the full structure size inside it. When a new kernel sees a smaller (old) structure, it will 0 pad the tail. When an old kernel sees a larger (new) structure, it will verify the tail consists of 0s, otherwise fail. 
If we fail due to a size-mismatch, we return -E2BIG and write the kernel's native attribute size back into the provided structure. Furthermore, add some attribute verification, so that we'll fail counter creation when unknown bits are present (PERF_SAMPLE, PERF_FORMAT, or in the __reserved fields). (This ABI detail is introduced while keeping the existing syscall ABI.) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 19 ++++++++-- include/linux/syscalls.h | 2 +- kernel/perf_counter.c | 89 ++++++++++++++++++++++++++++++++++++++++++-- tools/perf/perf.h | 5 ++- 4 files changed, 105 insertions(+), 10 deletions(-) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7c4f32f6ae1..1b3118a1023 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -120,6 +120,8 @@ enum perf_counter_sample_format { PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, + + PERF_SAMPLE_MAX = 1U << 9, /* non-ABI */ }; /* @@ -131,17 +133,26 @@ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + + PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ }; +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_attr { + /* * Major type: hardware/software/tracepoint/etc. */ __u32 type; - __u32 __reserved_1; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; /* * Type specific configuration information.
@@ -168,12 +179,12 @@ struct perf_counter_attr { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_2 : 53; + __reserved_1 : 53; __u32 wakeup_events; /* wakeup every n events */ - __u32 __reserved_3; + __u32 __reserved_2; - __u64 __reserved_4; + __u64 __reserved_3; }; /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6c84ad8bd7..418d90f5eff 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( - const struct perf_counter_attr __user *attr_uptr, + struct perf_counter_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 663bbe01505..29b685f551a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3584,6 +3584,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, case PERF_TYPE_TRACEPOINT: pmu = tp_perf_counter_init(counter); break; + + default: + break; } done: err = 0; @@ -3610,6 +3613,85 @@ done: return counter; } +static int perf_copy_attr(struct perf_counter_attr __user *uattr, + struct perf_counter_attr *attr) +{ + int ret; + u32 size; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0. 
+ */ + if (size > sizeof(*attr)) { + unsigned long val; + unsigned long __user *addr; + unsigned long __user *end; + + addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), + sizeof(unsigned long)); + end = PTR_ALIGN((void __user *)uattr + size, + sizeof(unsigned long)); + + for (; addr < end; addr += sizeof(unsigned long)) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -3619,7 +3701,7 @@ done: * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_attr __user *, attr_uptr, + struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; @@ -3635,8 +3717,9 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) - return -EFAULT; + ret = perf_copy_attr(attr_uptr, &attr); + if (ret) + return ret; if (!attr.exclude_kernel) { if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index af0a5046d74..87a1aca4a42 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -53,11 +53,12 @@ static inline unsigned long long rdclock(void) _min1 < _min2 ? 
_min1 : _min2; }) static inline int -sys_perf_counter_open(struct perf_counter_attr *attr_uptr, +sys_perf_counter_open(struct perf_counter_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { - return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu, + attr->size = sizeof(*attr); + return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags); } -- cgit v1.2.3-70-g09d2 From cc6f26774136b7f5307abcd3887f08360c9b7554 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 16 Jun 2009 15:33:49 -0700 Subject: syscalls.h: remove duplicated declarations for sys_pipe2 sys_pipe2 is declared twice in include/linux/syscalls.h. Signed-off-by: Masatake YAMATO Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/syscalls.h') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 418d90f5eff..fa4242cdade 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -434,6 +434,7 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg); asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg); #endif +asmlinkage long sys_pipe(int __user *fildes); asmlinkage long sys_pipe2(int __user *fildes, int flags); asmlinkage long sys_dup(unsigned int fildes); asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); @@ -751,8 +752,6 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, struct timespec __user *, const sigset_t __user *, size_t); -asmlinkage long sys_pipe2(int __user *, int); -asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v1.2.3-70-g09d2