From 7699ad35ed06044c4fc1be162553880f98658616 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 15 Jun 2009 01:10:18 -0400 Subject: mtd: let include/linux/mtd/partitions.h stand on its own When declaring static MTD partitions in board specific code, only including <linux/mtd/partitions.h> should suffice without gcc nagging us with: In file included from arch/arm/mach-kirkwood/sheevaplug-setup.c:14: include/linux/mtd/partitions.h:50: warning: 'struct mtd_info' declared inside parameter list include/linux/mtd/partitions.h:50: warning: its scope is only this definition or declaration, which is probably not what you want include/linux/mtd/partitions.h:51: warning: 'struct mtd_info' declared inside parameter list include/linux/mtd/partitions.h:61: warning: 'struct mtd_info' declared inside parameter list include/linux/mtd/partitions.h:67: warning: 'struct mtd_info' declared inside parameter list Signed-off-by: Nicolas Pitre Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/partitions.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index af6dcb992bc..b70313d33ff 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -47,6 +47,8 @@ struct mtd_partition { #define MTDPART_SIZ_FULL (0) +struct mtd_info; + int add_mtd_partitions(struct mtd_info *, const struct mtd_partition *, int); int del_mtd_partitions(struct mtd_info *); -- cgit v1.2.3-70-g09d2 From 6afc4fdb3e94ba60cd566cb878b60c6c01979277 Mon Sep 17 00:00:00 2001 From: Saeed Bishara Date: Tue, 28 Jul 2009 04:56:43 -0700 Subject: mtd: fix the conversion from dev to mtd_info This patch fixes a bug in the conversion from dev to mtd_info by using the dev's drvdata. The previous code used container_of(dev, struct mtd_info, dev), which does not work for the mtdXro devices, as they are created without being contained inside an mtd_info structure. Signed-off-by: Saeed Bishara Signed-off-by: David Woodhouse --- drivers/mtd/mtdcore.c | 7 ++++--- include/linux/mtd/mtd.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index fac54a3fa3f..00ebf7af746 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -65,8 +65,8 @@ static void mtd_release(struct device *dev) static int mtd_cls_suspend(struct device *dev, pm_message_t state) { struct mtd_info *mtd = dev_to_mtd(dev); - - if (mtd->suspend) + + if (mtd && mtd->suspend) return mtd->suspend(mtd); else return 0; @@ -76,7 +76,7 @@ static int mtd_cls_resume(struct device *dev) { struct mtd_info *mtd = dev_to_mtd(dev); - if (mtd->resume) + if (mtd && mtd->resume) mtd->resume(mtd); return 0; } @@ -298,6 +298,7 @@ int add_mtd_device(struct mtd_info *mtd) mtd->dev.class = &mtd_class; mtd->dev.devt = MTD_DEVT(i); dev_set_name(&mtd->dev, "mtd%d", i); + dev_set_drvdata(&mtd->dev, mtd); if (device_register(&mtd->dev) != 0) { mtd_table[i] = NULL; break; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 5675b63a063..0f32a9b6ff5 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -251,7 +251,7 @@ struct mtd_info { static inline struct mtd_info *dev_to_mtd(struct device *dev) { - return dev ? container_of(dev, struct mtd_info, dev) : NULL; + return dev ?
dev_get_drvdata(dev) : NULL; } static inline uint32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) -- cgit v1.2.3-70-g09d2 From 5116d8f6b977970ebefc1932c0f313163a6ec91f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Jul 2009 17:10:01 +0300 Subject: KVM: fix ack not being delivered when msi present kvm_notify_acked_irq does not check irq type, so it sometimes interprets an msi vector as an irq. As a result, ack notifiers are not called, which typically hangs the guest. The fix is to track and check irq type. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + virt/kvm/irq_comm.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 16713dc672e..3060bdc35ff 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -110,6 +110,7 @@ struct kvm_memory_slot { struct kvm_kernel_irq_routing_entry { u32 gsi; + u32 type; int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level); union { diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a8bd466d00c..ddc17f0e2f3 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -160,7 +160,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) unsigned gsi = pin; list_for_each_entry(e, &kvm->irq_routing, link) - if (e->irqchip.irqchip == irqchip && + if (e->type == KVM_IRQ_ROUTING_IRQCHIP && + e->irqchip.irqchip == irqchip && e->irqchip.pin == pin) { gsi = e->gsi; break; @@ -259,6 +260,7 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, int delta; e->gsi = ue->gsi; + e->type = ue->type; switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; -- cgit v1.2.3-70-g09d2 From af6af30c0fcd77e621638e53ef8b176bca8bd3b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Aug 2009 20:41:04 +0200 Subject: ftrace: Fix perf-tracepoint OOPS Not all tracepoints are created equal; specifically, the ftrace tracepoints are created with TRACE_EVENT_FORMAT(), which does not generate the needed bits to tie them into perf counters. For those events, don't create the 'id' file, and fail ->profile_enable when their ID is specified through other means.
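The failure mode is the classic unguarded function-pointer call. A minimal stand-alone sketch of the pattern and of the guard the patch below adds (hypothetical types and names, not the kernel's actual registration code):

	#include <stdio.h>
	#include <stddef.h>

	struct event {
		int id;
		int (*profile_enable)(struct event *);	/* left NULL for TRACE_EVENT_FORMAT()-style events */
	};

	static int enable_ok(struct event *e)
	{
		printf("profiling enabled for id %d\n", e->id);
		return 0;
	}

	/* The fix in a nutshell: match on id *and* capability; never call a NULL hook. */
	static int profile_enable(struct event *events, size_t n, int id)
	{
		for (size_t i = 0; i < n; i++)
			if (events[i].id == id && events[i].profile_enable)
				return events[i].profile_enable(&events[i]);
		return -1;	/* no profile-capable event with this id */
	}

	int main(void)
	{
		struct event ev[] = { { 1, NULL }, { 2, enable_ok } };
		profile_enable(ev, 2, 1);	/* now rejected instead of dereferencing NULL */
		profile_enable(ev, 2, 2);	/* prints the message */
		return 0;
	}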
Reported-by: Chris Mason Signed-off-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1249497664.5890.4.camel@laptop> [ v2: fix build error in the !CONFIG_EVENT_PROFILE case ] Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 8 +++----- kernel/trace/trace_event_profile.c | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5c093ffc655..d7cd193c227 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -119,11 +119,9 @@ struct ftrace_event_call { void *filter; void *mod; -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); }; #define MAX_FILTER_PRED 32 diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 5b5895afecf..11ba5bb4ed0 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { + if (event->id == event_id && event->profile_enable) { ret = event->profile_enable(event); break; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 23d2972b22d..e75276a49cf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, entry = trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id) + if (call->id && call->profile_enable) entry = trace_create_file("id", 0444, call->dir, call, id); -- cgit v1.2.3-70-g09d2 From d82f1c35348cebe2fb2d4a4d31ce0ab0769e3d93 Mon Sep 17 00:00:00 2001 From: Eric Miao Date: Wed, 5 Aug 2009 01:24:41 -0700 Subject: Input: matrix_keypad - make matrix keymap size dynamic Remove the assumption on the shift and size of rows/columns from the matrix_keypad driver.
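The key change is replacing the hard-coded (row << 4) + col with a shift derived from the actual column count. A small stand-alone sketch of the arithmetic (get_count_order() reimplemented here purely for illustration):

	#include <stdio.h>

	#define MATRIX_SCAN_CODE(row, col, row_shift) (((row) << (row_shift)) + (col))

	/* Illustration of get_count_order(): smallest shift with (1 << shift) >= n. */
	static unsigned int count_order(unsigned int n)
	{
		unsigned int order = 0;
		while ((1u << order) < n)
			order++;
		return order;
	}

	int main(void)
	{
		unsigned int cols = 5;				/* e.g. a 7x5 keypad */
		unsigned int row_shift = count_order(cols);	/* 3, not the fixed 4 */

		/* keymap size: num_rows << row_shift = 7 << 3 = 56 entries, not 7 << 4 = 112 */
		printf("row_shift=%u size=%u code(2,4)=%u\n",
		       row_shift, 7u << row_shift, MATRIX_SCAN_CODE(2, 4, row_shift));
		return 0;
	}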
Signed-off-by: Eric Miao Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/matrix_keypad.c | 18 +++++++++--------- include/linux/input/matrix_keypad.h | 13 +++++++------ 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c index e9b2e7cb05b..541b981ff07 100644 --- a/drivers/input/keyboard/matrix_keypad.c +++ b/drivers/input/keyboard/matrix_keypad.c @@ -27,6 +27,7 @@ struct matrix_keypad { const struct matrix_keypad_platform_data *pdata; struct input_dev *input_dev; unsigned short *keycodes; + unsigned int row_shift; uint32_t last_key_state[MATRIX_MAX_COLS]; struct delayed_work work; @@ -136,7 +137,7 @@ static void matrix_keypad_scan(struct work_struct *work) if ((bits_changed & (1 << row)) == 0) continue; - code = (row << 4) + col; + code = MATRIX_SCAN_CODE(row, col, keypad->row_shift); input_event(input_dev, EV_MSC, MSC_SCAN, code); input_report_key(input_dev, keypad->keycodes[code], @@ -317,6 +318,7 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) struct matrix_keypad *keypad; struct input_dev *input_dev; unsigned short *keycodes; + unsigned int row_shift; int i; int err; @@ -332,14 +334,11 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) return -EINVAL; } - if (!keymap_data->max_keymap_size) { - dev_err(&pdev->dev, "invalid keymap data supplied\n"); - return -EINVAL; - } + row_shift = get_count_order(pdata->num_col_gpios); keypad = kzalloc(sizeof(struct matrix_keypad), GFP_KERNEL); - keycodes = kzalloc(keymap_data->max_keymap_size * - sizeof(keypad->keycodes), + keycodes = kzalloc((pdata->num_row_gpios << row_shift) * + sizeof(*keycodes), GFP_KERNEL); input_dev = input_allocate_device(); if (!keypad || !keycodes || !input_dev) { @@ -350,6 +349,7 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) keypad->input_dev = input_dev; keypad->pdata = pdata; keypad->keycodes = keycodes; + keypad->row_shift = row_shift; keypad->stopped = true; INIT_DELAYED_WORK(&keypad->work, matrix_keypad_scan); spin_lock_init(&keypad->lock); @@ -363,7 +363,7 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) input_dev->keycode = keycodes; input_dev->keycodesize = sizeof(*keycodes); - input_dev->keycodemax = keymap_data->max_keymap_size; + input_dev->keycodemax = pdata->num_row_gpios << keypad->row_shift; for (i = 0; i < keymap_data->keymap_size; i++) { unsigned int key = keymap_data->keymap[i]; @@ -371,7 +371,7 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) unsigned int col = KEY_COL(key); unsigned short code = KEY_VAL(key); - keycodes[(row << 4) + col] = code; + keycodes[MATRIX_SCAN_CODE(row, col, row_shift)] = code; __set_bit(code, input_dev->keybit); } __clear_bit(KEY_RESERVED, input_dev->keybit); diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h index 7964516c695..15d5903af2d 100644 --- a/include/linux/input/matrix_keypad.h +++ b/include/linux/input/matrix_keypad.h @@ -15,12 +15,13 @@ #define KEY_COL(k) (((k) >> 16) & 0xff) #define KEY_VAL(k) ((k) & 0xffff) +#define MATRIX_SCAN_CODE(row, col, row_shift) (((row) << (row_shift)) + (col)) + /** * struct matrix_keymap_data - keymap for matrix keyboards * @keymap: pointer to array of uint32 values encoded with KEY() macro * representing keymap * @keymap_size: number of entries (initialized) in this keymap - * @max_keymap_size: maximum size of keymap supported by the device * * 
This structure is supposed to be used by platform code to supply * keymaps to drivers that implement matrix-like keypads/keyboards. @@ -28,14 +29,13 @@ struct matrix_keymap_data { const uint32_t *keymap; unsigned int keymap_size; - unsigned int max_keymap_size; }; /** * struct matrix_keypad_platform_data - platform-dependent keypad data * @keymap_data: pointer to &matrix_keymap_data - * @row_gpios: array of gpio numbers reporesenting rows - * @col_gpios: array of gpio numbers reporesenting colums + * @row_gpios: pointer to array of gpio numbers representing rows + * @col_gpios: pointer to array of gpio numbers reporesenting colums * @num_row_gpios: actual number of row gpios used by device * @num_col_gpios: actual number of col gpios used by device * @col_scan_delay_us: delay, measured in microseconds, that is @@ -48,8 +48,9 @@ struct matrix_keymap_data { struct matrix_keypad_platform_data { const struct matrix_keymap_data *keymap_data; - unsigned int row_gpios[MATRIX_MAX_ROWS]; - unsigned int col_gpios[MATRIX_MAX_COLS]; + const unsigned int *row_gpios; + const unsigned int *col_gpios; + unsigned int num_row_gpios; unsigned int num_col_gpios; -- cgit v1.2.3-70-g09d2 From 54e346215e4fe2ca8c94c54e546cc61902060510 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Aug 2009 14:38:25 -0300 Subject: vfs: fix inode_init_always calling convention Currently inode_init_always calls into ->destroy_inode if the additional initialization fails. That's not only counter-intuitive, because inode_init_always did not allocate the inode structure, but in the case of XFS it's actively harmful, as ->destroy_inode might delete the inode from a radix-tree that has never been added. This in turn might end up deleting the inode for the same inum that has been instantiated by another process and cause lots of subtle problems. Also, in the case of re-initializing a reclaimable inode in XFS, it would free an inode we still want to keep alive. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen --- fs/inode.c | 30 +++++++++++++++++------------- fs/xfs/xfs_iget.c | 17 +++++------------ include/linux/fs.h | 2 +- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 901bad1e5f1..af2c05235cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode) * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation.
*/ -struct inode *inode_init_always(struct super_block *sb, struct inode *inode) +int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct address_space_operations empty_aops; static struct inode_operations empty_iops; static const struct file_operations empty_fops; - struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; @@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->dirtied_when = 0; if (security_inode_alloc(inode)) - goto out_free_inode; + goto out; /* allocate and initialize an i_integrity */ if (ima_inode_alloc(inode)) @@ -198,16 +197,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif - return inode; + return 0; out_free_security: security_inode_free(inode); -out_free_inode: - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; +out: + return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); @@ -220,9 +215,18 @@ static struct inode *alloc_inode(struct super_block *sb) else inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); - if (inode) - return inode_init_always(sb, inode); - return NULL; + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, inode); + return NULL; + } + + return inode; } void destroy_inode(struct inode *inode) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 5fcec6f020a..719c85b155f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -64,6 +64,10 @@ xfs_inode_alloc( ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); if (!ip) return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -105,17 +109,6 @@ xfs_inode_alloc( #ifdef XFS_DIR2_TRACE ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif - /* - * Now initialise the VFS inode. We do this after the xfs_inode - * initialisation as internal failures will result in ->destroy_inode - * being called and that will pass down through the reclaim path and - * free the XFS inode. This path requires the XFS inode to already be - * initialised. Hence if this call fails, the xfs_inode has already - * been freed and we should not reference it at all in the error - * handling. - */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) - return NULL; /* prevent anyone from using this yet */ VFS_I(ip)->i_state = I_NEW|I_LOCK; @@ -167,7 +160,7 @@ xfs_iget_cache_hit( * errors cleanly, then tag it so it can be set up correctly * later. 
*/ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { + if (inode_init_always(mp->m_super, VFS_I(ip))) { error = ENOMEM; goto out_error; } diff --git a/include/linux/fs.h b/include/linux/fs.h index a36ffa5a77a..0c3b5e58a98 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2137,7 +2137,7 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); -extern struct inode * inode_init_always(struct super_block *, struct inode *); +extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); extern void inode_add_to_lists(struct super_block *, struct inode *); extern void iput(struct inode *); -- cgit v1.2.3-70-g09d2 From 2e00c97e2c1d2ffc9e26252ca26b237678b0b772 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Aug 2009 14:38:29 -0300 Subject: vfs: add __destroy_inode When we want to tear down an inode that lost the add-to-the-cache race in XFS, we must not call into ->destroy_inode, because that would delete the inode that won the race from the inode cache radix tree. This patch provides the __destroy_inode helper needed to fix this; the actual fix will be in the next patch. As XFS was the only reason destroy_inode was exported, we shift the export to the new __destroy_inode. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen --- fs/inode.c | 10 +++++++--- include/linux/fs.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index af2c05235cc..ae7b67e4866 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -229,7 +229,7 @@ static struct inode *alloc_inode(struct super_block *sb) return inode; } -void destroy_inode(struct inode *inode) +void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); ima_inode_free(inode); @@ -241,13 +241,17 @@ void destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif +} +EXPORT_SYMBOL(__destroy_inode); + +void destroy_inode(struct inode *inode) +{ + __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else kmem_cache_free(inode_cachep, (inode)); } -EXPORT_SYMBOL(destroy_inode); - /* * These are initializations that only need to be done diff --git a/include/linux/fs.h b/include/linux/fs.h index 0c3b5e58a98..67888a9e065 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2164,6 +2164,7 @@ extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); +extern void __destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); -- cgit v1.2.3-70-g09d2 From 4bfc44958e499af9a73f62201543b3a1f617cfeb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 6 Aug 2009 15:07:33 -0700 Subject: mm: make set_mempolicy(MPOL_INTERLEAVE) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized like this: init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized like this: top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29, policy's mems_allowed was initialized like this: 1. update task->mems_allowed from its cpuset->mems_allowed. 2.
policy->mems_allowed = nodes_and(task->mems_allowed, user's mask) Task's mems_allowed is updated in reference to top_cpuset's one, and cpuset's mems_allowed is always aware of N_HIGH_MEMORY. In 2.6.30, after commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized like this: 1. policy->mems_allowed = nodes_and(task->mems_allowed, user's mask) Here, if the task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume the user executes a command such as # numactl --interleave=all ... policy->mems_allowed = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then policy's mems_allowed can include a possible node which has no pgdat. MPOL_INTERLEAVE just scans the nodemask of task->mems_allowed and accesses NODE_DATA(nid)->zonelist directly, even if NODE_DATA(nid)==NULL. So what we need is to make policy->mems_allowed aware of N_HIGH_MEMORY, and this patch does that. But to do so, an extra nodemask would end up on the stack. Because cpumask has the new CPUMASK_ALLOC() interface, I added the equivalent for nodemask. This patch preserves the old behavior, but I feel this fix itself is just a band-aid. A fundamental fix has to take care of memory hotplug, and that takes time. (task->mems_allowed should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY, and policy's nodemask should include only online nodes. In the old behavior this was guaranteed by frequent references into cpuset's code; now most of those are removed, and mempolicy has to check it by itself. To do the check, a few nodemask_t's are needed for calculating nodemasks. But the size of nodemask_t can be big, and it's not good to allocate them on the stack. cpumask_t already has CPUMASK_ALLOC/FREE as an easy way to get a scratch area; NODEMASK_ALLOC/FREE should exist as well. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro Signed-off-by: KAMEZAWA Hiroyuki Cc: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 28 ++++++++++++++++ mm/mempolicy.c | 84 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 86 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 829b94b156f..b359c4a9ec9 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -82,6 +82,12 @@ * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. + * + * NODEMASK_SCRATCH + * When doing above logical AND, OR, XOR, Remap operations the callers tend to + * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, + * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper + * for such situations. See below and CPUMASK_ALLOC also.
*/ #include @@ -473,4 +479,26 @@ static inline int num_node_state(enum node_states state) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +/* + * For nodemask scrach area.(See CPUMASK_ALLOC() in cpumask.h) + */ + +#if NODES_SHIFT > 8 /* nodemask_t > 64 bytes */ +#define NODEMASK_ALLOC(x, m) struct x *m = kmalloc(sizeof(*m), GFP_KERNEL) +#define NODEMASK_FREE(m) kfree(m) +#else +#define NODEMASK_ALLOC(x, m) struct x _m, *m = &_m +#define NODEMASK_FREE(m) +#endif + +/* A example struture for using NODEMASK_ALLOC, used in mempolicy. */ +struct nodemask_scratch { + nodemask_t mask1; + nodemask_t mask2; +}; + +#define NODEMASK_SCRATCH(x) NODEMASK_ALLOC(nodemask_scratch, x) +#define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) + + #endif /* __LINUX_NODEMASK_H */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4da63..7dd9d9f8069 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_semaphore for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) { - nodemask_t cpuset_context_nmask; int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; + /* Check N_HIGH_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); + nodes_and(nsc->mask2, *nodes, nsc->mask1); + if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else @@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) cpuset_current_mems_allowed; } - ret = mpol_ops[pol->mode].create(pol, - nodes ? &cpuset_context_nmask : NULL); + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } @@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, { struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + NODEMASK_SCRATCH(scratch); int ret; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); + if (!scratch) + return -ENOMEM; + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } /* * prevent changing our mempolicy while show_numa_maps() * is using it. 
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, if (mm) down_write(&mm->mmap_sem); task_lock(current); - ret = mpol_set_nodemask(new, nodes); + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); if (mm) up_write(&mm->mmap_sem); mpol_put(new); - return ret; + goto out; } old = current->mempolicy; current->mempolicy = new; @@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, up_write(&mm->mmap_sem); mpol_put(old); - return 0; + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; } /* @@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) return err; } - down_write(&mm->mmap_sem); - task_lock(current); - err = mpol_set_nodemask(new, nmask); - task_unlock(current); + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + } if (err) { - up_write(&mm->mmap_sem); mpol_put(new); return err; } @@ -1891,6 +1911,7 @@ restart: * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + if (!scratch) + return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) { mpol_put(mpol); /* drop our ref on sb mpol */ + NODEMASK_SCRATCH_FREE(scratch); return; /* no valid nodemask intersection */ } task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); mpol_put(mpol); /* drop our ref on sb mpol */ if (ret) { + NODEMASK_SCRATCH_FREE(scratch); mpol_put(new); return; } @@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ mpol_put(new); /* drop initial ref */ + NODEMASK_SCRATCH_FREE(scratch); } } @@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) err = 1; else { int ret; - - task_lock(current); - ret = mpol_set_nodemask(new, &nodes); - task_unlock(current); - if (ret) + NODEMASK_SCRATCH(scratch); + if (scratch) { + task_lock(current); + ret = mpol_set_nodemask(new, &nodes, scratch); + task_unlock(current); + } else + ret = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + if (ret) { err = 1; - else if (no_context) { + mpol_put(new); + } else if (no_context) { /* save for contextualization */ new->w.user_nodemask = nodes; } -- cgit v1.2.3-70-g09d2 From daeb6b6fbe27049f465c48a7d0ee5555c3b84064 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 6 Aug 2009 15:09:30 -0700 Subject: bzip2/lzma/gzip: fix comments describing decompressor API Fix and improve comments in decompress/generic.h that describe the decompressor API. 
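As the reworked generic.h comment below spells out, a decompress_fn can be driven in two modes. A sketch of the two call shapes, using gunzip as the example decompress_fn (my_fill, my_flush, and my_error are hypothetical callbacks; error handling and surrounding declarations elided):

	/* Callback shapes expected by a decompress_fn: */
	static int my_fill(void *buf, unsigned int size);	/* refill input, return bytes read */
	static int my_flush(void *buf, unsigned int size);	/* consume output, return bytes taken */
	static void my_error(char *msg);			/* report a fatal decode error */

	/* Mode 1: all input pre-read, output fully buffered; fill and flush unused. */
	ret = gunzip(inbuf, inlen, NULL, NULL, outbuf, &pos, my_error);

	/* Mode 2: fully streamed; the decompressor allocates its own buffers. */
	ret = gunzip(NULL, 0, my_fill, my_flush, NULL, NULL, my_error);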
Also remove an unused definition, and rename INBUF_LEN in lib/decompress_inflate.c to conform to bzip2/lzma naming. Signed-off-by: Phillip Lougher Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/generic.h | 32 +++++++++++++++++++------------- lib/decompress_inflate.c | 8 ++++---- 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/decompress/generic.h b/include/linux/decompress/generic.h index 6dfb856327b..0c7111a55a1 100644 --- a/include/linux/decompress/generic.h +++ b/include/linux/decompress/generic.h @@ -1,31 +1,37 @@ #ifndef DECOMPRESS_GENERIC_H #define DECOMPRESS_GENERIC_H -/* Minimal chunksize to be read. - *Bzip2 prefers at least 4096 - *Lzma prefers 0x10000 */ -#define COMPR_IOBUF_SIZE 4096 - typedef int (*decompress_fn) (unsigned char *inbuf, int len, int(*fill)(void*, unsigned int), - int(*writebb)(void*, unsigned int), - unsigned char *output, + int(*flush)(void*, unsigned int), + unsigned char *outbuf, int *posp, void(*error)(char *x)); /* inbuf - input buffer *len - len of pre-read data in inbuf - *fill - function to fill inbuf if empty - *writebb - function to write out outbug + *fill - function to fill inbuf when empty + *flush - function to write out outbuf + *outbuf - output buffer *posp - if non-null, input position (number of bytes read) will be * returned here * - *If len != 0, the inbuf is initialized (with as much data), and fill - *should not be called - *If len = 0, the inbuf is allocated, but empty. Its size is IOBUF_SIZE - *fill should be called (repeatedly...) to read data, at most IOBUF_SIZE + *If len != 0, inbuf should contain all the necessary input data, and fill + *should be NULL + *If len = 0, inbuf can be NULL, in which case the decompressor will allocate + *the input buffer. If inbuf != NULL it must be at least XXX_IOBUF_SIZE bytes. + *fill will be called (repeatedly...) to read data, at most XXX_IOBUF_SIZE + *bytes should be read per call. Replace XXX with the appropriate decompressor + *name, i.e. LZMA_IOBUF_SIZE. + * + *If flush = NULL, outbuf must be large enough to buffer all the expected + *output. If flush != NULL, the output buffer will be allocated by the + *decompressor (outbuf = NULL), and the flush function will be called to + *flush the output buffer at the appropriate time (decompressor and stream + *dependent). 
*/ + /* Utility routine to detect the decompression method */ decompress_fn decompress_method(const unsigned char *inbuf, int len, const char **name); diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index e36b296fc9f..bfe605ac534 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -25,7 +25,7 @@ #include #include -#define INBUF_LEN (16*1024) +#define GZIP_IOBUF_SIZE (16*1024) /* Included from initramfs et al code */ STATIC int INIT gunzip(unsigned char *buf, int len, @@ -55,7 +55,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, if (buf) zbuf = buf; else { - zbuf = malloc(INBUF_LEN); + zbuf = malloc(GZIP_IOBUF_SIZE); len = 0; } if (!zbuf) { @@ -77,7 +77,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, } if (len == 0) - len = fill(zbuf, INBUF_LEN); + len = fill(zbuf, GZIP_IOBUF_SIZE); /* verify the gzip header */ if (len < 10 || @@ -113,7 +113,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, while (rc == Z_OK) { if (strm->avail_in == 0) { /* TODO: handle case where both pos and fill are set */ - len = fill(zbuf, INBUF_LEN); + len = fill(zbuf, GZIP_IOBUF_SIZE); if (len < 0) { rc = -1; error("read error"); -- cgit v1.2.3-70-g09d2 From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel-side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD, which requests ftrace event record sampling. In this case, if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perf counter event buffer as a sample. The result, after setting the PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... . 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some cost, however, if someone wants no such sampling to occur; this needs to be fixed in the future. For that we need instant access to the perf counter attribute. This is a matter of adding a flag to struct ftrace_event. - Take care of event recursion! Don't ever try to record a lock event, for example; some locking is used in the profiling fast path and leads to tracing recursion. That will be fixed using raw spinlocks or recursion protection. - [...] - Profit!
:-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- include/linux/perf_counter.h | 9 ++- include/trace/ftrace.h | 130 ++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 18 +++++- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 4 -- tools/perf/builtin-record.c | 1 + 7 files changed, 126 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d7cd193c227..a81170de7f6 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -89,7 +89,9 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; - +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e604e6ef72d..a67dd5c5b6d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,8 +121,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_TP_RECORD = 1U << 10, - PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; /* @@ -413,6 +414,11 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; +struct perf_tracepoint_record { + int size; + char *record; +}; + struct task_struct; /** @@ -681,6 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; + void *private; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fec71f8dbc4..7fb16d90e7b 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call( \ /* * Generate the functions needed for tracepoint perf_counter support. * - * static void ftrace_profile_(proto) - * { - * extern void perf_tpcounter_event(int, u64, u64); - * u64 __addr = 0, __count = 1; - * - * <-- here we expand the TP_perf_assign() macro - * - * perf_tpcounter_event(event_.id, __addr, __count); - * } + * NOTE: The insertion profile callback (ftrace_profile_) is defined later * * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) * { @@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call( \ * */ -#undef TP_fast_assign -#define TP_fast_assign(args...) - -#undef TP_perf_assign -#define TP_perf_assign(args...) 
args - -#undef __perf_addr -#define __perf_addr(a) __addr = (a) - -#undef __perf_count -#define __perf_count(c) __count = (c) - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int, u64, u64); \ - u64 __addr = 0, __count = 1; \ - { assign; } \ - perf_tpcounter_event(event_##call.id, __addr, __count); \ -} \ +static void ftrace_profile_##call(proto); \ \ static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ @@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - #endif /* @@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Define the insertion callback to profile events + * + * The job is very similar to ftrace_raw_event_ except that we don't + * insert in the ring buffer but in a perf counter. + * + * static void ftrace_profile_(proto) + * { + * struct ftrace_data_offsets_ __maybe_unused __data_offsets; + * struct ftrace_event_call *event_call = &event_; + * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * struct ftrace_raw_##call *entry; + * u64 __addr = 0, __count = 1; + * unsigned long irq_flags; + * int __entry_size; + * int __data_size; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * __data_size = ftrace_get_offsets_(&__data_offsets, args); + * __entry_size = __data_size + sizeof(*entry); + * + * do { + * char raw_data[__entry_size]; <- allocate our sample in the stack + * struct trace_entry *ent; + * + * entry = (struct ftrace_raw_ *)raw_data; + * ent = &entry->ent; + * tracing_generic_entry_update(ent, irq_flags, pc); + * ent->type = event_call->id; + * + * <- do some jobs with dynamic arrays + * + * <- affect our values + * + * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * __entry_size); <- submit them to perf counter + * } while (0); + * + * } + */ + +#ifdef CONFIG_EVENT_PROFILE + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static void ftrace_profile_##call(proto) \ +{ \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + struct ftrace_event_call *event_call = &event_##call; \ + extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + struct ftrace_raw_##call *entry; \ + u64 __addr = 0, __count = 1; \ + unsigned long irq_flags; \ + int __entry_size; \ + int __data_size; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ + __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + \ + do { \ + char raw_data[__entry_size]; \ + struct trace_entry *ent; \ + \ + entry = (struct ftrace_raw_##call *)raw_data; \ + ent = &entry->ent; \ + tracing_generic_entry_update(ent, irq_flags, pc); \ + ent->type = event_call->id; \ + \ + tstruct \ + \ + { assign; } \ + \ + perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + __entry_size); \ + } while (0); \ + \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_EVENT_PROFILE */ + #undef _TRACE_PROFILE_INIT diff --git 
a/kernel/perf_counter.c b/kernel/perf_counter.c index 52eb4b68d34..868102172aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; + struct perf_tracepoint_record *tp; int callchain_size = 0; u64 time; struct { @@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } + if (sample_type & PERF_SAMPLE_TP_RECORD) { + tp = data->private; + header.size += tp->size; + } + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } + if (sample_type & PERF_SAMPLE_TP_RECORD) + perf_output_copy(&handle, tp->record, tp->size); + perf_output_end(&handle); } @@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { + struct perf_tracepoint_record tp = { + .size = entry_size, + .record = record, + }; + struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, + .private = &tp, }; if (!data.regs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8930e39b9d8..c22b40f8f57 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, int type, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc78..8b9f4f6e955 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); - void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6da09928130..90c98082af1 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid) if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + attr->mmap = track; attr->comm = track; attr->inherit = (cpu < 0) && inherit; -- cgit v1.2.3-70-g09d2 From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic, just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transports raw tracepoint samples into the perf ring buffer to be generalized and usable by any type of counter.
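What "usable by any type of counter" means in practice: any overflow path can attach opaque bytes exactly the way the tracepoint path does in the diff below. A sketch with a hypothetical payload (my_payload is not a kernel structure; only perf_raw_record and perf_sample_data come from the patch):

	struct my_payload {			/* hypothetical, for illustration only */
		u32 reason;
		u64 cookie;
	} payload = { .reason = 1, .cookie = 42 };

	struct perf_raw_record raw = {
		.size = sizeof(payload),
		.data = &payload,
	};

	struct perf_sample_data data = {
		.regs = get_irq_regs(),
		.addr = 0,
		.raw  = &raw,	/* copied out only if the counter asked for PERF_SAMPLE_RAW */
	};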
Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a67dd5c5b6d..2aabe43c1d0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,7 +121,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_TP_RECORD = 1U << 10, + PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; @@ -414,9 +414,9 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; -struct perf_tracepoint_record { - int size; - char *record; +struct perf_raw_record { + u32 size; + void *data; }; struct task_struct; @@ -687,7 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; - void *private; + struct perf_raw_record *raw; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 117622cb73a..00231054041 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp = NULL; + struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_TP_RECORD) { - tp = data->private; - if (tp) - header.size += tp->size; + if (sample_type & PERF_SAMPLE_RAW) { + raw = data->raw; + if (raw) + header.size += raw->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) - perf_output_copy(&handle, tp->record, tp->size); + if ((sample_type & PERF_SAMPLE_RAW) && raw) + perf_output_copy(&handle, raw->data, raw->size); perf_output_end(&handle); } @@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { - struct perf_tracepoint_record tp = { + struct perf_raw_record raw = { .size = entry_size, - .record = record, + .data = record, }; struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, - .private = &tp, + .raw = &raw, }; if (!data.regs) -- cgit v1.2.3-70-g09d2 From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data.
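The practical consequence of "unconditionally": a reader can always consume a u32 length prefix and skip that many bytes, with no out-of-band knowledge of whether raw data was attached. A sketch of the user-space side (assuming a cursor const unsigned char *p that points at the PERF_SAMPLE_RAW portion of a sample; needs <stdint.h> and <string.h>):

	uint32_t size;
	memcpy(&size, p, sizeof(size));		/* the length prefix is now always present */
	p += sizeof(size);
	const unsigned char *raw_data = p;	/* empty case: size == 4 and four zero bytes */
	p += size;				/* cursor lands on whatever follows the raw area */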
Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ include/trace/ftrace.h | 3 ++- kernel/perf_counter.c | 30 ++++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43c1d0..a9d823a93fe 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -369,6 +369,8 @@ enum perf_event_type { * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW * }; */ PERF_EVENT_SAMPLE = 9, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7fb16d90e7b..7167b9b97da 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -685,7 +685,8 @@ static void ftrace_profile_##call(proto) \ pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ - __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ + sizeof(u64)); \ \ do { \ char raw_data[__entry_size]; \ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d6294..5229d1666fa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } if (sample_type & PERF_SAMPLE_RAW) { - raw = data->raw; - if (raw) - header.size += raw->size; + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size += size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_RAW) && raw) - perf_output_copy(&handle, raw->data, raw->size); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); + } + } perf_output_end(&handle); } -- cgit v1.2.3-70-g09d2
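A worked instance of the ALIGN() arithmetic introduced in the ftrace.h hunk above, with illustrative sizes (the macro shown is the usual power-of-two round-up, matching the kernel's definition):

	#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

	/* say __data_size = 13 and sizeof(*entry) = 24; add the u32 size prefix: */
	/* ALIGN(13 + 24 + 4, 8) = ALIGN(41, 8) = 48, a clean multiple of sizeof(u64) */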