From 627ad9fd0733f0a31a266ff98a4a933eee710f0b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Jun 2009 23:21:41 -0400 Subject: ext4: Fix type warning on 64-bit platforms in tracing events header Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index acf4cc9cd36..b456fb0a3c5 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -34,7 +34,8 @@ TRACE_EVENT(ext4_free_inode, TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, - __entry->uid, __entry->gid, __entry->blocks) + __entry->uid, __entry->gid, + (unsigned long long) __entry->blocks) ); TRACE_EVENT(ext4_request_inode, -- cgit v1.2.3-70-g09d2 From 43ce1d23b43330634507a049b55c36e91d27282e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 14 Jun 2009 17:58:45 -0400 Subject: ext4: Fix mmap/truncate race when blocksize < pagesize && !nodellaoc This patch fixes the mmap/truncate race that was fixed for delayed allocation by merging ext4_{journalled,normal,da}_writepage() into ext4_writepage(). Signed-off-by: Aneesh Kumar K.V Acked-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 234 +++++++++++--------------------------------- include/trace/events/ext4.h | 45 +-------- 2 files changed, 58 insertions(+), 221 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1275f34589c..97c48b5b057 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,6 +47,10 @@ #define MPAGE_DA_EXTENT_TAIL 0x01 +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc, + unsigned int len); + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -2392,7 +2396,7 @@ static int __mpage_da_writepage(struct page *page, * We need to try to allocate * unmapped blocks in the same page. * Otherwise we won't make progress - * with the page in ext4_da_writepage + * with the page in ext4_writepage */ if (ext4_bh_delay_or_unwritten(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, @@ -2519,13 +2523,47 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, } /* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * * This function can get called via... * - ext4_da_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via pdflush (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) + * + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other bufer_heads would be unmapped but dirty(dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. + * + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. + * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. */ -static int ext4_da_writepage(struct page *page, +static int ext4_writepage(struct page *page, struct writeback_control *wbc) { int ret = 0; @@ -2534,7 +2572,7 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - trace_ext4_da_writepage(inode, page); + trace_ext4_writepage(inode, page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2596,6 +2634,15 @@ static int ext4_da_writepage(struct page *page, block_commit_write(page, 0, len); } + if (PageChecked(page) && ext4_should_journal_data(inode)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + return __ext4_journalled_writepage(page, wbc, len); + } + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ret = nobh_writepage(page, noalloc_get_block_write, wbc); else @@ -3135,112 +3182,10 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), noone guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * In all journaling modes block_write_full_page() will start the I/O. - * - * Problem: - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * Similar for: - * - * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext4_get_block(). We will deadlock on various things like - * lock_journal and i_data_sem - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * - * We don't honour synchronous mounts for writepage(). That would be - * disastrous. Any write() or metadata operation will sync the fs for - * us. - * - */ -static int __ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - - if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, noalloc_get_block_write, wbc); - else - return block_write_full_page(page, noalloc_get_block_write, - wbc); -} - -static int ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_ext4_normal_writepage(inode, page); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_delay_or_unwritten)); - } - - if (!ext4_journal_current_handle()) - return __ext4_normal_writepage(page, wbc); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc, + unsigned int len) { - loff_t size; - unsigned int len; struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct buffer_head *page_bufs; @@ -3248,16 +3193,8 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0; int err; - size = i_size_read(inode); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - ret = block_prepare_write(page, 0, len, noalloc_get_block_write); - if (ret != 0) - goto out_unlock; - page_bufs = page_buffers(page); + BUG_ON(!page_bufs); walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ @@ -3282,67 +3219,10 @@ static int __ext4_journalled_writepage(struct page *page, walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - goto out; - -out_unlock: - unlock_page(page); out: return ret; } -static int ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_ext4_journalled_writepage(inode, page); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_delay_or_unwritten)); - } - - if (ext4_journal_current_handle()) - goto no_write; - - if (PageChecked(page)) { - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc); - } else { - /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. - */ - return block_write_full_page(page, noalloc_get_block_write, - wbc); - } -no_write: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - static int ext4_readpage(struct file *file, struct page *page) { return mpage_readpage(page, ext4_get_block); @@ -3489,7 +3369,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -3504,7 +3384,7 @@ static const struct address_space_operations ext4_ordered_aops = { static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -3519,7 +3399,7 @@ static const struct address_space_operations ext4_writeback_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3533,7 +3413,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_da_writepage, + .writepage = ext4_writepage, .writepages = ext4_da_writepages, .sync_page = block_sync_page, .write_begin = ext4_da_write_begin, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b456fb0a3c5..dfbc9b0edc8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -190,7 +190,7 @@ TRACE_EVENT(ext4_journalled_write_end, __entry->copied) ); -TRACE_EVENT(ext4_da_writepage, +TRACE_EVENT(ext4_writepage, TP_PROTO(struct inode *inode, struct page *page), TP_ARGS(inode, page), @@ -342,49 +342,6 @@ TRACE_EVENT(ext4_da_write_end, __entry->copied) ); -TRACE_EVENT(ext4_normal_writepage, - TP_PROTO(struct inode *inode, struct page *page), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( pgoff_t, index ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->index = page->index; - ), - - TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) -); - -TRACE_EVENT(ext4_journalled_writepage, - TP_PROTO(struct inode *inode, struct page *page), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( pgoff_t, index ) - - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->index = page->index; - ), - - TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) -); - TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), -- cgit v1.2.3-70-g09d2 From d0b6e04a4cd8360e3c9c419f7c30a3081a0c142a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 13 Jul 2009 10:33:21 +0800 Subject: tracing/events: Move TRACE_SYSTEM outside of include guard If TRACE_INCLDUE_FILE is defined, will be included and compiled, otherwise it will be So TRACE_SYSTEM should be defined outside of #if proctection, just like TRACE_INCLUDE_FILE. Imaging this scenario: #include -> TRACE_SYSTEM == foo ... #include -> TRACE_SYSTEM == bar ... #define CREATE_TRACE_POINTS #include -> TRACE_SYSTEM == bar !!! and then bar.h will be included and compiled. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A5A9CF1.2010007@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- fs/gfs2/trace_gfs2.h | 8 +++---- include/trace/events/block.h | 6 ++--- include/trace/events/ext4.h | 6 ++--- include/trace/events/irq.h | 6 ++--- include/trace/events/jbd2.h | 6 ++--- include/trace/events/kmem.h | 6 ++--- include/trace/events/lockdep.h | 6 ++--- include/trace/events/sched.h | 6 ++--- include/trace/events/skb.h | 6 ++--- include/trace/events/workqueue.h | 6 ++--- samples/trace_events/trace-events-sample.h | 37 ++++++++++++++++-------------- 11 files changed, 51 insertions(+), 48 deletions(-) (limited to 'include/trace') diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 98d6ef1c1dc..148d55c1417 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -1,12 +1,11 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 + #if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_GFS2_H #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM gfs2 -#define TRACE_INCLUDE_FILE trace_gfs2 - #include #include #include @@ -403,5 +402,6 @@ TRACE_EVENT(gfs2_block_alloc, /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_gfs2 #include diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d6b05f42dd4..9a74b468a22 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -1,3 +1,6 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM block + #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLOCK_H @@ -5,9 +8,6 @@ #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM block - TRACE_EVENT(block_rq_abort, TP_PROTO(struct request_queue *q, struct request *rq), diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index acf4cc9cd36..8eb82cc8d14 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1,9 +1,9 @@ -#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_EXT4_H - #undef TRACE_SYSTEM #define TRACE_SYSTEM ext4 +#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EXT4_H + #include #include "../../../fs/ext4/ext4.h" #include "../../../fs/ext4/mballoc.h" diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index b0c7ede55eb..1cb0c3aa11e 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + #if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM irq - #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq } #define show_softirq_name(val) \ __print_symbolic(val, \ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 845b0b4b48f..10813fa0c8d 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd2 + #if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_JBD2_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM jbd2 - TRACE_EVENT(jbd2_checkpoint, TP_PROTO(journal_t *journal, int result), diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 9baba50d651..1493c541f9c 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem + #if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kmem - /* * The order of these masks is important. Matching masks will be seen * first and the left over flags will end up showing by themselves. diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h index 0e956c9dfd7..bcf1d209a00 100644 --- a/include/trace/events/lockdep.h +++ b/include/trace/events/lockdep.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lockdep + #if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_LOCKDEP_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM lockdep - #ifdef CONFIG_LOCKDEP TRACE_EVENT(lock_acquire, diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 24ab5bcff7b..8949bb7eb08 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched + #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM sched - /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 1e8fabb57c0..e499863b966 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb + #if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SKB_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM skb - /* * Tracepoint for free an sk_buff: */ diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 035f1bff288..fcfd9a1e4b9 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -1,3 +1,6 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM workqueue + #if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WORKQUEUE_H @@ -5,9 +8,6 @@ #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM workqueue - TRACE_EVENT(workqueue_insertion, TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 9977a756fb3..f24ae370e51 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -1,20 +1,3 @@ -/* - * Notice that this file is not protected like a normal header. - * We also must allow for rereading of this file. The - * - * || defined(TRACE_HEADER_MULTI_READ) - * - * serves this purpose. - */ -#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_EVENT_SAMPLE_H - -/* - * All trace headers should include tracepoint.h, until we finally - * make it into a standard header. - */ -#include - /* * If TRACE_SYSTEM is defined, that will be the directory created * in the ftrace directory under /debugfs/tracing/events/ @@ -34,10 +17,30 @@ * #define TRACE_INCLUDE_FILE trace-events-sample * * As we do an the bottom of this file. + * + * Notice that TRACE_SYSTEM should be defined outside of #if + * protection, just like TRACE_INCLUDE_FILE. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sample +/* + * Notice that this file is not protected like a normal header. + * We also must allow for rereading of this file. The + * + * || defined(TRACE_HEADER_MULTI_READ) + * + * serves this purpose. + */ +#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENT_SAMPLE_H + +/* + * All trace headers should include tracepoint.h, until we finally + * make it into a standard header. + */ +#include + /* * The TRACE_EVENT macro is broken up into 5 parts. * -- cgit v1.2.3-70-g09d2 From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 17:34:57 +0200 Subject: perf_counter, ftrace: Fix perf_counter integration Adds possible second part to the assign argument of TP_EVENT(). TP_perf_assign( __perf_count(foo); __perf_addr(bar); ) Which, when specified make the swcounter increment with @foo instead of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address associated with the event) when this triggers a counter overflow. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 110 ++++++++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 6 +-- 2 files changed, 88 insertions(+), 28 deletions(-) (limited to 'include/trace') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1867553c61e..fec71f8dbc4 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -144,6 +144,9 @@ #undef TP_fast_assign #define TP_fast_assign(args...) args +#undef TP_perf_assign +#define TP_perf_assign(args...) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ @@ -345,6 +348,88 @@ static inline int ftrace_get_offsets_##call( \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_PROFILE + +/* + * Generate the functions needed for tracepoint perf_counter support. + * + * static void ftrace_profile_(proto) + * { + * extern void perf_tpcounter_event(int, u64, u64); + * u64 __addr = 0, __count = 1; + * + * <-- here we expand the TP_perf_assign() macro + * + * perf_tpcounter_event(event_.id, __addr, __count); + * } + * + * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) + * { + * int ret = 0; + * + * if (!atomic_inc_return(&event_call->profile_count)) + * ret = register_trace_(ftrace_profile_); + * + * return ret; + * } + * + * static void ftrace_profile_disable_(struct ftrace_event_call *event_call) + * { + * if (atomic_add_negative(-1, &event->call->profile_count)) + * unregister_trace_(ftrace_profile_); + * } + * + */ + +#undef TP_fast_assign +#define TP_fast_assign(args...) + +#undef TP_perf_assign +#define TP_perf_assign(args...) args + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int, u64, u64); \ + u64 __addr = 0, __count = 1; \ + { assign; } \ + perf_tpcounter_event(event_##call.id, __addr, __count); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&event_call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ +{ \ + if (atomic_add_negative(-1, &event_call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TP_perf_assign +#define TP_perf_assign(args...) + +#endif + /* * Stage 4 of the trace events. * @@ -447,28 +532,6 @@ static inline int ftrace_get_offsets_##call( \ #define TP_FMT(fmt, args...) fmt "\n", ##args #ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&event_call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ -{ \ - if (atomic_add_negative(-1, &event_call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} #define _TRACE_PROFILE_INIT(call) \ .profile_count = ATOMIC_INIT(-1), \ @@ -476,7 +539,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ .profile_disable = ftrace_profile_disable_##call, #else -#define _TRACE_PROFILE(call, proto, args) #define _TRACE_PROFILE_INIT(call) #endif @@ -502,7 +564,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ @@ -586,6 +647,5 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 673c1aaf733..52eb4b68d34 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count) { struct perf_sample_data data = { .regs = get_irq_regs(), - .addr = 0, + .addr = addr, }; if (!data.regs) data.regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.2.3-70-g09d2 From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD which requests ftrace events record sampling. In this case if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perfcounter event buffer, as a sample. Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... . 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some costs however if someone wants no such sampling to occur, and needs to be fixed in the future. For that we need to have an instant access to the perf counter attribute. This is a matter of a flag to add in the struct ftrace_event. - Take care of the events recursivity! Don't ever try to record a lock event for example, it seems some locking is used in the profiling fast path and lead to a tracing recursivity. That will be fixed using raw spinlock or recursivity protection. - [...] - Profit! :-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- include/linux/perf_counter.h | 9 ++- include/trace/ftrace.h | 130 ++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 18 +++++- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 4 -- tools/perf/builtin-record.c | 1 + 7 files changed, 126 insertions(+), 41 deletions(-) (limited to 'include/trace') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d7cd193c227..a81170de7f6 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -89,7 +89,9 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; - +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e604e6ef72d..a67dd5c5b6d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,8 +121,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_TP_RECORD = 1U << 10, - PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; /* @@ -413,6 +414,11 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; +struct perf_tracepoint_record { + int size; + char *record; +}; + struct task_struct; /** @@ -681,6 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; + void *private; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fec71f8dbc4..7fb16d90e7b 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call( \ /* * Generate the functions needed for tracepoint perf_counter support. * - * static void ftrace_profile_(proto) - * { - * extern void perf_tpcounter_event(int, u64, u64); - * u64 __addr = 0, __count = 1; - * - * <-- here we expand the TP_perf_assign() macro - * - * perf_tpcounter_event(event_.id, __addr, __count); - * } + * NOTE: The insertion profile callback (ftrace_profile_) is defined later * * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) * { @@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call( \ * */ -#undef TP_fast_assign -#define TP_fast_assign(args...) - -#undef TP_perf_assign -#define TP_perf_assign(args...) args - -#undef __perf_addr -#define __perf_addr(a) __addr = (a) - -#undef __perf_count -#define __perf_count(c) __count = (c) - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int, u64, u64); \ - u64 __addr = 0, __count = 1; \ - { assign; } \ - perf_tpcounter_event(event_##call.id, __addr, __count); \ -} \ +static void ftrace_profile_##call(proto); \ \ static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ @@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - #endif /* @@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Define the insertion callback to profile events + * + * The job is very similar to ftrace_raw_event_ except that we don't + * insert in the ring buffer but in a perf counter. + * + * static void ftrace_profile_(proto) + * { + * struct ftrace_data_offsets_ __maybe_unused __data_offsets; + * struct ftrace_event_call *event_call = &event_; + * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * struct ftrace_raw_##call *entry; + * u64 __addr = 0, __count = 1; + * unsigned long irq_flags; + * int __entry_size; + * int __data_size; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * __data_size = ftrace_get_offsets_(&__data_offsets, args); + * __entry_size = __data_size + sizeof(*entry); + * + * do { + * char raw_data[__entry_size]; <- allocate our sample in the stack + * struct trace_entry *ent; + * + * entry = (struct ftrace_raw_ *)raw_data; + * ent = &entry->ent; + * tracing_generic_entry_update(ent, irq_flags, pc); + * ent->type = event_call->id; + * + * <- do some jobs with dynamic arrays + * + * <- affect our values + * + * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * __entry_size); <- submit them to perf counter + * } while (0); + * + * } + */ + +#ifdef CONFIG_EVENT_PROFILE + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static void ftrace_profile_##call(proto) \ +{ \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + struct ftrace_event_call *event_call = &event_##call; \ + extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + struct ftrace_raw_##call *entry; \ + u64 __addr = 0, __count = 1; \ + unsigned long irq_flags; \ + int __entry_size; \ + int __data_size; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ + __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + \ + do { \ + char raw_data[__entry_size]; \ + struct trace_entry *ent; \ + \ + entry = (struct ftrace_raw_##call *)raw_data; \ + ent = &entry->ent; \ + tracing_generic_entry_update(ent, irq_flags, pc); \ + ent->type = event_call->id; \ + \ + tstruct \ + \ + { assign; } \ + \ + perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + __entry_size); \ + } while (0); \ + \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_EVENT_PROFILE */ + #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52eb4b68d34..868102172aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; + struct perf_tracepoint_record *tp; int callchain_size = 0; u64 time; struct { @@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } + if (sample_type & PERF_SAMPLE_TP_RECORD) { + tp = data->private; + header.size += tp->size; + } + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } + if (sample_type & PERF_SAMPLE_TP_RECORD) + perf_output_copy(&handle, tp->record, tp->size); + perf_output_end(&handle); } @@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { + struct perf_tracepoint_record tp = { + .size = entry_size, + .record = record, + }; + struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, + .private = &tp, }; if (!data.regs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8930e39b9d8..c22b40f8f57 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, int type, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc78..8b9f4f6e955 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); - void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6da09928130..90c98082af1 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid) if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + attr->mmap = track; attr->comm = track; attr->inherit = (cpu < 0) && inherit; -- cgit v1.2.3-70-g09d2