From 8df9675f8b498d0bfa1f0b5b06f56bf1ff366dd5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 08:50:38 -0400 Subject: ext4: Avoid races caused by on-line resizing and SMP memory reordering Ext4's on-line resizing adds a new block group and then, only at the last step adjusts s_groups_count. However, it's possible on SMP systems that another CPU could see the updated the s_group_count and not see the newly initialized data structures for the just-added block group. For this reason, it's important to insert a SMP read barrier after reading s_groups_count and before reading any (for example) the new block group descriptors allowed by the increased value of s_groups_count. Unfortunately, we rather blatently violate this locking protocol documented in fs/ext4/resize.c. Fortunately, (1) on-line resizes happen relatively rarely, and (2) it seems rare that the filesystem code will immediately try to use just-added block group before any memory ordering issues resolve themselves. So apparently problems here are relatively hard to hit, since ext3 has been vulnerable to the same issue for years with no one apparently complaining. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18e0a08a6b..55ba419ca00 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -316,7 +316,7 @@ error_return: static int find_group_dir(struct super_block *sb, struct inode *parent, ext4_group_t *best_group) { - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned int freei, avefreei; struct ext4_group_desc *desc, *best_desc = NULL; ext4_group_t group; @@ -353,7 +353,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); int flex_size = ext4_flex_bg_size(sbi); ext4_group_t best_flex = parent_fbg_group; int blocks_per_flex = sbi->s_blocks_per_group * flex_size; @@ -362,7 +362,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, ext4_group_t n_fbg_groups; ext4_group_t i; - n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + n_fbg_groups = (ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; find_close_to_parent: @@ -478,20 +478,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i, grp, g; + ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); + ngroups = real_ngroups; if (flex_size > 1) { - ngroups = (ngroups + flex_size - 1) >> + ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } @@ -543,7 +544,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, */ grp *= flex_size; for (i = 0; i < flex_size; i++) { - if (grp+i >= sbi->s_groups_count) + if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { @@ -583,7 +584,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, } fallback: - ngroups = sbi->s_groups_count; + ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; @@ -613,9 +614,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; - ext4_group_t i, last; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* @@ -799,11 +799,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; - ext4_group_t group = 0; + ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; - struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err = 0; @@ -818,15 +817,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); - es = sbi->s_es; if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); @@ -856,7 +854,7 @@ got_group: if (ret2 == -1) goto out; - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); @@ -917,7 +915,7 @@ repeat_in_this_group: * group descriptor metadata has not yet been updated. * So we just go onto the next blockgroup. */ - if (++group == sbi->s_groups_count) + if (++group == ngroups) group = 0; } err = -ENOSPC; @@ -1158,7 +1156,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; @@ -1168,7 +1166,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) desc_count = 0; bitmap_count = 0; gdp = NULL; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1190,7 +1188,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) return desc_count; #else desc_count = 0; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1205,9 +1203,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; -- cgit v1.2.3-70-g09d2 From bb23c20a851a5038b255a3c0d0aa56093c1da3f8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 19:44:44 -0400 Subject: ext4: Move fs/ext4/group.h into ext4.h Move the function prototypes in group.h into ext4.h so they are all defined in one place. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 1 - fs/ext4/ext4.h | 17 +++++++++++++++++ fs/ext4/group.h | 29 ----------------------------- fs/ext4/ialloc.c | 1 - fs/ext4/mballoc.h | 1 - fs/ext4/resize.c | 1 - fs/ext4/super.c | 1 - 7 files changed, 17 insertions(+), 34 deletions(-) delete mode 100644 fs/ext4/group.h (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a5ba039850c..92f557d957d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -19,7 +19,6 @@ #include #include "ext4.h" #include "ext4_jbd2.h" -#include "group.h" #include "mballoc.h" /* diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d9c5251d082..5973f3261b0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1270,6 +1270,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +#define ext4_free_blocks_after_init(sb, group, desc) \ + ext4_init_block_bitmap(sb, NULL, group, desc) /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -1294,6 +1302,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); +extern unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); /* mballoc.c */ extern long ext4_mb_stats; @@ -1417,6 +1430,10 @@ extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { diff --git a/fs/ext4/group.h b/fs/ext4/group.h deleted file mode 100644 index c2c0a8d06d0..00000000000 --- a/fs/ext4/group.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * linux/fs/ext4/group.h - * - * Copyright (C) 2007 Cluster File Systems, Inc - * - * Author: Andreas Dilger - */ - -#ifndef _LINUX_EXT4_GROUP_H -#define _LINUX_EXT4_GROUP_H - -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) -extern unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); -#endif /* _LINUX_EXT4_GROUP_H */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 55ba419ca00..916d05c881c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -27,7 +27,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" /* * ialloc.c contains the inodes allocation and deallocation routines diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dd9e6cd5f6c..75e34f69215 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -23,7 +23,6 @@ #include #include "ext4_jbd2.h" #include "ext4.h" -#include "group.h" /* * with AGGRESSIVE_CHECK allocator runs consistency checks over diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e8ded13b5cb..27eb289eea3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -15,7 +15,6 @@ #include #include "ext4_jbd2.h" -#include "group.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d79e1c428b4..7903f20c807 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,7 +46,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; -- cgit v1.2.3-70-g09d2 From 955ce5f5be67dfe0d1d096b543af33fe8a1ce3dd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 2 May 2009 20:35:09 -0400 Subject: ext4: Convert ext4_lock_group to use sb_bgl_lock We have sb_bgl_lock() and ext4_group_info.bb_state bit spinlock to protech group information. The later is only used within mballoc code. Consolidate them to use sb_bgl_lock(). This makes the mballoc.c code much simpler and also avoid confusion with two locks protecting same info. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 12 ++++----- fs/ext4/ext4.h | 26 +++++++------------ fs/ext4/ialloc.c | 29 ++++++++++----------- fs/ext4/mballoc.c | 78 +++++++++++++++++++------------------------------------ fs/ext4/super.c | 6 ++--- 5 files changed, 59 insertions(+), 92 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 92f557d957d..e2126d70dff 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, down_write(&grp->alloc_sem); for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit + i, bitmap_bh->b_data)) { ext4_error(sb, __func__, "bit already cleared for block %llu", @@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, blocks_freed++; } } - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5973f3261b0..149e02dc360 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -963,12 +963,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } - -static inline spinlock_t * -sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1568,33 +1562,31 @@ struct ext4_group_info { }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spin_lock(ext4_group_lock_ptr(sb, group)); } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); + spin_unlock(ext4_group_lock_ptr(sb, group)); } static inline int ext4_is_group_locked(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); + return spin_is_locked(ext4_group_lock_ptr(sb, group)); } /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 916d05c881c..82f7d1d7eae 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -122,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -246,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. */ - spin_lock(sb_bgl_lock(sbi, block_group)); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); - spin_unlock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), + bit, bitmap_bh->b_data); if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); @@ -260,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (fatal) goto error_return; if (gdp) { - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { @@ -276,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); @@ -707,10 +706,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, /* * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's sb_bgl_lock + * is uninit we need to take the groups's ext4_group_lock * and clear the uninit flag. The inode bitmap update * and group desc uninit flag clear should be done - * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * after holding ext4_group_lock so that ext4_read_inode_bitmap * doesn't race with the ext4_claim_inode */ static int ext4_claim_inode(struct super_block *sb, @@ -721,7 +720,7 @@ static int ext4_claim_inode(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; @@ -730,7 +729,7 @@ static int ext4_claim_inode(struct super_block *sb, ino++; if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); ext4_error(sb, __func__, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, @@ -779,7 +778,7 @@ static int ext4_claim_inode(struct super_block *sb, } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); return retval; } @@ -935,7 +934,7 @@ got: } free = 0; - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); /* recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { free = ext4_free_blocks_after_init(sb, group, gdp); @@ -944,7 +943,7 @@ got: gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); /* Don't need to dirty bitmap block if we didn't change it */ if (free) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index df75855ae6f..e76459cedcd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr) ext4_set_bit(bit, addr); } -static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_set_bit_atomic(lock, bit, addr); -} - static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } -static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_clear_bit_atomic(lock, bit, addr); -} - static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -803,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore) unlock_buffer(bh[i]); continue; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_lock_group(sb, first_group + i); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); set_bitmap_uptodate(bh[i]); set_buffer_uptodate(bh[i]); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); unlock_buffer(bh[i]); continue; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); if (buffer_uptodate(bh[i])) { /* * if not uninit if bh is uptodate, @@ -1080,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) return 0; } -static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1093,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_clear_bit_atomic(lock, cur, bm); - else - mb_clear_bit(cur, bm); + mb_clear_bit(cur, bm); cur++; } } -static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1114,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_set_bit_atomic(lock, cur, bm); - else - mb_set_bit(cur, bm); + mb_set_bit(cur, bm); cur++; } } @@ -1332,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; } - mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), - EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -2756,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) return 0; } -/* need to called with ext4 group lock (ext4_lock_group) */ +/* need to called with the ext4 group lock held */ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; @@ -2993,14 +2974,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. */ - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), - bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EAGAIN; goto out_err; } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; @@ -3010,9 +2994,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - mb_set_bits(NULL, bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, @@ -3022,7 +3004,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_blks_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); - spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative @@ -3455,7 +3438,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * the function goes through all block freed in the group * but not yet committed and marks them used in in-core bitmap. * buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with the ext4 group lock held */ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3469,9 +3452,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, entry->start_blk, - entry->count); + mb_set_bits(bitmap, entry->start_blk, entry->count); n = rb_next(n); } return; @@ -3480,7 +3461,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with ext4 group lock held */ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3512,8 +3493,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, start, len); + mb_set_bits(bitmap, start, len); preallocated += len; count++; } @@ -4856,29 +4836,25 @@ do_more: new_entry->group = block_group; new_entry->count = count; new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count); ext4_mb_free_metadata(handle, &e4b, new_entry); - ext4_unlock_group(sb, block_group); } else { - ext4_lock_group(sb, block_group); /* need to update group_info->bb_free and bitmap * with group lock held. generate_buddy look at * them with group lock_held */ - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); - ext4_unlock_group(sb, block_group); } - spin_lock(sb_bgl_lock(sbi, block_group)); ret = ext4_free_blks_count(sb, gdp) + count; ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, count); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 39223a52bc7..dc34ed3d132 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1784,18 +1784,18 @@ static int ext4_check_descriptors(struct super_block *sb) "(block %llu)!\n", i, inode_table); return 0; } - spin_lock(sb_bgl_lock(sbi, i)); + ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); return 0; } } - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } -- cgit v1.2.3-70-g09d2 From 88b6edd17c62b7d346d21f4087893ce7d4ef828a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 25 May 2009 11:50:39 -0400 Subject: ext4: Clean up calls to ext4_get_group_desc() If the caller isn't planning on modifying the block group descriptors, there's no need to pass in a pointer to a struct buffer_head. Nuking this saves a tiny amount of CPU time and stack space usage. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 3 +-- fs/ext4/super.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 82f7d1d7eae..3743bd849bc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -347,7 +347,6 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *desc; - struct buffer_head *bh; struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); @@ -402,7 +401,7 @@ find_close_to_parent: found_flexbg: for (i = best_flex * flex_size; i < ngroups && i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, &bh); + desc = ext4_get_group_desc(sb, i, NULL); if (ext4_free_inodes_count(sb, desc)) { *best_group = i; goto out; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eca6c057b11..91b98b58ccb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1660,7 +1660,6 @@ static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; - struct buffer_head *bh; ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; @@ -1693,7 +1692,7 @@ static int ext4_fill_flex_info(struct super_block *sb) } for (i = 0; i < sbi->s_groups_count; i++) { - gdp = ext4_get_group_desc(sb, i, &bh); + gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, -- cgit v1.2.3-70-g09d2 From 9bffad1ed2a003a355ed1b42424a0ae3575275ed Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 17 Jun 2009 11:48:11 -0400 Subject: ext4: convert instrumentation from markers to tracepoints Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 8 +- fs/ext4/ialloc.c | 15 +- fs/ext4/inode.c | 69 +---- fs/ext4/mballoc.c | 77 ++--- fs/ext4/mballoc.h | 1 - fs/ext4/super.c | 6 +- include/trace/events/ext4.h | 719 ++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 774 insertions(+), 121 deletions(-) create mode 100644 include/trace/events/ext4.h (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5afe4370840..83cf6415f59 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -28,10 +28,12 @@ #include #include #include -#include + #include "ext4.h" #include "ext4_jbd2.h" +#include + /* * akpm: A new design for ext4_sync_file(). * @@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); - trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", - inode->i_sb->s_id, datasync, inode->i_ino, - dentry->d_parent->d_inode->i_ino); + trace_ext4_sync_file(file, dentry, datasync); /* * data=writeback: diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3743bd849bc..7d502f3be91 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -23,11 +23,14 @@ #include #include #include + #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include + /* * ialloc.c contains the inodes allocation and deallocation routines */ @@ -208,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); - trace_mark(ext4_free_inode, - "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu", - sb->s_id, inode->i_ino, inode->i_mode, - (unsigned long) inode->i_uid, (unsigned long) inode->i_gid, - (unsigned long long) inode->i_blocks); + trace_ext4_free_inode(inode); /* * Note: we must free any quota before locking the superblock, @@ -815,8 +814,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) sb = dir->i_sb; ngroups = ext4_get_groups_count(sb); - trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, - dir->i_ino, mode); + trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -1047,8 +1045,7 @@ got: } ext4_debug("allocating inode %lu\n", inode->i_ino); - trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", - sb->s_id, inode->i_ino, dir->i_ino, mode); + trace_ext4_allocate_inode(inode, dir, mode); goto really_out; fail: ext4_std_error(sb, err); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 875db944b22..2418ad36eab 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,11 +37,14 @@ #include #include #include + #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "ext4_extents.h" +#include + #define MPAGE_DA_EXTENT_TAIL 0x01 static inline int ext4_begin_ordered_truncate(struct inode *inode, @@ -1466,10 +1469,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - trace_mark(ext4_write_begin, - "dev %s ino %lu pos %llu len %u flags %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, flags); + trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason @@ -1611,10 +1611,7 @@ static int ext4_ordered_write_end(struct file *file, struct inode *inode = mapping->host; int ret = 0, ret2; - trace_mark(ext4_ordered_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_ordered_write_end(inode, pos, len, copied); ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { @@ -1658,10 +1655,7 @@ static int ext4_writeback_write_end(struct file *file, struct inode *inode = mapping->host; int ret = 0, ret2; - trace_mark(ext4_writeback_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_writeback_write_end(inode, pos, len, copied); ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; @@ -1705,10 +1699,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; loff_t new_i_size; - trace_mark(ext4_journalled_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -2554,9 +2545,7 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - trace_mark(ext4_da_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_da_writepage(inode, page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2667,19 +2656,7 @@ static int ext4_da_writepages(struct address_space *mapping, int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - trace_mark(ext4_da_writepages, - "dev %s ino %lu nr_t_write %ld " - "pages_skipped %ld range_start %llu " - "range_end %llu nonblocking %d " - "for_kupdate %d for_reclaim %d " - "for_writepages %d range_cyclic %d", - inode->i_sb->s_id, inode->i_ino, - wbc->nr_to_write, wbc->pages_skipped, - (unsigned long long) wbc->range_start, - (unsigned long long) wbc->range_end, - wbc->nonblocking, wbc->for_kupdate, - wbc->for_reclaim, wbc->for_writepages, - wbc->range_cyclic); + trace_ext4_da_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting @@ -2845,14 +2822,7 @@ out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; - trace_mark(ext4_da_writepage_result, - "dev %s ino %lu ret %d pages_written %d " - "pages_skipped %ld congestion %d " - "more_io %d no_nrwrite_index_update %d", - inode->i_sb->s_id, inode->i_ino, ret, - pages_written, wbc->pages_skipped, - wbc->encountered_congestion, wbc->more_io, - wbc->no_nrwrite_index_update); + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; } @@ -2904,11 +2874,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; - - trace_mark(ext4_da_write_begin, - "dev %s ino %lu pos %llu len %u flags %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, flags); + trace_ext4_da_write_begin(inode, pos, len, flags); retry: /* * With delayed allocation, we don't log the i_disksize update @@ -3001,10 +2967,7 @@ static int ext4_da_write_end(struct file *file, } } - trace_mark(ext4_da_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -3255,9 +3218,7 @@ static int ext4_normal_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; - trace_mark(ext4_normal_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_normal_writepage(inode, page); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -3343,9 +3304,7 @@ static int ext4_journalled_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; - trace_mark(ext4_journalled_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_journalled_writepage(inode, page); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ed8482e22c0..8d98070b48f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -22,6 +22,8 @@ */ #include "mballoc.h" +#include + /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); - - static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 @@ -2859,9 +2859,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + entry->start_blk + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", - sb->s_id, (unsigned long long) discard_block, - entry->count); + trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, + entry->count); sb_issue_discard(sb, discard_block, entry->count); kmem_cache_free(ext4_free_ext_cachep, entry); @@ -3629,10 +3628,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); - trace_mark(ext4_mb_new_inode_pa, - "dev %s ino %lu pstart %llu len %u lstart %u", - sb->s_id, ac->ac_inode->i_ino, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3691,9 +3687,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_type = MB_GROUP_PA; mb_debug("new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); - trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", - sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3783,10 +3778,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_mb_store_history(ac); } - trace_mark(ext4_mb_release_inode_pa, - "dev %s ino %lu block %llu count %u", - sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit, - next - bit); + trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3820,8 +3813,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (ac) ac->ac_op = EXT4_MB_HISTORY_DISCARD; - trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", - sb->s_id, pa->pa_pstart, pa->pa_len); + trace_ext4_mb_release_group_pa(ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -3889,6 +3881,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, INIT_LIST_HEAD(&list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) + ac->ac_sb = sb; repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3987,12 +3981,15 @@ void ext4_discard_preallocations(struct inode *inode) } mb_debug("discard preallocation for inode %lu\n", inode->i_ino); - trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, - inode->i_ino); + trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) { + ac->ac_sb = sb; + ac->ac_inode = inode; + } repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -4276,6 +4273,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, INIT_LIST_HEAD(&discard_list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) + ac->ac_sb = sb; spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], @@ -4445,8 +4444,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) int ret; int freed = 0; - trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", - sb->s_id, needed); + trace_ext4_mb_discard_preallocations(sb, needed); for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; @@ -4475,17 +4473,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, sb = ar->inode->i_sb; sbi = EXT4_SB(sb); - trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " - "lblk %llu goal %llu lleft %llu lright %llu " - "pleft %llu pright %llu ", - sb->s_id, ar->flags, ar->len, - ar->inode ? ar->inode->i_ino : 0, - (unsigned long long) ar->logical, - (unsigned long long) ar->goal, - (unsigned long long) ar->lleft, - (unsigned long long) ar->lright, - (unsigned long long) ar->pleft, - (unsigned long long) ar->pright); + trace_ext4_request_blocks(ar); /* * For delayed allocation, we could skip the ENOSPC and @@ -4521,7 +4509,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (!ac) { + if (ac) { + ac->ac_sb = sb; + ac->ac_inode = ar->inode; + } else { ar->len = 0; *errp = -ENOMEM; goto out1; @@ -4594,18 +4585,7 @@ out3: reserv_blks); } - trace_mark(ext4_allocate_blocks, - "dev %s block %llu flags %u len %u ino %lu " - "logical %llu goal %llu lleft %llu lright %llu " - "pleft %llu pright %llu ", - sb->s_id, (unsigned long long) block, - ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0, - (unsigned long long) ar->logical, - (unsigned long long) ar->goal, - (unsigned long long) ar->lleft, - (unsigned long long) ar->lright, - (unsigned long long) ar->pleft, - (unsigned long long) ar->pright); + trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } @@ -4740,10 +4720,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, } ext4_debug("freeing block %lu\n", block); - trace_mark(ext4_free_blocks, - "dev %s block %llu count %lu metadata %d ino %lu", - sb->s_id, (unsigned long long) block, count, metadata, - inode ? inode->i_ino : 0); + trace_ext4_free_blocks(inode, block, count, metadata); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 75e34f69215..c96bb19f58f 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "ext4_jbd2.h" #include "ext4.h" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 012c4251397..e8f0b2af460 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -47,6 +46,9 @@ #include "xattr.h" #include "acl.h" +#define CREATE_TRACE_POINTS +#include + static int default_mb_history_length = 1000; module_param_named(default_mb_history_length, default_mb_history_length, @@ -3346,7 +3348,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) int ret = 0; tid_t target; - trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); + trace_ext4_sync_fs(sb, wait); if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { if (wait) jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h new file mode 100644 index 00000000000..acf4cc9cd36 --- /dev/null +++ b/include/trace/events/ext4.h @@ -0,0 +1,719 @@ +#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EXT4_H + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ext4 + +#include +#include "../../../fs/ext4/ext4.h" +#include "../../../fs/ext4/mballoc.h" +#include + +TRACE_EVENT(ext4_free_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( uid_t, uid ) + __field( gid_t, gid ) + __field( blkcnt_t, blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->uid = inode->i_uid; + __entry->gid = inode->i_gid; + __entry->blocks = inode->i_blocks; + ), + + TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, + __entry->uid, __entry->gid, __entry->blocks) +); + +TRACE_EVENT(ext4_request_inode, + TP_PROTO(struct inode *dir, int mode), + + TP_ARGS(dir, mode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, dir ) + __field( umode_t, mode ) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %s dir %lu mode %d", + jbd2_dev_to_name(__entry->dev), __entry->dir, __entry->mode) +); + +TRACE_EVENT(ext4_allocate_inode, + TP_PROTO(struct inode *inode, struct inode *dir, int mode), + + TP_ARGS(inode, dir, mode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, dir ) + __field( umode_t, mode ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %s ino %lu dir %lu mode %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->dir, __entry->mode) +); + +TRACE_EVENT(ext4_write_begin, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int flags), + + TP_ARGS(inode, pos, len, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + ), + + TP_printk("dev %s ino %lu pos %llu len %u flags %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->flags) +); + +TRACE_EVENT(ext4_ordered_write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %s ino %lu pos %llu len %u copied %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->copied) +); + +TRACE_EVENT(ext4_writeback_write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %s ino %lu pos %llu len %u copied %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->copied) +); + +TRACE_EVENT(ext4_journalled_write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %s ino %lu pos %llu len %u copied %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->copied) +); + +TRACE_EVENT(ext4_da_writepage, + TP_PROTO(struct inode *inode, struct page *page), + + TP_ARGS(inode, page), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) + + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->index = page->index; + ), + + TP_printk("dev %s ino %lu page_index %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) +); + +TRACE_EVENT(ext4_da_writepages, + TP_PROTO(struct inode *inode, struct writeback_control *wbc), + + TP_ARGS(inode, wbc), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( long, nr_to_write ) + __field( long, pages_skipped ) + __field( loff_t, range_start ) + __field( loff_t, range_end ) + __field( char, nonblocking ) + __field( char, for_kupdate ) + __field( char, for_reclaim ) + __field( char, for_writepages ) + __field( char, range_cyclic ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nr_to_write = wbc->nr_to_write; + __entry->pages_skipped = wbc->pages_skipped; + __entry->range_start = wbc->range_start; + __entry->range_end = wbc->range_end; + __entry->nonblocking = wbc->nonblocking; + __entry->for_kupdate = wbc->for_kupdate; + __entry->for_reclaim = wbc->for_reclaim; + __entry->for_writepages = wbc->for_writepages; + __entry->range_cyclic = wbc->range_cyclic; + ), + + TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, + __entry->pages_skipped, __entry->range_start, + __entry->range_end, __entry->nonblocking, + __entry->for_kupdate, __entry->for_reclaim, + __entry->for_writepages, __entry->range_cyclic) +); + +TRACE_EVENT(ext4_da_writepages_result, + TP_PROTO(struct inode *inode, struct writeback_control *wbc, + int ret, int pages_written), + + TP_ARGS(inode, wbc, ret, pages_written), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( int, ret ) + __field( int, pages_written ) + __field( long, pages_skipped ) + __field( char, encountered_congestion ) + __field( char, more_io ) + __field( char, no_nrwrite_index_update ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->ret = ret; + __entry->pages_written = pages_written; + __entry->pages_skipped = wbc->pages_skipped; + __entry->encountered_congestion = wbc->encountered_congestion; + __entry->more_io = wbc->more_io; + __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; + ), + + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, + __entry->pages_written, __entry->pages_skipped, + __entry->encountered_congestion, __entry->more_io, + __entry->no_nrwrite_index_update) +); + +TRACE_EVENT(ext4_da_write_begin, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int flags), + + TP_ARGS(inode, pos, len, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + ), + + TP_printk("dev %s ino %lu pos %llu len %u flags %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->flags) +); + +TRACE_EVENT(ext4_da_write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %s ino %lu pos %llu len %u copied %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->copied) +); + +TRACE_EVENT(ext4_normal_writepage, + TP_PROTO(struct inode *inode, struct page *page), + + TP_ARGS(inode, page), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->index = page->index; + ), + + TP_printk("dev %s ino %lu page_index %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) +); + +TRACE_EVENT(ext4_journalled_writepage, + TP_PROTO(struct inode *inode, struct page *page), + + TP_ARGS(inode, page), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) + + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->index = page->index; + ), + + TP_printk("dev %s ino %lu page_index %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) +); + +TRACE_EVENT(ext4_discard_blocks, + TP_PROTO(struct super_block *sb, unsigned long long blk, + unsigned long long count), + + TP_ARGS(sb, blk, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( __u64, blk ) + __field( __u64, count ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blk = blk; + __entry->count = count; + ), + + TP_printk("dev %s blk %llu count %llu", + jbd2_dev_to_name(__entry->dev), __entry->blk, __entry->count) +); + +TRACE_EVENT(ext4_mb_new_inode_pa, + TP_PROTO(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa), + + TP_ARGS(ac, pa), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, pa_pstart ) + __field( __u32, pa_len ) + __field( __u64, pa_lstart ) + + ), + + TP_fast_assign( + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->pa_pstart = pa->pa_pstart; + __entry->pa_len = pa->pa_len; + __entry->pa_lstart = pa->pa_lstart; + ), + + TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, + __entry->pa_len, __entry->pa_lstart) +); + +TRACE_EVENT(ext4_mb_new_group_pa, + TP_PROTO(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa), + + TP_ARGS(ac, pa), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, pa_pstart ) + __field( __u32, pa_len ) + __field( __u64, pa_lstart ) + + ), + + TP_fast_assign( + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->pa_pstart = pa->pa_pstart; + __entry->pa_len = pa->pa_len; + __entry->pa_lstart = pa->pa_lstart; + ), + + TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, + __entry->pa_len, __entry->pa_lstart) +); + +TRACE_EVENT(ext4_mb_release_inode_pa, + TP_PROTO(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa, + unsigned long long block, unsigned int count), + + TP_ARGS(ac, pa, block, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, block ) + __field( __u32, count ) + + ), + + TP_fast_assign( + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->block = block; + __entry->count = count; + ), + + TP_printk("dev %s ino %lu block %llu count %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, + __entry->count) +); + +TRACE_EVENT(ext4_mb_release_group_pa, + TP_PROTO(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa), + + TP_ARGS(ac, pa), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, pa_pstart ) + __field( __u32, pa_len ) + + ), + + TP_fast_assign( + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->pa_pstart = pa->pa_pstart; + __entry->pa_len = pa->pa_len; + ), + + TP_printk("dev %s pstart %llu len %u", + jbd2_dev_to_name(__entry->dev), __entry->pa_pstart, __entry->pa_len) +); + +TRACE_EVENT(ext4_discard_preallocations, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + ), + + TP_printk("dev %s ino %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino) +); + +TRACE_EVENT(ext4_mb_discard_preallocations, + TP_PROTO(struct super_block *sb, int needed), + + TP_ARGS(sb, needed), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, needed ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->needed = needed; + ), + + TP_printk("dev %s needed %d", + jbd2_dev_to_name(__entry->dev), __entry->needed) +); + +TRACE_EVENT(ext4_request_blocks, + TP_PROTO(struct ext4_allocation_request *ar), + + TP_ARGS(ar), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( unsigned int, flags ) + __field( unsigned int, len ) + __field( __u64, logical ) + __field( __u64, goal ) + __field( __u64, lleft ) + __field( __u64, lright ) + __field( __u64, pleft ) + __field( __u64, pright ) + ), + + TP_fast_assign( + __entry->dev = ar->inode->i_sb->s_dev; + __entry->ino = ar->inode->i_ino; + __entry->flags = ar->flags; + __entry->len = ar->len; + __entry->logical = ar->logical; + __entry->goal = ar->goal; + __entry->lleft = ar->lleft; + __entry->lright = ar->lright; + __entry->pleft = ar->pleft; + __entry->pright = ar->pright; + ), + + TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, + __entry->len, + (unsigned long long) __entry->logical, + (unsigned long long) __entry->goal, + (unsigned long long) __entry->lleft, + (unsigned long long) __entry->lright, + (unsigned long long) __entry->pleft, + (unsigned long long) __entry->pright) +); + +TRACE_EVENT(ext4_allocate_blocks, + TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), + + TP_ARGS(ar, block), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, block ) + __field( unsigned int, flags ) + __field( unsigned int, len ) + __field( __u64, logical ) + __field( __u64, goal ) + __field( __u64, lleft ) + __field( __u64, lright ) + __field( __u64, pleft ) + __field( __u64, pright ) + ), + + TP_fast_assign( + __entry->dev = ar->inode->i_sb->s_dev; + __entry->ino = ar->inode->i_ino; + __entry->block = block; + __entry->flags = ar->flags; + __entry->len = ar->len; + __entry->logical = ar->logical; + __entry->goal = ar->goal; + __entry->lleft = ar->lleft; + __entry->lright = ar->lright; + __entry->pleft = ar->pleft; + __entry->pright = ar->pright; + ), + + TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, + __entry->len, __entry->block, + (unsigned long long) __entry->logical, + (unsigned long long) __entry->goal, + (unsigned long long) __entry->lleft, + (unsigned long long) __entry->lright, + (unsigned long long) __entry->pleft, + (unsigned long long) __entry->pright) +); + +TRACE_EVENT(ext4_free_blocks, + TP_PROTO(struct inode *inode, __u64 block, unsigned long count, + int metadata), + + TP_ARGS(inode, block, count, metadata), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, block ) + __field( unsigned long, count ) + __field( int, metadata ) + + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->block = block; + __entry->count = count; + __entry->metadata = metadata; + ), + + TP_printk("dev %s ino %lu block %llu count %lu metadata %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, + __entry->count, __entry->metadata) +); + +TRACE_EVENT(ext4_sync_file, + TP_PROTO(struct file *file, struct dentry *dentry, int datasync), + + TP_ARGS(file, dentry, datasync), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, datasync ) + ), + + TP_fast_assign( + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = dentry->d_inode->i_ino; + __entry->datasync = datasync; + __entry->parent = dentry->d_parent->d_inode->i_ino; + ), + + TP_printk("dev %s ino %ld parent %ld datasync %d ", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->parent, + __entry->datasync) +); + +TRACE_EVENT(ext4_sync_fs, + TP_PROTO(struct super_block *sb, int wait), + + TP_ARGS(sb, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait = wait; + ), + + TP_printk("dev %s wait %d", jbd2_dev_to_name(__entry->dev), + __entry->wait) +); + +#endif /* _TRACE_EXT4_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3-70-g09d2 From f157a4aa98a18bd3817a72bea90d48494e2586e7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jun 2009 11:09:42 -0400 Subject: ext4: Use a hash of the topdir directory name for the Orlov parent group Instead of using a random number to determine the goal parent grop for the Orlov top directories, use a hash of the directory name. This allows for repeatable results when trying to benchmark filesystem layout algorithms. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/ialloc.c | 19 ++++++++++++++----- fs/ext4/migrate.c | 5 ++--- fs/ext4/namei.c | 8 ++++---- 4 files changed, 22 insertions(+), 13 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 06ee5a58291..d035cf149e0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1315,7 +1315,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); +extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, + const struct qstr *qstr); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7d502f3be91..3f98ee712ff 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -470,7 +470,8 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g, */ static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group, int mode, + const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -485,6 +486,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; ngroups = real_ngroups; if (flex_size > 1) { @@ -506,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, int best_ndir = inodes_per_group; int ret = -1; - get_random_bytes(&grp, sizeof(grp)); + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(qstr->name, qstr->len, &hinfo); + grp = hinfo.hash; + } else + get_random_bytes(&grp, sizeof(grp)); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; @@ -649,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group + flex_size; if (*group > ngroups) *group = 0; - return find_group_orlov(sb, parent, group, mode); + return find_group_orlov(sb, parent, group, mode, 0); } /* @@ -790,7 +798,8 @@ err_ret: * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, + const struct qstr *qstr) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -839,7 +848,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else - ret2 = find_group_orlov(sb, dir, &group, mode); + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); } else ret2 = find_group_other(sb, dir, &group, mode); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index fe64d9f7985..80d075b8aea 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -483,9 +483,8 @@ int ext4_ext_migrate(struct inode *inode) retval = PTR_ERR(handle); return retval; } - tmp_inode = ext4_new_inode(handle, - inode->i_sb->s_root->d_inode, - S_IFREG); + tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG, 0); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 07eb6649e4f..5f00d2418a8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1782,7 +1782,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode (handle, dir, mode); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1816,7 +1816,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1853,7 +1853,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFDIR | mode); + inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2264,7 +2264,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -- cgit v1.2.3-70-g09d2 From 11013911daea4820147ae6d7094dd7c6894e8651 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Sat, 13 Jun 2009 11:45:35 -0400 Subject: ext4: teach the inode allocator to use a goal inode number Enhance the inode allocator to take a goal inode number as a paremeter; if it is specified, it takes precedence over Orlov or parent directory inode allocation algorithms. The extents migration function uses the goal inode number so that the extent trees allocated the migration function use the correct flex_bg. In the future, the goal inode functionality will also be used to allocate an adjacent inode for the extended attributes. Also, for testing purposes the goal inode number can be specified via /sys/fs/{dev}/inode_goal. This can be useful for testing inode allocation beyond 2^32 blocks on very large filesystems. Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" --- Documentation/ABI/testing/sysfs-fs-ext4 | 10 ++++++++++ fs/ext4/ext4.h | 3 ++- fs/ext4/ialloc.c | 16 ++++++++++++---- fs/ext4/migrate.c | 5 ++++- fs/ext4/namei.c | 10 ++++++---- fs/ext4/super.c | 2 ++ 6 files changed, 36 insertions(+), 10 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4 index 4e79074de28..5fb709997d9 100644 --- a/Documentation/ABI/testing/sysfs-fs-ext4 +++ b/Documentation/ABI/testing/sysfs-fs-ext4 @@ -79,3 +79,13 @@ Description: This file is read-only and shows the number of kilobytes of data that have been written to this filesystem since it was mounted. + +What: /sys/fs/ext4//inode_goal +Date: June 2008 +Contact: "Theodore Ts'o" +Description: + Tuning parameter which (if non-zero) controls the goal + inode used by the inode allocator in p0reference to + all other allocation hueristics. This is intended for + debugging use only, and should be 0 on production + systems. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d035cf149e0..746cdcba969 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -863,6 +863,7 @@ struct ext4_sb_info { int s_inode_size; int s_first_ino; unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; spinlock_t s_next_gen_lock; u32 s_next_generation; u32 s_hash_seed[4]; @@ -1316,7 +1317,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, - const struct qstr *qstr); + const struct qstr *qstr, __u32 goal); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3f98ee712ff..2f645732e3b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -799,7 +799,7 @@ err_ret: * group to find a free inode. */ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, - const struct qstr *qstr) + const struct qstr *qstr, __u32 goal) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -830,6 +830,16 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { @@ -858,7 +868,7 @@ got_group: if (ret2 == -1) goto out; - for (i = 0; i < ngroups; i++) { + for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); @@ -870,8 +880,6 @@ got_group: if (!inode_bitmap_bh) goto fail; - ino = 0; - repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 80d075b8aea..313a50b3974 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode) struct inode *tmp_inode = NULL; struct list_blocks_struct lb; unsigned long max_entries; + __u32 goal; /* * If the filesystem does not support extents, or the inode @@ -483,8 +484,10 @@ int ext4_ext_migrate(struct inode *inode) retval = PTR_ERR(handle); return retval; } + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, 0); + S_IFREG, 0, goal); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5f00d2418a8..de04013d16f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1782,7 +1782,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1816,7 +1816,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1853,7 +1853,8 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name); + inode = ext4_new_inode(handle, dir, S_IFDIR | mode, + &dentry->d_name, 0); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2264,7 +2265,8 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name); + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04486a53469..23013d303f8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2206,6 +2206,7 @@ EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); @@ -2218,6 +2219,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), ATTR_LIST(mb_max_to_scan), ATTR_LIST(mb_min_to_scan), -- cgit v1.2.3-70-g09d2