From 19883bd9658d0dc269fc228b1b39db3615f7c7b0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 16 Aug 2013 22:06:55 -0400 Subject: ext4: avoid reusing recently deleted inodes in no journal mode In no journal mode, if an inode has recently been deleted, we shouldn't reuse it right away. Otherwise it's possible, after an unclean shutdown, to hit a situation where a recently deleted inode gets reused for some other purpose before the inode table block has been written to disk. However, if the directory entry has been updated, then the directory entry will be pointing at the old inode contents. E2fsck will make sure the file system is consistent after the unclean shutdown. However, if the recently deleted inode is a character mode device, or an inode with the immutable bit set, even after the file system has been fixed up by e2fsck, it can be possible for a *.pyc file to be pointing at a character mode device, and when python tries to open the *.pyc file, Hilarity Ensues. We could change all of userspace to be very suspicious about stat'ing files before opening them, and clearing the immutable flag if necessary --- or we can just avoid reusing an inode number if it has been recently deleted. Google-Bug-Id: 10017573 Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8bf5999875e..666a5ed48bc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -624,6 +624,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent, return -1; } +/* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out. + */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both @@ -741,6 +786,11 @@ repeat_in_this_group: "inode=%lu", ino + 1); continue; } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, @@ -764,6 +814,7 @@ repeat_in_this_group: ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ +next_inode: if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: -- cgit v1.2.3-70-g09d2 From 87a39389be3e3b007d341be510a7e4a0542bdf05 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 28 Aug 2013 18:32:58 -0400 Subject: ext4: mark block group as corrupt on inode bitmap error If we detect either a discrepancy between the inode bitmap and the inode counts or the inode bitmap fails to pass validation checks, mark the block group corrupt and refuse to allocate or deallocate inodes from the group. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ fs/ext4/ialloc.c | 29 +++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 02b764b4e63..06b488dca66 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2481,11 +2481,14 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 666a5ed48bc..d5106078595 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -117,6 +117,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +186,8 @@ verify: put_bh(bh); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } ext4_unlock_group(sb, block_group); @@ -221,6 +224,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " @@ -266,7 +270,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) goto error_return; BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -315,8 +321,10 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - } else + } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } error_return: brelse(bitmap_bh); @@ -697,6 +705,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, struct inode *ret; ext4_group_t i; ext4_group_t flex_group; + struct ext4_group_info *grp; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -770,10 +779,22 @@ got_group: continue; } + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } + brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!inode_bitmap_bh) - goto out; + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) -- cgit v1.2.3-70-g09d2 From bdfb6ff4a255dcebeb09a901250e13a97eff75af Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 28 Aug 2013 18:46:56 -0400 Subject: ext4: mark group corrupt on group descriptor checksum If the group descriptor fails validation, mark the whole blockgroup corrupt so that the inode/block allocators skip this group. The previous approach takes the risk of writing to a damaged group descriptor; hopefully it was never the case that the [ib]bitmap fields pointed to another valid block and got dirtied, since the memset would fill the page with 1s. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 9 ++++----- fs/ext4/ialloc.c | 10 ++++------ 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'fs/ext4/ialloc.c') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d9b66c4a26e..dc5d572ebd6 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -184,6 +184,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); @@ -191,11 +192,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } memset(bh->b_data, 0, sb->s_blocksize); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index d5106078595..137193ff389 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,18 +70,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } -- cgit v1.2.3-70-g09d2