From 60e07cf515e541ea3e13b888d273c9b19a2ad9dd Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Sun, 18 Dec 2011 15:49:54 -0500
Subject: ext4: do not reference pa_inode from group_pa

pa_inode in group_pa is set NULL in ext4_mb_new_group_pa, so
pa_inode should be not referenced.

Reported-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c           | 2 +-
 include/trace/events/ext4.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2d8be8f28b..cb990b21c69 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 
-	trace_ext4_mb_release_group_pa(pa);
+	trace_ext4_mb_release_group_pa(sb, pa);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 748ff7cbe55..319538bf17d 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -573,9 +573,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa,
 );
 
 TRACE_EVENT(ext4_mb_release_group_pa,
-	TP_PROTO(struct ext4_prealloc_space *pa),
+	TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa),
 
-	TP_ARGS(pa),
+	TP_ARGS(sb, pa),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
@@ -585,7 +585,7 @@ TRACE_EVENT(ext4_mb_release_group_pa,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= pa->pa_inode->i_sb->s_dev;
+		__entry->dev		= sb->s_dev;
 		__entry->pa_pstart	= pa->pa_pstart;
 		__entry->pa_len		= pa->pa_len;
 	),
-- 
cgit v1.2.3-70-g09d2


From 5635a62b83c04d05e4eb4575a1c3de51a35bacdc Mon Sep 17 00:00:00 2001
From: Zheng Liu <gnehzuil.liu@gmail.com>
Date: Sun, 18 Dec 2011 16:13:58 -0500
Subject: ext4: add missing space to ext4_msg output in ext4_fill_super()

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3e1329e2f82..35377d57ec4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3508,7 +3508,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * of the filesystem.
 	 */
 	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-                ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
+		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
 			 "block %u is beyond end of filesystem (%llu)",
 			 le32_to_cpu(es->s_first_data_block),
 			 ext4_blocks_count(es));
-- 
cgit v1.2.3-70-g09d2


From acd6ad83517639e8f09a8c5525b1dccd81cd2a10 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 18 Dec 2011 17:37:02 -0500
Subject: ext4: fix error handling on inode bitmap corruption

When insert_inode_locked() fails in ext4_new_inode() it most likely means inode
bitmap got corrupted and we allocated again inode which is already in use. Also
doing unlock_new_inode() during error recovery is wrong since the inode does
not have I_NEW set. Fix the problem by jumping to fail: (instead of fail_drop:)
which declares filesystem error and does not call unlock_new_inode().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00beb4f9cc4..8fb6844f973 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -885,8 +885,12 @@ got:
 	if (IS_DIRSYNC(inode))
 		ext4_handle_sync(handle);
 	if (insert_inode_locked(inode) < 0) {
-		err = -EINVAL;
-		goto fail_drop;
+		/*
+		 * Likely a bitmap corruption causing inode to be allocated
+		 * twice.
+		 */
+		err = -EIO;
+		goto fail;
 	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
-- 
cgit v1.2.3-70-g09d2


From 14d7f3efe923bc60839c65f9818793c64b4d708b Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Sun, 18 Dec 2011 17:39:02 -0500
Subject: ext4: remove unused local variable

In get_implied_cluster_alloc(), rr_cluster_end was being
defined and set, but was never used.  Removed this.

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 607b1557d29..4423b11476a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3625,7 +3625,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
 	ext4_lblk_t ex_cluster_start, ex_cluster_end;
-	ext4_lblk_t rr_cluster_start, rr_cluster_end;
+	ext4_lblk_t rr_cluster_start;
 	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
 	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
@@ -3636,7 +3636,6 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 
 	/* The requested region passed into ext4_map_blocks() */
 	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
-	rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
 
 	if ((rr_cluster_start == ex_cluster_end) ||
 	    (rr_cluster_start == ex_cluster_start)) {
-- 
cgit v1.2.3-70-g09d2


From 8c48f7e88e293b9dd422bd8884842aea85d30b22 Mon Sep 17 00:00:00 2001
From: Robin Dong <sanbai@taobao.com>
Date: Sun, 18 Dec 2011 23:05:43 -0500
Subject: ext4: optimize ext4_find_delalloc_range() in nodelalloc mode

We found performance regression when using bigalloc with "nodelalloc"
(1MB cluster size):

1. mke2fs -C 1048576 -O ^has_journal,bigalloc /dev/sda
2. mount -o nodelalloc /dev/sda /test/
3. time dd if=/dev/zero of=/test/io bs=1048576 count=1024

The "dd" will cost about 2 seconds to finish, but if we mke2fs without
"bigalloc", "dd" will only cost less than 1 second.

The reason is: when using ext4 with "nodelalloc", it will call
ext4_find_delalloc_cluster() nearly everytime it call
ext4_ext_map_blocks(), and ext4_find_delalloc_range() will also scan
all pages in cluster because no buffer is "delayed".  A cluster has
256 pages (1MB cluster), so it will scan 256 * 256k pags when creating
a 1G file. That severely hurts the performance.

Therefore, we return immediately from ext4_find_delalloc_range() in
nodelalloc mode, since by definition there can't be any delalloc
pages.

Signed-off-by: Robin Dong <sanbai@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4423b11476a..5684f251092 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3281,6 +3281,9 @@ static int ext4_find_delalloc_range(struct inode *inode,
 	ext4_lblk_t i, pg_lblk;
 	pgoff_t index;
 
+	if (!test_opt(inode->i_sb, DELALLOC))
+		return 0;
+
 	/* reverse search wont work if fs block size is less than page size */
 	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
 		search_hint_reverse = 0;
-- 
cgit v1.2.3-70-g09d2


From 22cdfca5641817060dd724a9c30442f5c0675fcd Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 21 Dec 2011 14:14:31 -0500
Subject: ext4: remove unneeded file_remove_suid() from ext4_ioctl()

In the code to support EXT4_IOC_MOVE_EXT, ext4_ioctl calls
file_remove_suid() after the call to ext4_move_extents() if any
extents has been moved.  There are at least three things wrong with
this.  First, file_remove_suid() should be called with i_mutex down,
which is not here.  Second, it should be called before the donor file
has been modified, to avoid a potential race condition.  Third, and
most importantly, it's pointless, because ext4_file_extents() already
checks if the donor file has the setuid or setgid bit set, and will
return an error in that case.  So the first two objections don't
really matter, since file_remove_suid() will never need to modify the
inode in any case.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a56796814d6..ff1aab7cd6e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -247,8 +247,6 @@ setversion_out:
 		err = ext4_move_extents(filp, donor_filp, me.orig_start,
 					me.donor_start, me.len, &me.moved_len);
 		mnt_drop_write(filp->f_path.mnt);
-		if (me.moved_len > 0)
-			file_remove_suid(donor_filp);
 
 		if (copy_to_user((struct move_extent __user *)arg,
 				 &me, sizeof(me)))
-- 
cgit v1.2.3-70-g09d2


From 2aff57b0c052344e8401a8b4a33c2a1ecb0f627c Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Wed, 28 Dec 2011 12:02:13 -0500
Subject: ext4: allocate delalloc blocks before changing journal mode

delalloc blocks should be allocated before changing journal mode,
otherwise they can not be allocated and even more truncate on
delalloc blocks could triggre BUG by flushing delalloc buffers.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 92655fd8965..cb0ba9d77a8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4647,6 +4647,17 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		return 0;
 	if (is_journal_aborted(journal))
 		return -EROFS;
+	/* We have to allocate physical blocks for delalloc blocks
+	 * before flushing journal. otherwise delalloc blocks can not
+	 * be allocated any more. even more truncate on delalloc blocks
+	 * could trigger BUG by flushing delalloc blocks in journal.
+	 * There is no delalloc block in non-journal data mode.
+	 */
+	if (val && test_opt(inode->i_sb, DELALLOC)) {
+		err = ext4_alloc_da_blocks(inode);
+		if (err < 0)
+			return err;
+	}
 
 	jbd2_journal_lock_updates(journal);
 	jbd2_journal_flush(journal);
-- 
cgit v1.2.3-70-g09d2


From 5872ddaaf05bf25e3ab90580295ebc946405928c Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Wed, 28 Dec 2011 13:55:51 -0500
Subject: ext4: flush journal when switching from data=journal mode

It's necessary to flush the journal when switching away from
data=journal mode.  This is because there are no revoke records when
data blocks are journalled, but revoke records are required in the
other journal modes.

However, it is not necessary to flush the journal when switching into
data=journal mode, and flushing the journal is expensive.  So let's
avoid it in that case.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cb0ba9d77a8..1254934de69 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4660,7 +4660,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	}
 
 	jbd2_journal_lock_updates(journal);
-	jbd2_journal_flush(journal);
 
 	/*
 	 * OK, there are no updates running now, and all cached data is
@@ -4672,8 +4671,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 
 	if (val)
 		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
-	else
+	else {
+		jbd2_journal_flush(journal);
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+	}
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
-- 
cgit v1.2.3-70-g09d2


From 1ba37268cd19e5a2a80924bfe8618bf1ba3e8249 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Wed, 28 Dec 2011 17:46:46 -0500
Subject: jbd2: clear revoked flag on buffers before a new transaction started

Currently, we clear revoked flag only when a block is reused.  However,
this can tigger a false journal error.  Consider a situation when a block
is used as a meta block and is deleted(revoked) in ordered mode, then the
block is allocated as a data block to a file.  At this moment, user changes
the file's journal mode from ordered to journaled and truncates the file.
The block will be considered re-revoked by journal because it has revoked
flag still pending from the last transaction and an assertion triggers.

We fix the problem by keeping the revoked status more uptodate - we clear
revoked flag when switching revoke tables to reflect there is no revoked
buffers in current transaction any more.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c     |  6 ++++++
 fs/jbd2/revoke.c     | 34 ++++++++++++++++++++++++++++++++++
 include/linux/jbd2.h |  1 +
 3 files changed, 41 insertions(+)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 68d704db787..5069b847515 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -429,6 +429,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 	jbd_debug(3, "JBD2: commit phase 1\n");
 
+	/*
+	 * Clear revoked flag to reflect there is no revoked buffers
+	 * in the next transaction which is going to be started.
+	 */
+	jbd2_clear_buffer_revoked_flags(journal);
+
 	/*
 	 * Switch to a new revoke table.
 	 */
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 69fd9358811..30b2867d6cc 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -47,6 +47,10 @@
  *   overwriting the new data.  We don't even need to clear the revoke
  *   bit here.
  *
+ * We cache revoke status of a buffer in the current transaction in b_states
+ * bits.  As the name says, revokevalid flag indicates that the cached revoke
+ * status of a buffer is valid and we can rely on the cached status.
+ *
  * Revoke information on buffers is a tri-state value:
  *
  * RevokeValid clear:	no cached revoke status, need to look it up
@@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	return did_revoke;
 }
 
+/*
+ * journal_clear_revoked_flag clears revoked flag of buffers in
+ * revoke table to reflect there is no revoked buffers in the next
+ * transaction which is going to be started.
+ */
+void jbd2_clear_buffer_revoked_flags(journal_t *journal)
+{
+	struct jbd2_revoke_table_s *revoke = journal->j_revoke;
+	int i = 0;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		struct list_head *hash_list;
+		struct list_head *list_entry;
+		hash_list = &revoke->hash_table[i];
+
+		list_for_each(list_entry, hash_list) {
+			struct jbd2_revoke_record_s *record;
+			struct buffer_head *bh;
+			record = (struct jbd2_revoke_record_s *)list_entry;
+			bh = __find_get_block(journal->j_fs_dev,
+					      record->blocknr,
+					      journal->j_blocksize);
+			if (bh) {
+				clear_buffer_revoked(bh);
+				__brelse(bh);
+			}
+		}
+	}
+}
+
 /* journal_switch_revoke table select j_revoke for next transaction
  * we do not want to suspend any processing until all revokes are
  * written -bzzz
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 2092ea21e46..5557baefed6 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1151,6 +1151,7 @@ extern int	jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
 extern int	jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t);
 extern void	jbd2_journal_clear_revoke(journal_t *);
 extern void	jbd2_journal_switch_revoke_table(journal_t *journal);
+extern void	jbd2_clear_buffer_revoked_flags(journal_t *journal);
 
 /*
  * The log thread user interface:
-- 
cgit v1.2.3-70-g09d2


From 88635ca277adb67db34e88281817d1ce10713553 Mon Sep 17 00:00:00 2001
From: Zheng Liu <gnehzuil.liu@gmail.com>
Date: Wed, 28 Dec 2011 19:00:25 -0500
Subject: ext4: add missing spaces to debugging printk's

Fix ext4_debug format in ext4_ext_handle_uninitialized_extents() and
ext4_end_io_dio().

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 4 ++--
 fs/ext4/inode.c   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5684f251092..b35bb40556d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3456,8 +3456,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	int err = 0;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 
-	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
-		  "block %llu, max_blocks %u, flags %d, allocated %u",
+	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
+		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
 		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1254934de69..ef9e8fdddfb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2760,7 +2760,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	if (!io_end || !size)
 		goto out;
 
-	ext_debug("ext4_end_io_dio(): io_end 0x%p"
+	ext_debug("ext4_end_io_dio(): io_end 0x%p "
 		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
  		  iocb->private, io_end->inode->i_ino, iocb, offset,
 		  size);
-- 
cgit v1.2.3-70-g09d2


From 14c83c9fddf2e75bdd0c20f1072f35260e356484 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 28 Dec 2011 20:25:13 -0500
Subject: ext4: avoid counting the number of free inodes twice in
 find_group_orlov()

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8fb6844f973..cdafc05d79c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -358,7 +358,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
 	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
-	unsigned int freei, avefreei;
+	unsigned int freei, avefreei, grp_free;
 	ext4_fsblk_t freeb, avefreec;
 	unsigned int ndirs;
 	int max_dirs, min_inodes;
@@ -477,8 +477,8 @@ fallback_retry:
 	for (i = 0; i < ngroups; i++) {
 		grp = (parent_group + i) % ngroups;
 		desc = ext4_get_group_desc(sb, grp, NULL);
-		if (desc && ext4_free_inodes_count(sb, desc) &&
-		    ext4_free_inodes_count(sb, desc) >= avefreei) {
+		grp_free = ext4_free_inodes_count(sb, desc);
+		if (desc && grp_free && grp_free >= avefreei) {
 			*group = grp;
 			return 0;
 		}
-- 
cgit v1.2.3-70-g09d2


From ccb4d7af914e0fe9b2f1022f8ea6c300463fd5e6 Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Wed, 28 Dec 2011 20:25:40 -0500
Subject: ext4: remove no longer used functions in inode.c

The functions ext4_block_truncate_page() and ext4_block_zero_page_range()
are no longer used, so remove them.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |   4 --
 fs/ext4/inode.c | 120 --------------------------------------------------------
 2 files changed, 124 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5b0e26a1272..ae2407f4502 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1880,10 +1880,6 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_block_truncate_page(handle_t *handle,
-		struct address_space *mapping, loff_t from);
-extern int ext4_block_zero_page_range(handle_t *handle,
-		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_discard_partial_page_buffers(handle_t *handle,
 		struct address_space *mapping, loff_t from,
 		loff_t length, int flags);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ef9e8fdddfb..e6cc24dfa98 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3301,126 +3301,6 @@ next:
 	return err;
 }
 
-/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
-		struct address_space *mapping, loff_t from)
-{
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned length;
-	unsigned blocksize;
-	struct inode *inode = mapping->host;
-
-	blocksize = inode->i_sb->s_blocksize;
-	length = blocksize - (offset & (blocksize - 1));
-
-	return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
-/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'.  The range to be zero'd must
- * be contained with in one block.  If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
- */
-int ext4_block_zero_page_range(handle_t *handle,
-		struct address_space *mapping, loff_t from, loff_t length)
-{
-	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned blocksize, max, pos;
-	ext4_lblk_t iblock;
-	struct inode *inode = mapping->host;
-	struct buffer_head *bh;
-	struct page *page;
-	int err = 0;
-
-	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
-				   mapping_gfp_mask(mapping) & ~__GFP_FS);
-	if (!page)
-		return -ENOMEM;
-
-	blocksize = inode->i_sb->s_blocksize;
-	max = blocksize - (offset & (blocksize - 1));
-
-	/*
-	 * correct length if it does not fall between
-	 * 'from' and the end of the block
-	 */
-	if (length > max || length < 0)
-		length = max;
-
-	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-
-	/* Find the buffer that contains "offset" */
-	bh = page_buffers(page);
-	pos = blocksize;
-	while (offset >= pos) {
-		bh = bh->b_this_page;
-		iblock++;
-		pos += blocksize;
-	}
-
-	err = 0;
-	if (buffer_freed(bh)) {
-		BUFFER_TRACE(bh, "freed: skip");
-		goto unlock;
-	}
-
-	if (!buffer_mapped(bh)) {
-		BUFFER_TRACE(bh, "unmapped");
-		ext4_get_block(inode, iblock, bh, 0);
-		/* unmapped? It's a hole - nothing to do */
-		if (!buffer_mapped(bh)) {
-			BUFFER_TRACE(bh, "still unmapped");
-			goto unlock;
-		}
-	}
-
-	/* Ok, it's mapped. Make sure it's up-to-date */
-	if (PageUptodate(page))
-		set_buffer_uptodate(bh);
-
-	if (!buffer_uptodate(bh)) {
-		err = -EIO;
-		ll_rw_block(READ, 1, &bh);
-		wait_on_buffer(bh);
-		/* Uhhuh. Read error. Complain and punt. */
-		if (!buffer_uptodate(bh))
-			goto unlock;
-	}
-
-	if (ext4_should_journal_data(inode)) {
-		BUFFER_TRACE(bh, "get write access");
-		err = ext4_journal_get_write_access(handle, bh);
-		if (err)
-			goto unlock;
-	}
-
-	zero_user(page, offset, length);
-
-	BUFFER_TRACE(bh, "zeroed end of block");
-
-	err = 0;
-	if (ext4_should_journal_data(inode)) {
-		err = ext4_handle_dirty_metadata(handle, inode, bh);
-	} else
-		mark_buffer_dirty(bh);
-
-unlock:
-	unlock_page(page);
-	page_cache_release(page);
-	return err;
-}
-
 int ext4_can_truncate(struct inode *inode)
 {
 	if (S_ISREG(inode->i_mode))
-- 
cgit v1.2.3-70-g09d2


From 597d508c17a6dcd17770f4dd9da873d93cc15493 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 28 Dec 2011 20:32:07 -0500
Subject: ext4: use proper little-endian bitops

ext4_{set,clear}_bit() is defined as __test_and_{set,clear}_bit_le() for
ext4.  Only two ext4_{set,clear}_bit() calls check the return value.  The
rest of calls ignore the return value and they can be replaced with
__{set,clear}_bit_le().

This changes ext4_{set,clear}_bit() from __test_and_{set,clear}_bit_le()
to __{set,clear}_bit_le() and introduces ext4_test_and_{set,clear}_bit()
for the two places where old bit needs to be returned.

This ext4_{set,clear}_bit() change is considered safe, because if someone
uses these macros without noticing the change, new ext4_{set,clear}_bit
don't have return value and causes compiler errors where the return value
is used.

This also removes unused ext4_find_first_zero_bit().

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h   | 7 ++++---
 fs/ext4/ialloc.c | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ae2407f4502..0e43bba049a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -957,12 +957,13 @@ struct ext4_inode_info {
 #define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
 					 EXT4_MOUNT2_##opt)
 
-#define ext4_set_bit			__test_and_set_bit_le
+#define ext4_test_and_set_bit		__test_and_set_bit_le
+#define ext4_set_bit			__set_bit_le
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
-#define ext4_clear_bit			__test_and_clear_bit_le
+#define ext4_test_and_clear_bit		__test_and_clear_bit_le
+#define ext4_clear_bit			__clear_bit_le
 #define ext4_clear_bit_atomic		ext2_clear_bit_atomic
 #define ext4_test_bit			test_bit_le
-#define ext4_find_first_zero_bit	find_first_zero_bit_le
 #define ext4_find_next_zero_bit		find_next_zero_bit_le
 #define ext4_find_next_bit		find_next_bit_le
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index cdafc05d79c..72fc9892231 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		fatal = ext4_journal_get_write_access(handle, bh2);
 	}
 	ext4_lock_group(sb, block_group);
-	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
 	if (fatal || !cleared) {
 		ext4_unlock_group(sb, block_group);
 		goto out;
@@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	 */
 	down_read(&grp->alloc_sem);
 	ext4_lock_group(sb, group);
-	if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+	if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
 		/* not a free inode */
 		retval = 1;
 		goto err_ret;
-- 
cgit v1.2.3-70-g09d2


From 18e3143848f1abdd07e7d9879cf67f4e147ff8b7 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:18:50 -0500
Subject: ext4: add a function which extends a group without checking
 parameters

This patch added a function named ext4_group_extend_no_check() whose code
is copied from ext4_group_extend().  ext4_group_extend_no_check() assumes
the parameter is valid and has been checked by caller.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 996780ab4f4..1c3c2b56049 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -968,6 +968,57 @@ exit_put:
 	return err;
 } /* ext4_group_add */
 
+/*
+ * extend a group without checking assuming that checking has been done.
+ */
+static int ext4_group_extend_no_check(struct super_block *sb,
+				      ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	handle_t *handle;
+	int err = 0, err2;
+
+	/* We will update the superblock, one block bitmap, and
+	 * one group descriptor via ext4_group_add_blocks().
+	 */
+	handle = ext4_journal_start_sb(sb, 3);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		ext4_warning(sb, "error %d on journal start", err);
+		return err;
+	}
+
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err) {
+		ext4_warning(sb, "error %d on journal write access", err);
+		goto errout;
+	}
+
+	ext4_blocks_count_set(es, o_blocks_count + add);
+	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+	/* We add the blocks to the bitmap and set the group need init bit */
+	err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
+	if (err)
+		goto errout;
+	ext4_handle_dirty_super(handle, sb);
+	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+errout:
+	err2 = ext4_journal_stop(handle);
+	if (err2 && !err)
+		err = err2;
+
+	if (!err) {
+		if (test_opt(sb, DEBUG))
+			printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
+			       "blocks\n", ext4_blocks_count(es));
+		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext4_super_block));
+	}
+	return err;
+}
+
 /*
  * Extend the filesystem to the new number of blocks specified.  This entry
  * point is only used to extend the current filesystem to the end of the last
-- 
cgit v1.2.3-70-g09d2


From bb08c1e7d8c072da338f6d905a89376b36023017 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:20:50 -0500
Subject: ext4: add a function which adds a new group descriptors to a fs

This patch adds a function named ext4_add_new_descs() which adds one
or more new group descriptors to a fs and whose code is copied from
ext4_group_add().

The function will be used by new resize implementation.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 1c3c2b56049..3bb4e7b502e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -735,6 +735,52 @@ exit_err:
 	}
 }
 
+/*
+ * ext4_add_new_descs() adds @count group descriptor of groups
+ * starting at @group
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
+ */
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+			      ext4_group_t group, struct inode *resize_inode,
+			      ext4_group_t count)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head *gdb_bh;
+	int i, gdb_off, gdb_num, err = 0;
+
+	for (i = 0; i < count; i++, group++) {
+		int reserved_gdb = ext4_bg_has_super(sb, group) ?
+			le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * We will only either add reserved group blocks to a backup group
+		 * or remove reserved blocks for the first group in a new group block.
+		 * Doing both would be mean more complex code, and sane people don't
+		 * use non-sparse filesystems anymore.  This is already checked above.
+		 */
+		if (gdb_off) {
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			err = ext4_journal_get_write_access(handle, gdb_bh);
+
+			if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
+				err = reserve_backup_gdb(handle, resize_inode, group);
+		} else
+			err = add_new_gdb(handle, resize_inode, group);
+		if (err)
+			break;
+	}
+	return err;
+}
+
 /* Add group descriptor data to an existing or new group descriptor block.
  * Ensure we handle all possible error conditions _before_ we start modifying
  * the filesystem, because we cannot abort the transaction and not have it
-- 
cgit v1.2.3-70-g09d2


From 28c7bac0091687e6116ebd6c179e154ae4053c90 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:22:50 -0500
Subject: ext4: add a structure which will be used by 64bit-resize interface

This patch adds a structure which will be used by 64bit-resize interface.
Two functions which allocate and destroy the structure respectively are
added.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3bb4e7b502e..6076d5e4b51 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -134,6 +134,61 @@ static int verify_group_input(struct super_block *sb,
 	return err;
 }
 
+/*
+ * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex
+ * group each time.
+ */
+struct ext4_new_flex_group_data {
+	struct ext4_new_group_data *groups;	/* new_group_data for groups
+						   in the flex group */
+	__u16 *bg_flags;			/* block group flags of groups
+						   in @groups */
+	ext4_group_t count;			/* number of groups in @groups
+						 */
+};
+
+/*
+ * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
+ * @flexbg_size.
+ *
+ * Returns NULL on failure otherwise address of the allocated structure.
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+	struct ext4_new_flex_group_data *flex_gd;
+
+	flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+	if (flex_gd == NULL)
+		goto out3;
+
+	flex_gd->count = flexbg_size;
+
+	flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
+				  flexbg_size, GFP_NOFS);
+	if (flex_gd->groups == NULL)
+		goto out2;
+
+	flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
+	if (flex_gd->bg_flags == NULL)
+		goto out1;
+
+	return flex_gd;
+
+out1:
+	kfree(flex_gd->groups);
+out2:
+	kfree(flex_gd);
+out3:
+	return NULL;
+}
+
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+	kfree(flex_gd->bg_flags);
+	kfree(flex_gd->groups);
+	kfree(flex_gd);
+}
+
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
 				  ext4_fsblk_t blk)
 {
-- 
cgit v1.2.3-70-g09d2


From 33afdcc5402d0abf70ef2dfb96d0b901d20bcc37 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:32:52 -0500
Subject: ext4: add a function which sets up group blocks of a flex bg

This patch adds a function named setup_new_flex_group_blocks() which
sets up group blocks of a flex bg.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h   |   8 ++
 fs/ext4/resize.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 258 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0e43bba049a..05058e2b7f4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,6 +511,14 @@ struct ext4_new_group_data {
 	__u32 free_blocks_count;
 };
 
+/* Indexes used to index group tables in ext4_new_group_data */
+enum {
+	BLOCK_BITMAP = 0,	/* block bitmap */
+	INODE_BITMAP,		/* inode bitmap */
+	INODE_TABLE,		/* inode tables */
+	GROUP_TABLE_COUNT,
+};
+
 /*
  * Flags used by ext4_map_blocks()
  */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6076d5e4b51..e8ccb2f8f45 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -233,6 +233,256 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
 	return 0;
 }
 
+/*
+ * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ *
+ * Helper function for ext4_setup_new_group_blocks() which set .
+ *
+ * @sb: super block
+ * @handle: journal handle
+ * @flex_gd: flex group data
+ */
+static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
+			struct ext4_new_flex_group_data *flex_gd,
+			ext4_fsblk_t block, ext4_group_t count)
+{
+	ext4_group_t count2;
+
+	ext4_debug("mark blocks [%llu/%u] used\n", block, count);
+	for (count2 = count; count > 0; count -= count2, block += count2) {
+		ext4_fsblk_t start;
+		struct buffer_head *bh;
+		ext4_group_t group;
+		int err;
+
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+		start = ext4_group_first_block_no(sb, group);
+		group -= flex_gd->groups[0].group;
+
+		count2 = sb->s_blocksize * 8 - (block - start);
+		if (count2 > count)
+			count2 = count;
+
+		if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
+			BUG_ON(flex_gd->count > 1);
+			continue;
+		}
+
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			return err;
+
+		bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
+		if (!bh)
+			return -EIO;
+
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err)
+			return err;
+		ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
+			   block - start, count2);
+		ext4_set_bits(bh->b_data, block - start, count2);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (unlikely(err))
+			return err;
+		brelse(bh);
+	}
+
+	return 0;
+}
+
+/*
+ * Set up the block and inode bitmaps, and the inode table for the new groups.
+ * This doesn't need to be part of the main transaction, since we are only
+ * changing blocks outside the actual filesystem.  We still do journaling to
+ * ensure the recovery is correct in case of a failure just after resize.
+ * If any part of this fails, we simply abort the resize.
+ *
+ * setup_new_flex_group_blocks handles a flex group as follow:
+ *  1. copy super block and GDT, and initialize group tables if necessary.
+ *     In this step, we only set bits in blocks bitmaps for blocks taken by
+ *     super block and GDT.
+ *  2. allocate group tables in block bitmaps, that is, set bits in block
+ *     bitmap for blocks taken by group tables.
+ */
+static int setup_new_flex_group_blocks(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
+	ext4_fsblk_t start;
+	ext4_fsblk_t block;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	__u16 *bg_flags = flex_gd->bg_flags;
+	handle_t *handle;
+	ext4_group_t group, count;
+	struct buffer_head *bh = NULL;
+	int reserved_gdb, i, j, err = 0, err2;
+
+	BUG_ON(!flex_gd->count || !group_data ||
+	       group_data[0].group != sbi->s_groups_count);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+
+	/* This transaction may be extended/restarted along the way */
+	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	group = group_data[0].group;
+	for (i = 0; i < flex_gd->count; i++, group++) {
+		unsigned long gdblocks;
+
+		gdblocks = ext4_bg_num_gdb(sb, group);
+		start = ext4_group_first_block_no(sb, group);
+
+		/* Copy all of the GDT blocks into the backup in this group */
+		for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
+			struct buffer_head *gdb;
+
+			ext4_debug("update backup group %#04llx\n", block);
+			err = extend_or_restart_transaction(handle, 1);
+			if (err)
+				goto out;
+
+			gdb = sb_getblk(sb, block);
+			if (!gdb) {
+				err = -EIO;
+				goto out;
+			}
+
+			err = ext4_journal_get_write_access(handle, gdb);
+			if (err) {
+				brelse(gdb);
+				goto out;
+			}
+			memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
+			       gdb->b_size);
+			set_buffer_uptodate(gdb);
+
+			err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+			if (unlikely(err)) {
+				brelse(gdb);
+				goto out;
+			}
+			brelse(gdb);
+		}
+
+		/* Zero out all of the reserved backup group descriptor
+		 * table blocks
+		 */
+		if (ext4_bg_has_super(sb, group)) {
+			err = sb_issue_zeroout(sb, gdblocks + start + 1,
+					reserved_gdb, GFP_NOFS);
+			if (err)
+				goto out;
+		}
+
+		/* Initialize group tables of the grop @group */
+		if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
+			goto handle_bb;
+
+		/* Zero out all of the inode table blocks */
+		block = group_data[i].inode_table;
+		ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
+			   block, sbi->s_itb_per_group);
+		err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+				       GFP_NOFS);
+		if (err)
+			goto out;
+
+handle_bb:
+		if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
+			goto handle_ib;
+
+		/* Initialize block bitmap of the @group */
+		block = group_data[i].block_bitmap;
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			goto out;
+
+		bh = bclean(handle, sb, block);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			goto out;
+		}
+		if (ext4_bg_has_super(sb, group)) {
+			ext4_debug("mark backup superblock %#04llx (+0)\n",
+				   start);
+			ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
+						     1);
+		}
+		ext4_mark_bitmap_end(group_data[i].blocks_count,
+				     sb->s_blocksize * 8, bh->b_data);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out;
+		brelse(bh);
+
+handle_ib:
+		if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
+			continue;
+
+		/* Initialize inode bitmap of the @group */
+		block = group_data[i].inode_bitmap;
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			goto out;
+		/* Mark unused entries in inode bitmap used */
+		bh = bclean(handle, sb, block);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			goto out;
+		}
+
+		ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+				     sb->s_blocksize * 8, bh->b_data);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out;
+		brelse(bh);
+	}
+	bh = NULL;
+
+	/* Mark group tables in block bitmap */
+	for (j = 0; j < GROUP_TABLE_COUNT; j++) {
+		count = group_table_count[j];
+		start = (&group_data[0].block_bitmap)[j];
+		block = start;
+		for (i = 1; i < flex_gd->count; i++) {
+			block += group_table_count[j];
+			if (block == (&group_data[i].block_bitmap)[j]) {
+				count += group_table_count[j];
+				continue;
+			}
+			err = set_flexbg_block_bitmap(sb, handle,
+						flex_gd, start, count);
+			if (err)
+				goto out;
+			count = group_table_count[j];
+			start = group_data[i].block_bitmap;
+			block = start;
+		}
+
+		if (count) {
+			err = set_flexbg_block_bitmap(sb, handle,
+						flex_gd, start, count);
+			if (err)
+				goto out;
+		}
+	}
+
+out:
+	brelse(bh);
+	err2 = ext4_journal_stop(handle);
+	if (err2 && !err)
+		err = err2;
+
+	return err;
+}
+
 /*
  * Set up the block and inode bitmaps, and the inode table for the new group.
  * This doesn't need to be part of the main transaction, since we are only
-- 
cgit v1.2.3-70-g09d2


From 083f5b24cc55448e0602a807a5c2872e1f3796e2 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:37:31 -0500
Subject: ext4: add a function which sets up a block group descriptors of a
 flex bg

This patch adds a function named ext4_setup_new_descs which sets up the
block group descriptors of a flex bg.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index e8ccb2f8f45..098bdb8e97c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1086,6 +1086,62 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
 	return err;
 }
 
+/*
+ * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_new_group_data	*group_data = flex_gd->groups;
+	struct ext4_group_desc		*gdp;
+	struct ext4_sb_info		*sbi = EXT4_SB(sb);
+	struct buffer_head		*gdb_bh;
+	ext4_group_t			group;
+	__u16				*bg_flags = flex_gd->bg_flags;
+	int				i, gdb_off, gdb_num, err = 0;
+	
+
+	for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
+		group = group_data->group;
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
+		 */
+		gdb_bh = sbi->s_group_desc[gdb_num];
+		/* Update group descriptor block for new group */
+		gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
+						 gdb_off * EXT4_DESC_SIZE(sb));
+
+		memset(gdp, 0, EXT4_DESC_SIZE(sb));
+		ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
+		ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+		ext4_inode_table_set(sb, gdp, group_data->inode_table);
+		ext4_free_group_clusters_set(sb, gdp,
+					     EXT4_B2C(sbi, group_data->free_blocks_count));
+		ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+		gdp->bg_flags = cpu_to_le16(*bg_flags);
+		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+		if (unlikely(err)) {
+			ext4_std_error(sb, err);
+			break;
+		}
+
+		/*
+		 * We can allocate memory for mb_alloc based on the new group
+		 * descriptor
+		 */
+		err = ext4_mb_add_groupinfo(sb, group, gdp);
+		if (err)
+			break;
+	}
+	return err;
+}
+
 /* Add group descriptor data to an existing or new group descriptor block.
  * Ensure we handle all possible error conditions _before_ we start modifying
  * the filesystem, because we cannot abort the transaction and not have it
-- 
cgit v1.2.3-70-g09d2


From 2e10e2f2e5a800a54ad2f16dfdd8c034e005958b Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:41:39 -0500
Subject: ext4: add a function which updates the super block during online
 resizing

This patch adds a function named ext4_update_super() which updates
super block so the newly created block groups are visible to the file
system.  This code is copied from ext4_group_add().

The function will be used by new resize implementation.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 098bdb8e97c..eb0aebcca55 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1142,6 +1142,100 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
 	return err;
 }
 
+/*
+ * ext4_update_super() updates the super block so that the newly added
+ * groups can be seen by the filesystem.
+ *
+ * @sb: super block
+ * @flex_gd: new added groups
+ */
+static void ext4_update_super(struct super_block *sb,
+			     struct ext4_new_flex_group_data *flex_gd)
+{
+	ext4_fsblk_t blocks_count = 0;
+	ext4_fsblk_t free_blocks = 0;
+	ext4_fsblk_t reserved_blocks = 0;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int i;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+	/*
+	 * Make the new blocks and inodes valid next.  We do this before
+	 * increasing the group count so that once the group is enabled,
+	 * all of its blocks and inodes are already valid.
+	 *
+	 * We always allocate group-by-group, then block-by-block or
+	 * inode-by-inode within a group, so enabling these
+	 * blocks/inodes before the group is live won't actually let us
+	 * allocate the new space yet.
+	 */
+	for (i = 0; i < flex_gd->count; i++) {
+		blocks_count += group_data[i].blocks_count;
+		free_blocks += group_data[i].free_blocks_count;
+	}
+
+	reserved_blocks = ext4_r_blocks_count(es) * 100;
+	do_div(reserved_blocks, ext4_blocks_count(es));
+	reserved_blocks *= blocks_count;
+	do_div(reserved_blocks, 100);
+
+	ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
+
+	/*
+	 * We need to protect s_groups_count against other CPUs seeing
+	 * inconsistent state in the superblock.
+	 *
+	 * The precise rules we use are:
+	 *
+	 * * Writers must perform a smp_wmb() after updating all
+	 *   dependent data and before modifying the groups count
+	 *
+	 * * Readers must perform an smp_rmb() after reading the groups
+	 *   count and before reading any dependent data.
+	 *
+	 * NB. These rules can be relaxed when checking the group count
+	 * while freeing data, as we can only allocate from a block
+	 * group after serialising against the group count, and we can
+	 * only then free after serialising in turn against that
+	 * allocation.
+	 */
+	smp_wmb();
+
+	/* Update the global fs size fields */
+	sbi->s_groups_count += flex_gd->count;
+
+	/* Update the reserved block counts only once the new group is
+	 * active. */
+	ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+				reserved_blocks);
+
+	/* Update the free space counts */
+	percpu_counter_add(&sbi->s_freeclusters_counter,
+			   EXT4_B2C(sbi, free_blocks));
+	percpu_counter_add(&sbi->s_freeinodes_counter,
+			   EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+				      EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, group_data[0].group);
+		atomic_add(EXT4_B2C(sbi, free_blocks),
+			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+	}
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: added group %u:"
+		       "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
+		       blocks_count, free_blocks, reserved_blocks);
+}
+
 /* Add group descriptor data to an existing or new group descriptor block.
  * Ensure we handle all possible error conditions _before_ we start modifying
  * the filesystem, because we cannot abort the transaction and not have it
-- 
cgit v1.2.3-70-g09d2


From c72df9f928efd5b17e84bdb7b8ec1be3b9c1ea9d Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:43:39 -0500
Subject: ext4: pass verify_reserved_gdb() the number of group decriptors

The 64bit resizer adds a flex group each time, so verify_reserved_gdb
can not use s_groups_count directly, it should use the number of group
decriptors before the added group.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index eb0aebcca55..12eace09654 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -656,10 +656,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
  * groups in current filesystem that have BACKUPS, or -ve error code.
  */
 static int verify_reserved_gdb(struct super_block *sb,
+			       ext4_group_t end,
 			       struct buffer_head *primary)
 {
 	const ext4_fsblk_t blk = primary->b_blocknr;
-	const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
 	unsigned three = 1;
 	unsigned five = 5;
 	unsigned seven = 7;
@@ -734,7 +734,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	if (!gdb_bh)
 		return -EIO;
 
-	gdbackups = verify_reserved_gdb(sb, gdb_bh);
+	gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
 	if (gdbackups < 0) {
 		err = gdbackups;
 		goto exit_bh;
@@ -897,7 +897,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 			err = -EIO;
 			goto exit_bh;
 		}
-		if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+		gdbackups = verify_reserved_gdb(sb, group, primary[res]);
+		if (gdbackups < 0) {
 			brelse(primary[res]);
 			err = gdbackups;
 			goto exit_bh;
-- 
cgit v1.2.3-70-g09d2


From 3fbea4b3683a5dfa86489ef7799cbe55e8003dfa Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:44:38 -0500
Subject: ext4: add a new function which allocates bitmaps and inode tables

This patch adds a new function named ext4_allocates_group_table()
which allocates block bitmaps, inode bitmaps and inode tables for a
flex groups and is used by resize code.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 12eace09654..a4075de73c7 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -189,6 +189,117 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
 	kfree(flex_gd);
 }
 
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize.  Note that this function allocates
+ * group tables from the 1st group of groups contained by @flexgd, which may
+ * be a partial of a flex group.
+ *
+ * @sb: super block of fs to which the groups belongs
+ */
+static void ext4_alloc_group_tables(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd,
+				int flexbg_size)
+{
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	ext4_fsblk_t start_blk;
+	ext4_fsblk_t last_blk;
+	ext4_group_t src_group;
+	ext4_group_t bb_index = 0;
+	ext4_group_t ib_index = 0;
+	ext4_group_t it_index = 0;
+	ext4_group_t group;
+	ext4_group_t last_group;
+	unsigned overhead;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+	src_group = group_data[0].group;
+	last_group  = src_group + flex_gd->count - 1;
+
+	BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+	       (last_group & ~(flexbg_size - 1))));
+next_group:
+	group = group_data[0].group;
+	start_blk = ext4_group_first_block_no(sb, src_group);
+	last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+	overhead = ext4_bg_has_super(sb, src_group) ?
+		   (1 + ext4_bg_num_gdb(sb, src_group) +
+		    le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+
+	start_blk += overhead;
+
+	BUG_ON(src_group >= group_data[0].group + flex_gd->count);
+	/* We collect contiguous blocks as much as possible. */
+	src_group++;
+	for (; src_group <= last_group; src_group++)
+		if (!ext4_bg_has_super(sb, src_group))
+			last_blk += group_data[src_group - group].blocks_count;
+		else
+			break;
+
+	/* Allocate block bitmaps */
+	for (; bb_index < flex_gd->count; bb_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[bb_index].block_bitmap = start_blk++;
+		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count--;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+	}
+
+	/* Allocate inode bitmaps */
+	for (; ib_index < flex_gd->count; ib_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[ib_index].inode_bitmap = start_blk++;
+		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count--;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+	}
+
+	/* Allocate inode tables */
+	for (; it_index < flex_gd->count; it_index++) {
+		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+			goto next_group;
+		group_data[it_index].inode_table = start_blk;
+		ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+		group -= group_data[0].group;
+		group_data[group].free_blocks_count -=
+					EXT4_SB(sb)->s_itb_per_group;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+
+		start_blk += EXT4_SB(sb)->s_itb_per_group;
+	}
+
+	if (test_opt(sb, DEBUG)) {
+		int i;
+		group = group_data[0].group;
+
+		printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+		       "%d groups, flexbg size is %d:\n", flex_gd->count,
+		       flexbg_size);
+
+		for (i = 0; i < flex_gd->count; i++) {
+			printk(KERN_DEBUG "adding %s group %u: %u "
+			       "blocks (%d free)\n",
+			       ext4_bg_has_super(sb, group + i) ? "normal" :
+			       "no-super", group + i,
+			       group_data[i].blocks_count,
+			       group_data[i].free_blocks_count);
+		}
+	}
+}
+
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
 				  ext4_fsblk_t blk)
 {
-- 
cgit v1.2.3-70-g09d2


From 4bac1f8cef7bfd2c62793f75aba66a5b8357dede Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Tue, 3 Jan 2012 23:44:38 -0500
Subject: ext4: add a new function which adds a flex group to a fs

This patch adds a new function named ext4_flex_group_add() which adds a
flex group to a fs.  The function is used by 64bit-resize interface.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a4075de73c7..dac23561f3e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1348,6 +1348,88 @@ static void ext4_update_super(struct super_block *sb,
 		       blocks_count, free_blocks, reserved_blocks);
 }
 
+/* Add a flex group to an fs. Ensure we handle all possible error conditions
+ * _before_ we start modifying the filesystem, because we cannot abort the
+ * transaction and not have it write the data to disk.
+ */
+static int ext4_flex_group_add(struct super_block *sb,
+			       struct inode *resize_inode,
+			       struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t o_blocks_count;
+	ext4_grpblk_t last;
+	ext4_group_t group;
+	handle_t *handle;
+	unsigned reserved_gdb;
+	int err = 0, err2 = 0, credit;
+
+	BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+	o_blocks_count = ext4_blocks_count(es);
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	BUG_ON(last);
+
+	err = setup_new_flex_group_blocks(sb, flex_gd);
+	if (err)
+		goto exit;
+	/*
+	 * We will always be modifying at least the superblock and  GDT
+	 * block.  If we are adding a group past the last current GDT block,
+	 * we will also modify the inode and the dindirect block.  If we
+	 * are adding a group with superblock/GDT backups  we will also
+	 * modify each of the reserved GDT dindirect blocks.
+	 */
+	credit = flex_gd->count * 4 + reserved_gdb;
+	handle = ext4_journal_start_sb(sb, credit);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit;
+	}
+
+	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+	if (err)
+		goto exit_journal;
+
+	group = flex_gd->groups[0].group;
+	BUG_ON(group != EXT4_SB(sb)->s_groups_count);
+	err = ext4_add_new_descs(handle, sb, group,
+				resize_inode, flex_gd->count);
+	if (err)
+		goto exit_journal;
+
+	err = ext4_setup_new_descs(handle, sb, flex_gd);
+	if (err)
+		goto exit_journal;
+
+	ext4_update_super(sb, flex_gd);
+
+	err = ext4_handle_dirty_super(handle, sb);
+
+exit_journal:
+	err2 = ext4_journal_stop(handle);
+	if (!err)
+		err = err2;
+
+	if (!err) {
+		int i;
+		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext4_super_block));
+		for (i = 0; i < flex_gd->count; i++, group++) {
+			struct buffer_head *gdb_bh;
+			int gdb_num;
+			gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
+				       gdb_bh->b_size);
+		}
+	}
+exit:
+	return err;
+}
+
 /* Add group descriptor data to an existing or new group descriptor block.
  * Ensure we handle all possible error conditions _before_ we start modifying
  * the filesystem, because we cannot abort the transaction and not have it
-- 
cgit v1.2.3-70-g09d2


From 19c5246d251640ac76daa4d34165af78c64b1454 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Wed, 4 Jan 2012 17:09:44 -0500
Subject: ext4: add new online resize interface

This patch adds new online resize interface, whose input argument is a
64-bit integer indicating how many blocks there are in the resized fs.

In new resize impelmentation, all work like allocating group tables
are done by kernel side, so the new resize interface can support
flex_bg feature and prepares ground for suppoting resize with features
like bigalloc and exclude bitmap. Besides these, user-space tools just
passes in the new number of blocks.

We delay initializing the bitmaps and inode tables of added groups if
possible and add multi groups (a flex groups) each time, so new resize
is very fast like mkfs.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/filesystems/ext4.txt |   7 ++
 fs/ext4/ext4.h                     |   2 +
 fs/ext4/ioctl.c                    |  57 ++++++++++++
 fs/ext4/resize.c                   | 177 +++++++++++++++++++++++++++++++++++++
 4 files changed, 243 insertions(+)

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 4917cf24a5e..10ec4639f15 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -581,6 +581,13 @@ Table of Ext4 specific ioctls
 			      behaviour may change in the future as it is
 			      not necessary and has been done this way only
 			      for sake of simplicity.
+
+ EXT4_IOC_RESIZE_FS	      Resize the filesystem to a new size.  The number
+			      of blocks of resized filesystem is passed in via
+			      64 bit integer argument.  The kernel allocates
+			      bitmaps and inode table, the userspace tool thus
+			      just passes the new number of blocks.
+
 ..............................................................................
 
 References
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 05058e2b7f4..4bc0e82a905 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -583,6 +583,7 @@ enum {
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -1929,6 +1930,7 @@ extern int ext4_group_add(struct super_block *sb,
 extern int ext4_group_extend(struct super_block *sb,
 				struct ext4_super_block *es,
 				ext4_fsblk_t n_blocks_count);
+extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
 
 /* super.c */
 extern void *ext4_kvmalloc(size_t size, gfp_t flags);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ff1aab7cd6e..c1a98804a38 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -18,6 +18,8 @@
 #include "ext4_jbd2.h"
 #include "ext4.h"
 
+#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
@@ -329,6 +331,60 @@ mext_out:
 		return err;
 	}
 
+	case EXT4_IOC_RESIZE_FS: {
+		ext4_fsblk_t n_blocks_count;
+		struct super_block *sb = inode->i_sb;
+		int err = 0, err2 = 0;
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not (yet) supported with bigalloc");
+			return -EOPNOTSUPP;
+		}
+
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+			       EXT4_FEATURE_INCOMPAT_META_BG)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not (yet) supported with meta_bg");
+			return -EOPNOTSUPP;
+		}
+
+		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+				   sizeof(__u64))) {
+			return -EFAULT;
+		}
+
+		if (n_blocks_count > MAX_32_NUM &&
+		    !EXT4_HAS_INCOMPAT_FEATURE(sb,
+					       EXT4_FEATURE_INCOMPAT_64BIT)) {
+			ext4_msg(sb, KERN_ERR,
+				 "File system only supports 32-bit block numbers");
+			return -EOPNOTSUPP;
+		}
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			goto resizefs_out;
+
+		err = ext4_resize_fs(sb, n_blocks_count);
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+		mnt_drop_write(filp->f_path.mnt);
+resizefs_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
 	case FITRIM:
 	{
 		struct request_queue *q = bdev_get_queue(sb->s_bdev);
@@ -427,6 +483,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	}
 	case EXT4_IOC_MOVE_EXT:
 	case FITRIM:
+	case EXT4_IOC_RESIZE_FS:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index dac23561f3e..5fe2a013ee6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1430,6 +1430,70 @@ exit:
 	return err;
 }
 
+static int ext4_setup_next_flex_gd(struct super_block *sb,
+				    struct ext4_new_flex_group_data *flex_gd,
+				    ext4_fsblk_t n_blocks_count,
+				    unsigned long flexbg_size)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	ext4_fsblk_t o_blocks_count;
+	ext4_group_t n_group;
+	ext4_group_t group;
+	ext4_group_t last_group;
+	ext4_grpblk_t last;
+	ext4_grpblk_t blocks_per_group;
+	unsigned long i;
+
+	blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+
+	o_blocks_count = ext4_blocks_count(es);
+
+	if (o_blocks_count == n_blocks_count)
+		return 0;
+
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	BUG_ON(last);
+	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
+
+	last_group = group | (flexbg_size - 1);
+	if (last_group > n_group)
+		last_group = n_group;
+
+	flex_gd->count = last_group - group + 1;
+
+	for (i = 0; i < flex_gd->count; i++) {
+		int overhead;
+
+		group_data[i].group = group + i;
+		group_data[i].blocks_count = blocks_per_group;
+		overhead = ext4_bg_has_super(sb, group + i) ?
+			   (1 + ext4_bg_num_gdb(sb, group + i) +
+			    le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+		group_data[i].free_blocks_count = blocks_per_group - overhead;
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+			flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
+					       EXT4_BG_INODE_UNINIT;
+		else
+			flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
+	}
+
+	if (last_group == n_group &&
+	    EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+		/* We need to initialize block bitmap of last group. */
+		flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
+
+	if ((last_group == n_group) && (last != blocks_per_group - 1)) {
+		group_data[i - 1].blocks_count = last + 1;
+		group_data[i - 1].free_blocks_count -= blocks_per_group-
+					last - 1;
+	}
+
+	return 1;
+}
+
 /* Add group descriptor data to an existing or new group descriptor block.
  * Ensure we handle all possible error conditions _before_ we start modifying
  * the filesystem, because we cannot abort the transaction and not have it
@@ -1827,3 +1891,116 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 exit_put:
 	return err;
 } /* ext4_group_extend */
+
+/*
+ * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
+ *
+ * @sb: super block of the fs to be resized
+ * @n_blocks_count: the number of blocks resides in the resized fs
+ */
+int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
+{
+	struct ext4_new_flex_group_data *flex_gd = NULL;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head *bh;
+	struct inode *resize_inode;
+	ext4_fsblk_t o_blocks_count;
+	ext4_group_t o_group;
+	ext4_group_t n_group;
+	ext4_grpblk_t offset;
+	unsigned long n_desc_blocks;
+	unsigned long o_desc_blocks;
+	unsigned long desc_blocks;
+	int err = 0, flexbg_size = 1;
+
+	o_blocks_count = ext4_blocks_count(es);
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu "
+		       "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+
+	if (n_blocks_count < o_blocks_count) {
+		/* On-line shrinking not supported */
+		ext4_warning(sb, "can't shrink FS - resize aborted");
+		return -EINVAL;
+	}
+
+	if (n_blocks_count == o_blocks_count)
+		/* Nothing need to do */
+		return 0;
+
+	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset);
+
+	n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
+			EXT4_DESC_PER_BLOCK(sb);
+	o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+			EXT4_DESC_PER_BLOCK(sb);
+	desc_blocks = n_desc_blocks - o_desc_blocks;
+
+	if (desc_blocks &&
+	    (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
+	     le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
+		ext4_warning(sb, "No reserved GDT blocks, can't resize");
+		return -EPERM;
+	}
+
+	resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+	if (IS_ERR(resize_inode)) {
+		ext4_warning(sb, "Error opening resize inode");
+		return PTR_ERR(resize_inode);
+	}
+
+	/* See if the device is actually as big as what was requested */
+	bh = sb_bread(sb, n_blocks_count - 1);
+	if (!bh) {
+		ext4_warning(sb, "can't read last block, resize aborted");
+		return -ENOSPC;
+	}
+	brelse(bh);
+
+	if (offset != 0) {
+		/* extend the last group */
+		ext4_grpblk_t add;
+		add = EXT4_BLOCKS_PER_GROUP(sb) - offset;
+		err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+		if (err)
+			goto out;
+	}
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    es->s_log_groups_per_flex)
+		flexbg_size = 1 << es->s_log_groups_per_flex;
+
+	o_blocks_count = ext4_blocks_count(es);
+	if (o_blocks_count == n_blocks_count)
+		goto out;
+
+	flex_gd = alloc_flex_gd(flexbg_size);
+	if (flex_gd == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Add flex groups. Note that a regular group is a
+	 * flex group with 1 group.
+	 */
+	while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
+					      flexbg_size)) {
+		ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
+		err = ext4_flex_group_add(sb, resize_inode, flex_gd);
+		if (unlikely(err))
+			break;
+	}
+
+out:
+	if (flex_gd)
+		free_flex_gd(flex_gd);
+
+	iput(resize_inode);
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu "
+		       "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+	return err;
+}
-- 
cgit v1.2.3-70-g09d2


From d89651c8e222b2d2797bf66d4eb7064459f4f4f4 Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Wed, 4 Jan 2012 17:09:48 -0500
Subject: ext4: let ext4_group_extend() use common code

ext4_group_extend_no_check() is moved out from ext4_group_extend(),
this patch lets ext4_group_extend() call ext4_group_extentd_no_check()
instead.

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 41 ++---------------------------------------
 1 file changed, 2 insertions(+), 39 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5fe2a013ee6..eba706d9276 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1795,8 +1795,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
 	struct buffer_head *bh;
-	handle_t *handle;
-	int err, err2;
+	int err;
 	ext4_group_t group;
 
 	o_blocks_count = ext4_blocks_count(es);
@@ -1852,43 +1851,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	}
 	brelse(bh);
 
-	/* We will update the superblock, one block bitmap, and
-	 * one group descriptor via ext4_free_blocks().
-	 */
-	handle = ext4_journal_start_sb(sb, 3);
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		ext4_warning(sb, "error %d on journal start", err);
-		goto exit_put;
-	}
-
-	if ((err = ext4_journal_get_write_access(handle,
-						 EXT4_SB(sb)->s_sbh))) {
-		ext4_warning(sb, "error %d on journal write access", err);
-		ext4_journal_stop(handle);
-		goto exit_put;
-	}
-	ext4_blocks_count_set(es, o_blocks_count + add);
-	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
-		   o_blocks_count + add);
-	/* We add the blocks to the bitmap and set the group need init bit */
-	err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
-	ext4_handle_dirty_super(handle, sb);
-	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
-		   o_blocks_count + add);
-	err2 = ext4_journal_stop(handle);
-	if (!err && err2)
-		err = err2;
-
-	if (err)
-		goto exit_put;
-
-	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
-		       ext4_blocks_count(es));
-	update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
-		       sizeof(struct ext4_super_block));
-exit_put:
+	err = ext4_group_extend_no_check(sb, o_blocks_count, add);
 	return err;
 } /* ext4_group_extend */
 
-- 
cgit v1.2.3-70-g09d2


From 61f296cc49751f1dc992039229d12b0de7e0c2ae Mon Sep 17 00:00:00 2001
From: Yongqiang Yang <xiaoqiangnk@gmail.com>
Date: Wed, 4 Jan 2012 17:09:50 -0500
Subject: ext4: let ext4_group_add() use common code

This patch lets ext4_group_add() call ext4_flex_group_add().

Signed-off-by: Yongqiang Yang <xiaoqiangnk@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 309 ++-----------------------------------------------------
 1 file changed, 10 insertions(+), 299 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index eba706d9276..f9d948f0eb8 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -594,137 +594,6 @@ out:
 	return err;
 }
 
-/*
- * Set up the block and inode bitmaps, and the inode table for the new group.
- * This doesn't need to be part of the main transaction, since we are only
- * changing blocks outside the actual filesystem.  We still do journaling to
- * ensure the recovery is correct in case of a failure just after resize.
- * If any part of this fails, we simply abort the resize.
- */
-static int setup_new_group_blocks(struct super_block *sb,
-				  struct ext4_new_group_data *input)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
-	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
-		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
-	unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
-	struct buffer_head *bh;
-	handle_t *handle;
-	ext4_fsblk_t block;
-	ext4_grpblk_t bit;
-	int i;
-	int err = 0, err2;
-
-	/* This transaction may be extended/restarted along the way */
-	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
-
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	BUG_ON(input->group != sbi->s_groups_count);
-
-	/* Copy all of the GDT blocks into the backup in this group */
-	for (i = 0, bit = 1, block = start + 1;
-	     i < gdblocks; i++, block++, bit++) {
-		struct buffer_head *gdb;
-
-		ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
-		err = extend_or_restart_transaction(handle, 1);
-		if (err)
-			goto exit_journal;
-
-		gdb = sb_getblk(sb, block);
-		if (!gdb) {
-			err = -EIO;
-			goto exit_journal;
-		}
-		if ((err = ext4_journal_get_write_access(handle, gdb))) {
-			brelse(gdb);
-			goto exit_journal;
-		}
-		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
-		set_buffer_uptodate(gdb);
-		err = ext4_handle_dirty_metadata(handle, NULL, gdb);
-		if (unlikely(err)) {
-			brelse(gdb);
-			goto exit_journal;
-		}
-		brelse(gdb);
-	}
-
-	/* Zero out all of the reserved backup group descriptor table blocks */
-	ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
-			block, sbi->s_itb_per_group);
-	err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
-			       GFP_NOFS);
-	if (err)
-		goto exit_journal;
-
-	err = extend_or_restart_transaction(handle, 2);
-	if (err)
-		goto exit_journal;
-
-	bh = bclean(handle, sb, input->block_bitmap);
-	if (IS_ERR(bh)) {
-		err = PTR_ERR(bh);
-		goto exit_journal;
-	}
-
-	if (ext4_bg_has_super(sb, input->group)) {
-		ext4_debug("mark backup group tables %#04llx (+0)\n", start);
-		ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1);
-	}
-
-	ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
-		   input->block_bitmap - start);
-	ext4_set_bit(input->block_bitmap - start, bh->b_data);
-	ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
-		   input->inode_bitmap - start);
-	ext4_set_bit(input->inode_bitmap - start, bh->b_data);
-
-	/* Zero out all of the inode table blocks */
-	block = input->inode_table;
-	ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
-			block, sbi->s_itb_per_group);
-	err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
-	if (err)
-		goto exit_bh;
-	ext4_set_bits(bh->b_data, input->inode_table - start,
-		      sbi->s_itb_per_group);
-
-
-	ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
-			     bh->b_data);
-	err = ext4_handle_dirty_metadata(handle, NULL, bh);
-	if (unlikely(err)) {
-		ext4_std_error(sb, err);
-		goto exit_bh;
-	}
-	brelse(bh);
-	/* Mark unused entries in inode bitmap used */
-	ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
-		   input->inode_bitmap, input->inode_bitmap - start);
-	if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
-		err = PTR_ERR(bh);
-		goto exit_journal;
-	}
-
-	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-			     bh->b_data);
-	err = ext4_handle_dirty_metadata(handle, NULL, bh);
-	if (unlikely(err))
-		ext4_std_error(sb, err);
-exit_bh:
-	brelse(bh);
-
-exit_journal:
-	if ((err2 = ext4_journal_stop(handle)) && !err)
-		err = err2;
-
-	return err;
-}
-
 /*
  * Iterate through the groups which hold BACKUP superblock/GDT copies in an
  * ext4 filesystem.  The counters should be initialized to 1, 5, and 7 before
@@ -1509,16 +1378,15 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
  */
 int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 {
+	struct ext4_new_flex_group_data flex_gd;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
 		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
-	struct buffer_head *primary = NULL;
-	struct ext4_group_desc *gdp;
 	struct inode *inode = NULL;
-	handle_t *handle;
 	int gdb_off, gdb_num;
-	int err, err2;
+	int err;
+	__u16 bg_flags = 0;
 
 	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
 	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
@@ -1557,172 +1425,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	}
 
 
-	if ((err = verify_group_input(sb, input)))
-		goto exit_put;
-
-	if ((err = setup_new_group_blocks(sb, input)))
-		goto exit_put;
-
-	/*
-	 * We will always be modifying at least the superblock and a GDT
-	 * block.  If we are adding a group past the last current GDT block,
-	 * we will also modify the inode and the dindirect block.  If we
-	 * are adding a group with superblock/GDT backups  we will also
-	 * modify each of the reserved GDT dindirect blocks.
-	 */
-	handle = ext4_journal_start_sb(sb,
-				       ext4_bg_has_super(sb, input->group) ?
-				       3 + reserved_gdb : 4);
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		goto exit_put;
-	}
-
-	if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
-		goto exit_journal;
-
-        /*
-         * We will only either add reserved group blocks to a backup group
-         * or remove reserved blocks for the first group in a new group block.
-         * Doing both would be mean more complex code, and sane people don't
-         * use non-sparse filesystems anymore.  This is already checked above.
-         */
-	if (gdb_off) {
-		primary = sbi->s_group_desc[gdb_num];
-		if ((err = ext4_journal_get_write_access(handle, primary)))
-			goto exit_journal;
-
-		if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) {
-			err = reserve_backup_gdb(handle, inode, input->group);
-			if (err)
-				goto exit_journal;
-		}
-	} else {
-		/*
-		 * Note that we can access new group descriptor block safely
-		 * only if add_new_gdb() succeeds.
-		 */
-		err = add_new_gdb(handle, inode, input->group);
-		if (err)
-			goto exit_journal;
-		primary = sbi->s_group_desc[gdb_num];
-	}
-
-        /*
-         * OK, now we've set up the new group.  Time to make it active.
-         *
-         * so we have to be safe wrt. concurrent accesses the group
-         * data.  So we need to be careful to set all of the relevant
-         * group descriptor data etc. *before* we enable the group.
-         *
-         * The key field here is sbi->s_groups_count: as long as
-         * that retains its old value, nobody is going to access the new
-         * group.
-         *
-         * So first we update all the descriptor metadata for the new
-         * group; then we update the total disk blocks count; then we
-         * update the groups count to enable the group; then finally we
-         * update the free space counts so that the system can start
-         * using the new disk blocks.
-         */
-
-	/* Update group descriptor block for new group */
-	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
-					 gdb_off * EXT4_DESC_SIZE(sb));
-
-	memset(gdp, 0, EXT4_DESC_SIZE(sb));
-	ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
-	ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
-	ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
-	ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
-	ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
-	gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
-
-	/*
-	 * We can allocate memory for mb_alloc based on the new group
-	 * descriptor
-	 */
-	err = ext4_mb_add_groupinfo(sb, input->group, gdp);
+	err = verify_group_input(sb, input);
 	if (err)
-		goto exit_journal;
-
-	/*
-	 * Make the new blocks and inodes valid next.  We do this before
-	 * increasing the group count so that once the group is enabled,
-	 * all of its blocks and inodes are already valid.
-	 *
-	 * We always allocate group-by-group, then block-by-block or
-	 * inode-by-inode within a group, so enabling these
-	 * blocks/inodes before the group is live won't actually let us
-	 * allocate the new space yet.
-	 */
-	ext4_blocks_count_set(es, ext4_blocks_count(es) +
-		input->blocks_count);
-	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb));
-
-	/*
-	 * We need to protect s_groups_count against other CPUs seeing
-	 * inconsistent state in the superblock.
-	 *
-	 * The precise rules we use are:
-	 *
-	 * * Writers must perform a smp_wmb() after updating all dependent
-	 *   data and before modifying the groups count
-	 *
-	 * * Readers must perform an smp_rmb() after reading the groups count
-	 *   and before reading any dependent data.
-	 *
-	 * NB. These rules can be relaxed when checking the group count
-	 * while freeing data, as we can only allocate from a block
-	 * group after serialising against the group count, and we can
-	 * only then free after serialising in turn against that
-	 * allocation.
-	 */
-	smp_wmb();
-
-	/* Update the global fs size fields */
-	sbi->s_groups_count++;
-
-	err = ext4_handle_dirty_metadata(handle, NULL, primary);
-	if (unlikely(err)) {
-		ext4_std_error(sb, err);
-		goto exit_journal;
-	}
-
-	/* Update the reserved block counts only once the new group is
-	 * active. */
-	ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
-		input->reserved_blocks);
-
-	/* Update the free space counts */
-	percpu_counter_add(&sbi->s_freeclusters_counter,
-			   EXT4_B2C(sbi, input->free_blocks_count));
-	percpu_counter_add(&sbi->s_freeinodes_counter,
-			   EXT4_INODES_PER_GROUP(sb));
-
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
-	    sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group;
-		flex_group = ext4_flex_group(sbi, input->group);
-		atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
-			   &sbi->s_flex_groups[flex_group].free_clusters);
-		atomic_add(EXT4_INODES_PER_GROUP(sb),
-			   &sbi->s_flex_groups[flex_group].free_inodes);
-	}
-
-	ext4_handle_dirty_super(handle, sb);
+		goto out;
 
-exit_journal:
-	if ((err2 = ext4_journal_stop(handle)) && !err)
-		err = err2;
-	if (!err && primary) {
-		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
-			       sizeof(struct ext4_super_block));
-		update_backups(sb, primary->b_blocknr, primary->b_data,
-			       primary->b_size);
-	}
-exit_put:
+	flex_gd.count = 1;
+	flex_gd.groups = input;
+	flex_gd.bg_flags = &bg_flags;
+	err = ext4_flex_group_add(sb, inode, &flex_gd);
+out:
 	iput(inode);
 	return err;
 } /* ext4_group_add */
-- 
cgit v1.2.3-70-g09d2


From 014a1770371a028d22f364718c805f4216911ecd Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Wed, 4 Jan 2012 17:09:52 -0500
Subject: ext4: add missing ext4_resize_end on error paths

Online resize ioctls 'EXT4_IOC_GROUP_EXTEND' and 'EXT4_IOC_GROUP_ADD'
call ext4_resize_begin() to check permissions and to set the
EXT4_RESIZING bit lock, they do their work and they must finish with
ext4_resize_end() which calls clear_bit_unlock() to unlock and to
avoid -EBUSY errors for the next resize operations.

This patch adds the missing ext4_resize_end() calls on error paths.

Patch tested.

Cc: stable@vger.kernel.org
Signed-off-by: Djalal Harouni <tixxdz@opendz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1a98804a38..b81a5f1b697 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -184,19 +184,22 @@ setversion_out:
 		if (err)
 			return err;
 
-		if (get_user(n_blocks_count, (__u32 __user *)arg))
-			return -EFAULT;
+		if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+			err = -EFAULT;
+			goto group_extend_out;
+		}
 
 		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
 			ext4_msg(sb, KERN_ERR,
 				 "Online resizing not supported with bigalloc");
-			return -EOPNOTSUPP;
+			err = -EOPNOTSUPP;
+			goto group_extend_out;
 		}
 
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
-			return err;
+			goto group_extend_out;
 
 		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
 		if (EXT4_SB(sb)->s_journal) {
@@ -206,9 +209,10 @@ setversion_out:
 		}
 		if (err == 0)
 			err = err2;
+
 		mnt_drop_write(filp->f_path.mnt);
+group_extend_out:
 		ext4_resize_end(sb);
-
 		return err;
 	}
 
@@ -267,19 +271,22 @@ mext_out:
 			return err;
 
 		if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
-				sizeof(input)))
-			return -EFAULT;
+				sizeof(input))) {
+			err = -EFAULT;
+			goto group_add_out;
+		}
 
 		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
 			ext4_msg(sb, KERN_ERR,
 				 "Online resizing not supported with bigalloc");
-			return -EOPNOTSUPP;
+			err = -EOPNOTSUPP;
+			goto group_add_out;
 		}
 
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
-			return err;
+			goto group_add_out;
 
 		err = ext4_group_add(sb, &input);
 		if (EXT4_SB(sb)->s_journal) {
@@ -289,9 +296,10 @@ mext_out:
 		}
 		if (err == 0)
 			err = err2;
+
 		mnt_drop_write(filp->f_path.mnt);
+group_add_out:
 		ext4_resize_end(sb);
-
 		return err;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 1d526fc91bea04ee35b7599bf8b82f86c0aaf46c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Wed, 4 Jan 2012 21:22:51 -0500
Subject: ext4: Report max_batch_time option correctly

Currently the value reported for max_batch_time is really the
value of min_batch_time.

Reported-by: Russell Coker <russell@coker.com.au>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 35377d57ec4..36570b7af7c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1096,7 +1096,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	}
 	if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
 		seq_printf(seq, ",max_batch_time=%u",
-			   (unsigned) sbi->s_min_batch_time);
+			   (unsigned) sbi->s_max_batch_time);
 	}
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From 9b90e5e02896406a6da28a376568003d14c06770 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 4 Jan 2012 22:01:53 -0500
Subject: ext4: reserve new feature flag codepoints

Reserve the ext4 features flags EXT4_FEATURE_RO_COMPAT_METADATA_CSUM,
EXT4_FEATURE_INCOMPAT_INLINEDATA, and EXT4_FEATURE_INCOMPAT_LARGEDIR.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4bc0e82a905..13d15149c85 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1407,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
 #define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC		0x0200
+#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM	0x0400
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1419,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
-- 
cgit v1.2.3-70-g09d2


From 9837d8e982b7e87a7207f90618e45d460e196e6a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 4 Jan 2012 22:03:11 -0500
Subject: jbd2: fix hung processes in jbd2_journal_lock_updates()

Toshiyuki Okajima found out that when running

for ((i=0; i < 100000; i++)); do
        if ((i%2 == 0)); then
                chattr +j /mnt/file
        else
                chattr -j /mnt/file
        fi
        echo "0" >> /mnt/file
done

process sometimes hangs indefinitely in jbd2_journal_lock_updates().

Toshiyuki identified that the following race happens:

jbd2_journal_lock_updates()            |jbd2_journal_stop()
---------------------------------------+---------------------------------------
 write_lock(&journal->j_state_lock)    |    .
 ++journal->j_barrier_count            |    .
 spin_lock(&tran->t_handle_lock)       |    .
 atomic_read(&tran->t_updates) //not 0 |
                                       | atomic_dec_and_test(&tran->t_updates)
                                       |    // t_updates = 0
                                       | wake_up(&journal->j_wait_updates)
 prepare_to_wait()                     |    // no process is woken up.
 spin_unlock(&tran->t_handle_lock)     |
 write_unlock(&journal->j_state_lock)  |
 schedule() // never return            |

We fix the problem by first calling prepare_to_wait() and only after that
checking t_updates in jbd2_journal_lock_updates().

Reported-and-analyzed-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0e41a4c080..35ae096bed5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal)
 			break;
 
 		spin_lock(&transaction->t_handle_lock);
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+				TASK_UNINTERRUPTIBLE);
 		if (!atomic_read(&transaction->t_updates)) {
 			spin_unlock(&transaction->t_handle_lock);
+			finish_wait(&journal->j_wait_updates, &wait);
 			break;
 		}
-		prepare_to_wait(&journal->j_wait_updates, &wait,
-				TASK_UNINTERRUPTIBLE);
 		spin_unlock(&transaction->t_handle_lock);
 		write_unlock(&journal->j_state_lock);
 		schedule();
-- 
cgit v1.2.3-70-g09d2


From 176576dbc8141528557eeeb007af2d5a2a4891ef Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Wed, 4 Jan 2012 22:32:12 -0500
Subject: ext4: make local symbol ext4_initxattrs static

The ext4_initxattrs symbol is used only in this file, so it should be
declared static.

Signed-off-by: Djalal Harouni <tixxdz@opendz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/xattr_security.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 34e4350dd4d..e247f8bca8e 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -48,8 +48,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
-		    void *fs_info)
+static int
+ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		void *fs_info)
 {
 	const struct xattr *xattr;
 	handle_t *handle = fs_info;
-- 
cgit v1.2.3-70-g09d2


From 5f163cc759a9fa8844a4efcf1f579dc5b2ca2491 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 4 Jan 2012 22:33:28 -0500
Subject: ext4: make more symbols static

A couple more functions can reasonably be made static if desired.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 4 +++-
 fs/ext4/ext4.h   | 5 -----
 fs/ext4/inode.c  | 5 ++++-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 12ccacda44e..f9e2cd8cf71 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -23,6 +23,8 @@
 
 #include <trace/events/ext4.h>
 
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+					    ext4_group_t block_group);
 /*
  * balloc.c contains the blocks allocation and deallocation routines
  */
@@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
  * This function returns the number of file system metadata clusters at
  * the beginning of a block group, including the reserved gdt blocks.
  */
-unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
 				     ext4_group_t block_group)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 13d15149c85..e7dc9ad7394 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1803,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb,
 extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
 					      ext4_group_t block_group,
 					      struct ext4_group_desc *gdp);
-extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
-					    ext4_group_t block_group);
 extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
 					   ext4_group_t block_group,
 					   struct ext4_group_desc *gdp);
@@ -1896,9 +1894,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_discard_partial_page_buffers(handle_t *handle,
 		struct address_space *mapping, loff_t from,
 		loff_t length, int flags);
-extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e6cc24dfa98..a526684cbe3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -72,6 +72,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+		struct inode *inode, struct page *page, loff_t from,
+		loff_t length, int flags);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -3161,7 +3164,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
  *
  * Returns zero on sucess or negative on failure.
  */
-int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
 		struct inode *inode, struct page *page, loff_t from,
 		loff_t length, int flags)
 {
-- 
cgit v1.2.3-70-g09d2


From d50f2ab6f050311dbf7b8f5501b25f0bf64a439b Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Tue, 10 Jan 2012 11:51:10 -0500
Subject: ext4: fix undefined behavior in ext4_fill_flex_info()

Commit 503358ae01b70ce6909d19dd01287093f6b6271c ("ext4: avoid divide by
zero when trying to mount a corrupted file system") fixes CVE-2009-4307
by performing a sanity check on s_log_groups_per_flex, since it can be
set to a bogus value by an attacker.

	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
	groups_per_flex = 1 << sbi->s_log_groups_per_flex;

	if (groups_per_flex < 2) { ... }

This patch fixes two potential issues in the previous commit.

1) The sanity check might only work on architectures like PowerPC.
On x86, 5 bits are used for the shifting amount.  That means, given a
large s_log_groups_per_flex value like 36, groups_per_flex = 1 << 36
is essentially 1 << 4 = 16, rather than 0.  This will bypass the check,
leaving s_log_groups_per_flex and groups_per_flex inconsistent.

2) The sanity check relies on undefined behavior, i.e., oversized shift.
A standard-confirming C compiler could rewrite the check in unexpected
ways.  Consider the following equivalent form, assuming groups_per_flex
is unsigned for simplicity.

	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
	if (groups_per_flex == 0 || groups_per_flex == 1) {

We compile the code snippet using Clang 3.0 and GCC 4.6.  Clang will
completely optimize away the check groups_per_flex == 0, leaving the
patched code as vulnerable as the original.  GCC keeps the check, but
there is no guarantee that future versions will do the same.

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 36570b7af7c..108c3af8617 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2006,17 +2006,16 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	struct ext4_group_desc *gdp = NULL;
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
-	int groups_per_flex = 0;
+	unsigned int groups_per_flex = 0;
 	size_t size;
 	int i;
 
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
-	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
-	if (groups_per_flex < 2) {
+	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
 		sbi->s_log_groups_per_flex = 0;
 		return 1;
 	}
+	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
 
 	/* We allocate both existing and potentially added groups */
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
-- 
cgit v1.2.3-70-g09d2