From dcd0538ff4e854fa9d7f4630b359ca8fdb5cb5a8 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 16 Jan 2007 11:32:23 -0800
Subject: ocfs2: sparse b-tree support

Introduce tree rotations into the b-tree code. This will allow ocfs2 to
support sparse files. Much of the added code is designed to be generic (in
the ocfs2 sense) so that it can later be re-used to implement large
extended attributes.

This patch only adds the rotation code and does minimal updates to callers
of the extent API.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0b82e804432..b0880fdb310 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
 			struct buffer_head *fe_bh,
-			u64 blkno,
+			u32 cpos,
+			u64 start_blk,
 			u32 new_clusters,
 			struct ocfs2_alloc_context *meta_ac);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
-- 
cgit v1.2.3-70-g09d2


From 363041a5f74b953ab6b705ac9c88e5eda218a24b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 17 Jan 2007 12:31:35 -0800
Subject: ocfs2: temporarily remove extent map caching

The code in extent_map.c is not prepared to deal with a subtree being
rotated between lookups. This can happen when filling holes in sparse files.
Instead of a lengthy patch to update the code (which would likely lose the
benefit of caching subtree roots), we remove most of the algorithms and
implement a simple path-based lookup. A less ambitious extent caching scheme
will be added in a later patch.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c      |    5 +-
 fs/ocfs2/alloc.h      |    3 +
 fs/ocfs2/aops.c       |    8 +-
 fs/ocfs2/dir.c        |    2 +-
 fs/ocfs2/dlmglue.c    |    4 -
 fs/ocfs2/extent_map.c | 1024 ++++---------------------------------------------
 fs/ocfs2/extent_map.h |   19 +-
 fs/ocfs2/inode.c      |    6 +-
 fs/ocfs2/inode.h      |    1 -
 fs/ocfs2/journal.c    |    3 +-
 fs/ocfs2/namei.c      |    3 +-
 fs/ocfs2/ocfs2.h      |    5 -
 fs/ocfs2/slot_map.c   |    2 +-
 fs/ocfs2/super.c      |    7 -
 14 files changed, 96 insertions(+), 996 deletions(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a9669686757..85a05f12024 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1146,9 +1146,8 @@ static void find_leaf_ins(void *data, struct buffer_head *bh)
  *
  * This function doesn't handle non btree extent lists.
  */
-static int ocfs2_find_leaf(struct inode *inode,
-			   struct ocfs2_extent_list *root_el, u32 cpos,
-			   struct buffer_head **leaf_bh)
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+		    u32 cpos, struct buffer_head **leaf_bh)
 {
 	int ret;
 	struct buffer_head *bh = NULL;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index b0880fdb310..bff2a162b03 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -80,4 +80,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct buffer_head *fe_bh,
 			  struct ocfs2_truncate_context *tc);
 
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+		    u32 cpos, struct buffer_head **leaf_bh);
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c1144381..f3b0cc5cba1 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -158,8 +158,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	if (err)
 		goto bail;
 
-	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
-					  NULL);
+	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL);
 	if (err) {
 		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -499,8 +498,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 
-	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
-					  NULL);
+	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
 
 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -574,7 +572,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 
 	/* This figures out the size of the next contiguous block, and
 	 * our logical offset */
-	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
 					  &contig_blocks);
 	if (ret) {
 		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 5d211c53a8d..c91490670ff 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -379,7 +379,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 
 	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
 						   (sb->s_blocksize_bits - 9)),
-					     1, &p_blkno, NULL);
+					     &p_blkno, NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index ca4f0e0e758..8de6678a340 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1614,10 +1614,6 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 	 * for the inode metadata. */
 	ocfs2_metadata_cache_purge(inode);
 
-	/* will do nothing for inode types that don't use the extent
-	 * map (bitmap files, etc) */
-	ocfs2_extent_map_trunc(inode, 0);
-
 	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
 		mlog(0, "Trusting LVB on inode %llu\n",
 		     (unsigned long long)oi->ip_blkno);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9..3b4322fd369 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
  *
  * extent_map.c
  *
- * In-memory extent map for OCFS2.  Man, this code was prettier in
- * the library.
+ * Block/Cluster mapping functions
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
@@ -26,1016 +25,155 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
 
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
 
+#include "alloc.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
 
 #include "buffer_head_io.h"
 
-
-/*
- * SUCK SUCK SUCK
- * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
- */
-
-struct ocfs2_extent_map_entry {
-	struct rb_node e_node;
-	int e_tree_depth;
-	struct ocfs2_extent_rec e_rec;
-};
-
-struct ocfs2_em_insert_context {
-	int need_left;
-	int need_right;
-	struct ocfs2_extent_map_entry *new_ent;
-	struct ocfs2_extent_map_entry *old_ent;
-	struct ocfs2_extent_map_entry *left_ent;
-	struct ocfs2_extent_map_entry *right_ent;
-};
-
-static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
-
-
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
-			u32 cpos, u32 clusters,
-			struct rb_node ***ret_p,
-			struct rb_node **ret_parent);
-static int ocfs2_extent_map_insert(struct inode *inode,
-				   struct ocfs2_extent_rec *rec,
-				   int tree_depth);
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
-					 struct ocfs2_extent_map_entry *ent);
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
-				      u32 cpos, u32 clusters,
-				      struct ocfs2_extent_list *el);
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
-					u32 cpos, u32 clusters,
-					struct ocfs2_extent_map_entry **ret_ent);
-static int ocfs2_extent_map_try_insert(struct inode *inode,
-				       struct ocfs2_extent_rec *rec,
-				       int tree_depth,
-				       struct ocfs2_em_insert_context *ctxt);
-
-/* returns 1 only if the rec contains all the given clusters -- that is that
- * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
- * clusters) is >= the argument's endpoint */
-static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
-					      u32 cpos, u32 clusters)
-{
-	if (le32_to_cpu(rec->e_cpos) > cpos)
-		return 0;
-	if (cpos + clusters > le32_to_cpu(rec->e_cpos) + 
-			      le32_to_cpu(rec->e_clusters))
-		return 0;
-	return 1;
-}
-
-
 /*
- * Find an entry in the tree that intersects the region passed in.
- * Note that this will find straddled intervals, it is up to the
- * callers to enforce any boundary conditions.
- *
- * Callers must hold ip_lock.  This lookup is not guaranteed to return
- * a tree_depth 0 match, and as such can race inserts if the lock
- * were not held.
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
  *
- * The rb_node garbage lets insertion share the search.  Trivial
- * callers pass NULL.
+ * Should work fine on interior and exterior nodes.
  */
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
-			u32 cpos, u32 clusters,
-			struct rb_node ***ret_p,
-			struct rb_node **ret_parent)
+static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
+				    u32 v_cluster)
 {
-	struct rb_node **p = &em->em_extents.rb_node;
-	struct rb_node *parent = NULL;
-	struct ocfs2_extent_map_entry *ent = NULL;
-
-	while (*p)
-	{
-		parent = *p;
-		ent = rb_entry(parent, struct ocfs2_extent_map_entry,
-			       e_node);
-		if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
-			p = &(*p)->rb_left;
-			ent = NULL;
-		} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
-				    le32_to_cpu(ent->e_rec.e_clusters))) {
-			p = &(*p)->rb_right;
-			ent = NULL;
-		} else
-			break;
-	}
-
-	if (ret_p != NULL)
-		*ret_p = p;
-	if (ret_parent != NULL)
-		*ret_parent = parent;
-	return ent;
-}
-
-/*
- * Find the leaf containing the interval we want.  While we're on our
- * way down the tree, fill in every record we see at any depth, because
- * we might want it later.
- *
- * Note that this code is run without ip_lock.  That's because it
- * sleeps while reading.  If someone is also filling the extent list at
- * the same time we are, we might have to restart.
- */
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
-				      u32 cpos, u32 clusters,
-				      struct ocfs2_extent_list *el)
-{
-	int i, ret;
-	struct buffer_head *eb_bh = NULL;
-	u64 blkno;
-	u32 rec_end;
-	struct ocfs2_extent_block *eb;
+	int ret = -1;
+	int i;
 	struct ocfs2_extent_rec *rec;
+	u32 rec_end, rec_start;
 
-	/*
-	 * The bh data containing the el cannot change here, because
-	 * we hold alloc_sem.  So we can do this without other
-	 * locks.
-	 */
-	while (el->l_tree_depth)
-	{
-		blkno = 0;
-		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-			rec = &el->l_recs[i];
-			rec_end = (le32_to_cpu(rec->e_cpos) +
-				   le32_to_cpu(rec->e_clusters));
-
-			ret = -EBADR;
-			if (rec_end > OCFS2_I(inode)->ip_clusters) {
-				mlog_errno(ret);
-				ocfs2_error(inode->i_sb,
-					    "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
-					    i,
-					    (unsigned long long)le64_to_cpu(rec->e_blkno),
-					    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-					    OCFS2_I(inode)->ip_clusters);
-				goto out_free;
-			}
-
-			if (rec_end <= cpos) {
-				ret = ocfs2_extent_map_insert(inode, rec,
-						le16_to_cpu(el->l_tree_depth));
-				if (ret && (ret != -EEXIST)) {
-					mlog_errno(ret);
-					goto out_free;
-				}
-				continue;
-			}
-			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
-				ret = ocfs2_extent_map_insert(inode, rec,
-						le16_to_cpu(el->l_tree_depth));
-				if (ret && (ret != -EEXIST)) {
-					mlog_errno(ret);
-					goto out_free;
-				}
-				continue;
-			}
-
-			/*
-			 * We've found a record that matches our
-			 * interval.  We don't insert it because we're
-			 * about to traverse it.
-			 */
-
-			/* Check to see if we're stradling */
-			ret = -ESRCH;
-			if (!ocfs2_extent_rec_contains_clusters(rec,
-							        cpos,
-								clusters)) {
-				mlog_errno(ret);
-				goto out_free;
-			}
-
-			/*
-			 * If we've already found a record, the el has
-			 * two records covering the same interval.
-			 * EEEK!
-			 */
-			ret = -EBADR;
-			if (blkno) {
-				mlog_errno(ret);
-				ocfs2_error(inode->i_sb,
-					    "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
-					    cpos, clusters,
-					    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-					    (unsigned long long)blkno, i,
-					    (unsigned long long)le64_to_cpu(rec->e_blkno));
-				goto out_free;
-			}
-
-			blkno = le64_to_cpu(rec->e_blkno);
-		}
-
-		/*
-		 * We don't support holes, and we're still up
-		 * in the branches, so we'd better have found someone
-		 */
-		ret = -EBADR;
-		if (!blkno) {
-			ocfs2_error(inode->i_sb,
-				    "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
-				    cpos, clusters,
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			mlog_errno(ret);
-			goto out_free;
-		}
-
-		if (eb_bh) {
-			brelse(eb_bh);
-			eb_bh = NULL;
-		}
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       blkno, &eb_bh, OCFS2_BH_CACHED,
-				       inode);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_free;
-		}
-		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			ret = -EIO;
-			goto out_free;
-		}
-		el = &eb->h_list;
-	}
-
-	BUG_ON(el->l_tree_depth);
-
-	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
 
-		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
-		    OCFS2_I(inode)->ip_clusters) {
-			ret = -EBADR;
-			mlog_errno(ret);
-			ocfs2_error(inode->i_sb,
-				    "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
-				    i,
-				    (unsigned long long)le64_to_cpu(rec->e_blkno),
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-				    OCFS2_I(inode)->ip_clusters);
-			return ret;
-		}
+		rec_start = le32_to_cpu(rec->e_cpos);
+		rec_end = rec_start + le32_to_cpu(rec->e_clusters);
 
-		ret = ocfs2_extent_map_insert(inode, rec,
-					      le16_to_cpu(el->l_tree_depth));
-		if (ret && (ret != -EEXIST)) {
-			mlog_errno(ret);
-			goto out_free;
+		if (v_cluster >= rec_start && v_cluster < rec_end) {
+			ret = i;
+			break;
 		}
 	}
 
-	ret = 0;
-
-out_free:
-	if (eb_bh)
-		brelse(eb_bh);
-
 	return ret;
 }
 
-/*
- * This lookup actually will read from disk.  It has one invariant:
- * It will never re-traverse blocks.  This means that all inserts should
- * be new regions or more granular regions (both allowed by insert).
- */
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
-					u32 cpos,
-					u32 clusters,
-					struct ocfs2_extent_map_entry **ret_ent)
+static int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+			      u32 *p_cluster, u32 *num_clusters)
 {
-	int ret;
-	u64 blkno;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_extent_block *eb;
+	int ret, i;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_dinode *di;
+	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+	u32 coff;
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-	if (ent) {
-		if (!ent->e_tree_depth) {
-			spin_unlock(&OCFS2_I(inode)->ip_lock);
-			*ret_ent = ent;
-			return 0;
-		}
-		blkno = le64_to_cpu(ent->e_rec.e_blkno);
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
-				       OCFS2_BH_CACHED, inode);
-		if (ret) {
-			mlog_errno(ret);
-			if (bh)
-				brelse(bh);
-			return ret;
-		}
-		eb = (struct ocfs2_extent_block *)bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			brelse(bh);
-			return -EIO;
-		}
-		el = &eb->h_list;
-	} else {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       OCFS2_I(inode)->ip_blkno, &bh,
-				       OCFS2_BH_CACHED, inode);
-		if (ret) {
-			mlog_errno(ret);
-			if (bh)
-				brelse(bh);
-			return ret;
-		}
-		di = (struct ocfs2_dinode *)bh->b_data;
-		if (!OCFS2_IS_VALID_DINODE(di)) {
-			brelse(bh);
-			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
-			return -EIO;
-		}
-		el = &di->id2.i_list;
-	}
-
-	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
-	brelse(bh);
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, inode);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
+		goto out;
 	}
 
-	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-	if (!ent) {
-		ret = -ESRCH;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	/* FIXME: Make sure this isn't a corruption */
-	BUG_ON(ent->e_tree_depth);
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	el = &di->id2.i_list;
 
-	*ret_ent = ent;
-
-	return 0;
-}
-
-/*
- * Callers must hold ip_lock.  This can insert pieces of the tree,
- * thus racing lookup if the lock weren't held.
- */
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
-					 struct ocfs2_extent_map_entry *ent)
-{
-	struct rb_node **p, *parent;
-	struct ocfs2_extent_map_entry *old_ent;
-
-	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
-					  le32_to_cpu(ent->e_rec.e_clusters),
-					  &p, &parent);
-	if (old_ent)
-		return -EEXIST;
-
-	rb_link_node(&ent->e_node, parent, p);
-	rb_insert_color(&ent->e_node, &em->em_extents);
-
-	return 0;
-}
-
-
-/*
- * Simple rule: on any return code other than -EAGAIN, anything left
- * in the insert_context will be freed.
- *
- * Simple rule #2: A return code of -EEXIST from this function or
- * its calls to ocfs2_extent_map_insert_entry() signifies that another
- * thread beat us to the insert.  It is not an actual error, but it
- * tells the caller we have no more work to do.
- */
-static int ocfs2_extent_map_try_insert(struct inode *inode,
-				       struct ocfs2_extent_rec *rec,
-				       int tree_depth,
-				       struct ocfs2_em_insert_context *ctxt)
-{
-	int ret;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *old_ent;
-
-	ctxt->need_left = 0;
-	ctxt->need_right = 0;
-	ctxt->old_ent = NULL;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-	if (!ret) {
-		ctxt->new_ent = NULL;
-		goto out_unlock;
-	}
-
-	/* Since insert_entry failed, the map MUST have old_ent */
-	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
-					  le32_to_cpu(rec->e_clusters),
-					  NULL, NULL);
-
-	BUG_ON(!old_ent);
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
-	if (old_ent->e_tree_depth < tree_depth) {
-		/* Another thread beat us to the lower tree_depth */
-		ret = -EEXIST;
-		goto out_unlock;
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
 	}
 
-	if (old_ent->e_tree_depth == tree_depth) {
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
 		/*
-		 * Another thread beat us to this tree_depth.
-		 * Let's make sure we agree with that thread (the
-		 * extent_rec should be identical).
+		 * A hole was found. Return some canned values that
+		 * callers can key on.
 		 */
-		if (!memcmp(rec, &old_ent->e_rec,
-			    sizeof(struct ocfs2_extent_rec)))
-			ret = 0;
-		else
-			/* FIXME: Should this be ESRCH/EBADR??? */
-			ret = -EEXIST;
+		*p_cluster = 0;
+		if (num_clusters)
+			*num_clusters = 1;
+	} else {
+		rec = &el->l_recs[i];
 
-		goto out_unlock;
-	}
+		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
 
-	/*
-	 * We do it in this order specifically so that no actual tree
-	 * changes occur until we have all the pieces we need.  We
-	 * don't want malloc failures to leave an inconsistent tree.
-	 * Whenever we drop the lock, another process could be
-	 * inserting.  Also note that, if another process just beat us
-	 * to an insert, we might not need the same pieces we needed
-	 * the first go round.  In the end, the pieces we need will
-	 * be used, and the pieces we don't will be freed.
-	 */
-	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
-			     le32_to_cpu(old_ent->e_rec.e_cpos));
-	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
-			       le32_to_cpu(old_ent->e_rec.e_clusters)) >
-			      (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
-	ret = -EAGAIN;
-	if (ctxt->need_left) {
-		if (!ctxt->left_ent)
-			goto out_unlock;
-		*(ctxt->left_ent) = *old_ent;
-		ctxt->left_ent->e_rec.e_clusters =
-			cpu_to_le32(le32_to_cpu(rec->e_cpos) -
-				    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
-	}
-	if (ctxt->need_right) {
-		if (!ctxt->right_ent)
-			goto out_unlock;
-		*(ctxt->right_ent) = *old_ent;
-		ctxt->right_ent->e_rec.e_cpos =
-			cpu_to_le32(le32_to_cpu(rec->e_cpos) +
+		if (!rec->e_blkno) {
+			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+				    "record (%u, %u, 0)", inode->i_ino,
+				    le32_to_cpu(rec->e_cpos),
 				    le32_to_cpu(rec->e_clusters));
-		ctxt->right_ent->e_rec.e_clusters =
-			cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
-				     le32_to_cpu(old_ent->e_rec.e_clusters)) -
-				    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
-	}
-
-	rb_erase(&old_ent->e_node, &em->em_extents);
-	/* Now that he's erased, set him up for deletion */
-	ctxt->old_ent = old_ent;
-
-	if (ctxt->need_left) {
-		ret = ocfs2_extent_map_insert_entry(em,
-						    ctxt->left_ent);
-		if (ret)
-			goto out_unlock;
-		ctxt->left_ent = NULL;
-	}
-
-	if (ctxt->need_right) {
-		ret = ocfs2_extent_map_insert_entry(em,
-						    ctxt->right_ent);
-		if (ret)
-			goto out_unlock;
-		ctxt->right_ent = NULL;
-	}
-
-	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-
-	if (!ret)
-		ctxt->new_ent = NULL;
-
-out_unlock:
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	return ret;
-}
-
-
-static int ocfs2_extent_map_insert(struct inode *inode,
-				   struct ocfs2_extent_rec *rec,
-				   int tree_depth)
-{
-	int ret;
-	struct ocfs2_em_insert_context ctxt = {0, };
-
-	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
-	    OCFS2_I(inode)->ip_map.em_clusters) {
-		ret = -EBADR;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	/* Zero e_clusters means a truncated tail record.  It better be EOF */
-	if (!rec->e_clusters) {
-		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
-		    OCFS2_I(inode)->ip_map.em_clusters) {
-			ret = -EBADR;
-			mlog_errno(ret);
-			ocfs2_error(inode->i_sb,
-				    "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
-				    (unsigned long long)le64_to_cpu(rec->e_blkno),
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			return ret;
-		}
-
-		/* Ignore the truncated tail */
-		return 0;
-	}
-
-	ret = -ENOMEM;
-	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
-					GFP_NOFS);
-	if (!ctxt.new_ent) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	ctxt.new_ent->e_rec = *rec;
-	ctxt.new_ent->e_tree_depth = tree_depth;
-
-	do {
-		ret = -ENOMEM;
-		if (ctxt.need_left && !ctxt.left_ent) {
-			ctxt.left_ent =
-				kmem_cache_alloc(ocfs2_em_ent_cachep,
-						 GFP_NOFS);
-			if (!ctxt.left_ent)
-				break;
-		}
-		if (ctxt.need_right && !ctxt.right_ent) {
-			ctxt.right_ent =
-				kmem_cache_alloc(ocfs2_em_ent_cachep,
-						 GFP_NOFS);
-			if (!ctxt.right_ent)
-				break;
+			ret = -EROFS;
+			goto out;
 		}
 
-		ret = ocfs2_extent_map_try_insert(inode, rec,
-						  tree_depth, &ctxt);
-	} while (ret == -EAGAIN);
+		coff = v_cluster - le32_to_cpu(rec->e_cpos);
 
-	if ((ret < 0) && (ret != -EEXIST))
-		mlog_errno(ret);
-
-	if (ctxt.left_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
-	if (ctxt.right_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
-	if (ctxt.old_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
-	if (ctxt.new_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
-
-	return ret;
-}
+		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+						    le64_to_cpu(rec->e_blkno));
+		*p_cluster = *p_cluster + coff;
 
-/*
- * Append this record to the tail of the extent map.  It must be
- * tree_depth 0.  The record might be an extension of an existing
- * record, and as such that needs to be handled.  eg:
- *
- * Existing record in the extent map:
- *
- *	cpos = 10, len = 10
- *	|---------|
- *
- * New Record:
- *
- *	cpos = 10, len = 20
- *	|------------------|
- *
- * The passed record is the new on-disk record.  The new_clusters value
- * is how many clusters were added to the file.  If the append is a
- * contiguous append, the new_clusters has been added to
- * rec->e_clusters.  If the append is an entirely new extent, then
- * rec->e_clusters is == new_clusters.
- */
-int ocfs2_extent_map_append(struct inode *inode,
-			    struct ocfs2_extent_rec *rec,
-			    u32 new_clusters)
-{
-	int ret;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-	struct ocfs2_extent_rec *old;
-
-	BUG_ON(!new_clusters);
-	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
-
-	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+		if (num_clusters)
+			*num_clusters = le32_to_cpu(rec->e_clusters) - coff;
 	}
 
-	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
-			 le32_to_cpu(rec->e_clusters)) !=
-			(em->em_clusters + new_clusters),
-			"Inode %llu:\n"
-			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
-			"em->em_clusters = %u + new_clusters = %u = %u\n",
-			(unsigned long long)OCFS2_I(inode)->ip_blkno,
-			le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
-			le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
-			em->em_clusters, new_clusters,
-			em->em_clusters + new_clusters);
-
-	em->em_clusters += new_clusters;
-
-	ret = -ENOENT;
-	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
-		/* This is a contiguous append */
-		ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
-					      NULL, NULL);
-		if (ent) {
-			old = &ent->e_rec;
-			BUG_ON((le32_to_cpu(rec->e_cpos) +
-				le32_to_cpu(rec->e_clusters)) !=
-				 (le32_to_cpu(old->e_cpos) +
-				  le32_to_cpu(old->e_clusters) +
-				  new_clusters));
-			if (ent->e_tree_depth == 0) {
-				BUG_ON(le32_to_cpu(old->e_cpos) !=
-				       le32_to_cpu(rec->e_cpos));
-				BUG_ON(le64_to_cpu(old->e_blkno) !=
-				       le64_to_cpu(rec->e_blkno));
-				ret = 0;
-			}
-			/*
-			 * Let non-leafs fall through as -ENOENT to
-			 * force insertion of the new leaf.
-			 */
-			le32_add_cpu(&old->e_clusters, new_clusters);
-		}
-	}
-
-	if (ret == -ENOENT)
-		ret = ocfs2_extent_map_insert(inode, rec, 0);
-	if (ret < 0)
-		mlog_errno(ret);
+out:
+	brelse(di_bh);
+	brelse(eb_bh);
 	return ret;
 }
 
-#if 0
-/* Code here is included but defined out as it completes the extent
- * map api and may be used in the future. */
-
 /*
- * Look up the record containing this cluster offset.  This record is
- * part of the extent map.  Do not free it.  Any changes you make to
- * it will reflect in the extent map.  So, if your last extent
- * is (cpos = 10, clusters = 10) and you truncate the file by 5
- * clusters, you can do:
- *
- * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
- * rec->e_clusters -= 5;
- *
- * The lookup does not read from disk.  If the map isn't filled in for
- * an entry, you won't find it.
- *
- * Also note that the returned record is valid until alloc_sem is
- * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
+ * This expects alloc_sem to be held. The allocation cannot change at
+ * all while the map is in the process of being updated.
  */
-int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
-			     struct ocfs2_extent_rec **rec,
-			     int *tree_depth)
-{
-	int ret = -ENOENT;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-
-	*rec = NULL;
-
-	if (cpos >= OCFS2_I(inode)->ip_clusters)
-		return -EINVAL;
-
-	if (cpos >= em->em_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters ;
-	}
-
-	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
-				      NULL, NULL);
-
-	if (ent) {
-		*rec = &ent->e_rec;
-		if (tree_depth)
-			*tree_depth = ent->e_tree_depth;
-		ret = 0;
-	}
-
-	return ret;
-}
-
-int ocfs2_extent_map_get_clusters(struct inode *inode,
-				  u32 v_cpos, int count,
-				  u32 *p_cpos, int *ret_count)
-{
-	int ret;
-	u32 coff, ccount;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent = NULL;
-
-	*p_cpos = ccount = 0;
-
-	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
-		return -EINVAL;
-
-	if ((v_cpos + count) > em->em_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
-	}
-
-
-	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
-	if (ret)
-		return ret;
-
-	if (ent) {
-		/* We should never find ourselves straddling an interval */
-		if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
-							v_cpos,
-							count))
-			return -ESRCH;
-
-		coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
-		*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
-				le64_to_cpu(ent->e_rec.e_blkno)) +
-			  coff;
-
-		if (ret_count)
-			*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
-
-		return 0;
-	}
-
-
-	return -ENOENT;
-}
-
-#endif  /*  0  */
-
-int ocfs2_extent_map_get_blocks(struct inode *inode,
-				u64 v_blkno, int count,
-				u64 *p_blkno, int *ret_count)
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+				int *ret_count)
 {
 	int ret;
-	u64 boff;
-	u32 cpos, clusters;
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
-	struct ocfs2_extent_map_entry *ent = NULL;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_rec *rec;
-
-	*p_blkno = 0;
+	u32 cpos, num_clusters, p_cluster;
+	u64 boff = 0;
 
 	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
-	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
-					    (u64)count + bpc - 1);
-	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
-		ret = -EINVAL;
-		mlog_errno(ret);
-		return ret;
-	}
 
-	if ((cpos + clusters) > em->em_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
-	}
-
-	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+	ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
+		goto out;
 	}
 
-	if (ent)
-	{
-		rec = &ent->e_rec;
-
-		/* We should never find ourselves straddling an interval */
-		if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
-			ret = -ESRCH;
-			mlog_errno(ret);
-			return ret;
-		}
-
-		boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
-						le32_to_cpu(rec->e_cpos));
+	/*
+	 * p_cluster == 0 indicates a hole.
+	 */
+	if (p_cluster) {
+		boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 		boff += (v_blkno & (u64)(bpc - 1));
-		*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
-
-		if (ret_count) {
-			*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
-					le32_to_cpu(rec->e_clusters)) - boff;
-		}
-
-		return 0;
 	}
 
-	return -ENOENT;
-}
-
-int ocfs2_extent_map_init(struct inode *inode)
-{
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	*p_blkno = boff;
 
-	em->em_extents = RB_ROOT;
-	em->em_clusters = 0;
-
-	return 0;
-}
-
-/* Needs the lock */
-static void __ocfs2_extent_map_drop(struct inode *inode,
-				    u32 new_clusters,
-				    struct rb_node **free_head,
-				    struct ocfs2_extent_map_entry **tail_ent)
-{
-	struct rb_node *node, *next;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-
-	*free_head = NULL;
-
-	ent = NULL;
-	node = rb_last(&em->em_extents);
-	while (node)
-	{
-		next = rb_prev(node);
-
-		ent = rb_entry(node, struct ocfs2_extent_map_entry,
-			       e_node);
-		if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
-			break;
-
-		rb_erase(&ent->e_node, &em->em_extents);
-
-		node->rb_right = *free_head;
-		*free_head = node;
-
-		ent = NULL;
-		node = next;
-	}
-
-	/* Do we have an entry straddling new_clusters? */
-	if (tail_ent) {
-		if (ent &&
-		    ((le32_to_cpu(ent->e_rec.e_cpos) +
-		      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
-			*tail_ent = ent;
-		else
-			*tail_ent = NULL;
+	if (ret_count) {
+		*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		*ret_count -= v_blkno & (u64)(bpc - 1);
 	}
-}
-
-static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
-{
-	struct rb_node *node;
-	struct ocfs2_extent_map_entry *ent;
-
-	while (free_head) {
-		node = free_head;
-		free_head = node->rb_right;
 
-		ent = rb_entry(node, struct ocfs2_extent_map_entry,
-			       e_node);
-		kmem_cache_free(ocfs2_em_ent_cachep, ent);
-	}
-}
-
-/*
- * Remove all entries past new_clusters, inclusive of an entry that
- * contains new_clusters.  This is effectively a cache forget.
- *
- * If you want to also clip the last extent by some number of clusters,
- * you need to call ocfs2_extent_map_trunc().
- * This code does not check or modify ip_clusters.
- */
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
-{
-	struct rb_node *free_head = NULL;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-
-	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
-
-	if (ent) {
-		rb_erase(&ent->e_node, &em->em_extents);
-		ent->e_node.rb_right = free_head;
-		free_head = &ent->e_node;
-	}
-
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	if (free_head)
-		__ocfs2_extent_map_drop_cleanup(free_head);
-
-	return 0;
-}
-
-/*
- * Remove all entries past new_clusters and also clip any extent
- * straddling new_clusters, if there is one.  This does not check
- * or modify ip_clusters
- */
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
-{
-	struct rb_node *free_head = NULL;
-	struct ocfs2_extent_map_entry *ent = NULL;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-
-	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
-
-	if (ent)
-		ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
-					       le32_to_cpu(ent->e_rec.e_cpos));
-
-	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
-
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	if (free_head)
-		__ocfs2_extent_map_drop_cleanup(free_head);
-
-	return 0;
-}
-
-int __init init_ocfs2_extent_maps(void)
-{
-	ocfs2_em_ent_cachep =
-		kmem_cache_create("ocfs2_em_ent",
-				  sizeof(struct ocfs2_extent_map_entry),
-				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
-	if (!ocfs2_em_ent_cachep)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void exit_ocfs2_extent_maps(void)
-{
-	kmem_cache_destroy(ocfs2_em_ent_cachep);
+out:
+	return ret;
 }
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa88..036e2325144 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,7 @@
 #ifndef _EXTENT_MAP_H
 #define _EXTENT_MAP_H
 
-int init_ocfs2_extent_maps(void);
-void exit_ocfs2_extent_maps(void);
-
-/*
- * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
- * to be held.  The allocation cannot change at all while the map is
- * in the process of being updated.
- */
-int ocfs2_extent_map_init(struct inode *inode);
-int ocfs2_extent_map_append(struct inode *inode,
-			    struct ocfs2_extent_rec *rec,
-			    u32 new_clusters);
-int ocfs2_extent_map_get_blocks(struct inode *inode,
-				u64 v_blkno, int count,
-				u64 *p_blkno, int *ret_count);
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+				int *ret_count);
 
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 08d57a3d4e8..5ff8549eb1a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1003,9 +1003,6 @@ void ocfs2_clear_inode(struct inode *inode)
 			"Clear inode of %llu, inode has io markers\n",
 			(unsigned long long)oi->ip_blkno);
 
-	ocfs2_extent_map_drop(inode, 0);
-	ocfs2_extent_map_init(inode);
-
 	status = ocfs2_drop_inode_locks(inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -1102,8 +1099,7 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 		return NULL;
 	}
 
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
-					     &p_blkno, NULL);
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
 	if (tmperr < 0) {
 		mlog_errno(tmperr);
 		goto fail;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 042ae20a713..a9ced009cb9 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,7 +43,6 @@ struct ocfs2_inode_info
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
 	u32				ip_clusters;
-	struct ocfs2_extent_map		ip_map;
 	struct list_head		ip_io_markers;
 
 	struct mutex			ip_io_mutex;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 12445a31f73..2e2e04fe973 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -670,8 +670,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
 	       (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
 
 		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
-						     1, &p_blkno,
-						     &p_blocks);
+						     &p_blkno, &p_blocks);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d65fef4a8bd..5755e074825 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1511,8 +1511,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
-					     &p_blocks);
+	status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index fe7e1ecafca..faeb53f2eec 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -46,11 +46,6 @@
 #include "endian.h"
 #include "ocfs2_lockid.h"
 
-struct ocfs2_extent_map {
-	u32		em_clusters;
-	struct rb_root	em_extents;
-};
-
 /* Most user visible OCFS2 inodes will have very few pieces of
  * metadata, but larger files (including bitmaps, etc) must be taken
  * into account when designing an access scheme. We allow a small
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 2d3ac32cb74..f4416e7330e 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 16564ea6c14..6ab52351943 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
 
 	ocfs2_print_version();
 
-	if (init_ocfs2_extent_maps())
-		return -ENOMEM;
-
 	status = init_ocfs2_uptodate_cache();
 	if (status < 0) {
 		mlog_errno(status);
@@ -837,7 +834,6 @@ leave:
 	if (status < 0) {
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
-		exit_ocfs2_extent_maps();
 	}
 
 	mlog_exit(status);
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
 
 	unregister_filesystem(&ocfs2_fs_type);
 
-	exit_ocfs2_extent_maps();
-
 	exit_ocfs2_uptodate_cache();
 
 	mlog_exit_void();
@@ -948,7 +942,6 @@ static void ocfs2_inode_init_once(void *data,
 		oi->ip_flags = 0;
 		oi->ip_open_count = 0;
 		spin_lock_init(&oi->ip_lock);
-		ocfs2_extent_map_init(&oi->vfs_inode);
 		INIT_LIST_HEAD(&oi->ip_io_markers);
 		oi->ip_created_trans = 0;
 		oi->ip_last_trans = 0;
-- 
cgit v1.2.3-70-g09d2


From 60b11392f1a09433740bda3048202213daa27736 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 16 Feb 2007 11:46:50 -0800
Subject: ocfs2: zero tail of sparse files on truncate

Since we don't zero on extend anymore, truncate needs to be fixed up to zero
the part of a file between i_size and the end of its cluster. Otherwise a
subsequent extend could expose bad data.

This introduces a new helper, which can be used in ocfs2_write().

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h |   2 +
 fs/ocfs2/aops.c  |  34 ++++-----
 fs/ocfs2/aops.h  |  12 +++
 fs/ocfs2/file.c  |  40 ++++++++--
 fs/ocfs2/inode.c |  30 +++++++-
 fs/ocfs2/ocfs2.h |  11 +++
 7 files changed, 328 insertions(+), 25 deletions(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9a40603c4d4..98694a1add4 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -34,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -3342,6 +3344,228 @@ bail:
 	return status;
 }
 
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	return 0;
+}
+
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	return ocfs2_journal_dirty_data(handle, bh);
+}
+
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+				     struct page **pages, int numpages,
+				     u64 phys, handle_t *handle)
+{
+	int i, ret, partial = 0;
+	void *kaddr;
+	struct page *page;
+	unsigned int from, to = PAGE_CACHE_SIZE;
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+	if (numpages == 0)
+		goto out;
+
+	from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+	if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+		/*
+		 * Since 'from' has been capped to a value below page
+		 * size, this calculation won't be able to overflow
+		 * 'to'
+		 */
+		to = ocfs2_align_bytes_to_clusters(sb, from);
+
+		/*
+		 * The truncate tail in this case should never contain
+		 * more than one page at maximum. The loop below also
+		 * assumes this.
+		 */
+		BUG_ON(numpages != 1);
+	}
+
+	for(i = 0; i < numpages; i++) {
+		page = pages[i];
+
+		BUG_ON(from > PAGE_CACHE_SIZE);
+		BUG_ON(to > PAGE_CACHE_SIZE);
+
+		ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+		if (ret)
+			mlog_errno(ret);
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + from, 0, to - from);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		/*
+		 * Need to set the buffers we zero'd into uptodate
+		 * here if they aren't - ocfs2_map_page_blocks()
+		 * might've skipped some
+		 */
+		if (ocfs2_should_order_data(inode)) {
+			ret = walk_page_buffers(handle,
+						page_buffers(page),
+						from, to, &partial,
+						ocfs2_ordered_zero_func);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else {
+			ret = walk_page_buffers(handle, page_buffers(page),
+						from, to, &partial,
+						ocfs2_writeback_zero_func);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+
+		if (!partial)
+			SetPageUptodate(page);
+
+		flush_dcache_page(page);
+
+		/*
+		 * Every page after the 1st one should be completely zero'd.
+		 */
+		from = 0;
+	}
+out:
+	if (pages) {
+		for (i = 0; i < numpages; i++) {
+			page = pages[i];
+			unlock_page(page);
+			mark_page_accessed(page);
+			page_cache_release(page);
+		}
+	}
+}
+
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+				int *num, u64 *phys)
+{
+	int i, numpages = 0, ret = 0;
+	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long index;
+	u64 next_cluster_bytes;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+	/* Cluster boundary, so we don't need to grab any pages. */
+	if ((isize & (csize - 1)) == 0)
+		goto out;
+
+	ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+					  phys, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Tail is a hole. */
+	if (*phys == 0)
+		goto out;
+
+	next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
+	index = isize >> PAGE_CACHE_SHIFT;
+	do {
+		pages[numpages] = grab_cache_page(mapping, index);
+		if (!pages[numpages]) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		numpages++;
+		index++;
+	} while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
+
+out:
+	if (ret != 0) {
+		if (pages) {
+			for (i = 0; i < numpages; i++) {
+				if (pages[i]) {
+					unlock_page(pages[i]);
+					page_cache_release(pages[i]);
+				}
+			}
+		}
+		numpages = 0;
+	}
+
+	*num = numpages;
+
+	return ret;
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+				 u64 new_i_size)
+{
+	int ret, numpages;
+	struct page **pages = NULL;
+	u64 phys;
+
+	/*
+	 * File systems which don't support sparse files zero on every
+	 * extend.
+	 */
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		return 0;
+
+	pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+			sizeof(struct page *), GFP_NOFS);
+	if (pages == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Truncate on an i_size boundary - nothing more to do.
+	 */
+	if (numpages == 0)
+		goto out;
+
+	ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+				 handle);
+
+	/*
+	 * Initiate writeout of the pages we zero'd here. We don't
+	 * wait on them - the truncate_inode_pages() call later will
+	 * do that for us.
+	 */
+	ret = filemap_fdatawrite(inode->i_mapping);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	if (pages)
+		kfree(pages);
+
+	return ret;
+}
+
 /*
  * It is expected, that by the time you call this function,
  * inode->i_size and fe->i_size have been adjusted.
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index bff2a162b03..3cb39cd5e47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -71,6 +71,8 @@ struct ocfs2_truncate_context {
 	struct buffer_head *tc_last_eb_bh;
 };
 
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+				 u64 new_i_size);
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *fe_bh,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index acf8f000672..605c82a93f0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -308,13 +308,13 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
  * functionality yet, but IMHO it's better to cut and paste the whole
  * thing so we can avoid introducing our own bugs (and easily pick up
  * their fixes when they happen) --Mark */
-static int walk_page_buffers(	handle_t *handle,
-				struct buffer_head *head,
-				unsigned from,
-				unsigned to,
-				int *partial,
-				int (*fn)(	handle_t *handle,
-						struct buffer_head *bh))
+int walk_page_buffers(	handle_t *handle,
+			struct buffer_head *head,
+			unsigned from,
+			unsigned to,
+			int *partial,
+			int (*fn)(	handle_t *handle,
+					struct buffer_head *bh))
 {
 	struct buffer_head *bh;
 	unsigned block_start, block_end;
@@ -654,9 +654,9 @@ static void ocfs2_clear_page_regions(struct page *page,
  *
  * This will also skip zeroing, which is handled externally.
  */
-static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
-				 struct inode *inode, unsigned int from,
-				 unsigned int to, int new)
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+			  struct inode *inode, unsigned int from,
+			  unsigned int to, int new)
 {
 	int ret = 0;
 	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
@@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 		 * Ignore blocks outside of our i/o range -
 		 * they may belong to unallocated clusters.
 		 */
-		if (block_start >= to ||
-		    (block_start + bsize) <= from) {
+		if (block_start >= to || block_end <= from) {
 			if (PageUptodate(page))
 				set_buffer_uptodate(bh);
 			continue;
@@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 	u64 v_blkno, p_blkno;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 	unsigned long index, start;
 	struct page **cpages;
 
@@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 
 	/*
 	 * Figure out how many pages we'll be manipulating here. For
-	 * non-allocating write, or any writes where cluster size is
-	 * less than page size, we only need one page. Otherwise,
-	 * allocating writes of cluster size larger than page size
-	 * need cluster size pages.
+	 * non allocating write, we just change the one
+	 * page. Otherwise, we'll need a whole clusters worth.
 	 */
-	if (new && !wc->w_large_pages)
-		numpages = (1 << cbits) / PAGE_SIZE;
+	if (new)
+		numpages = ocfs2_pages_per_cluster(inode->i_sb);
 
 	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
 	if (!cpages) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index eeb2c42483e..7d94071f0ab 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 unsigned from,
 							 unsigned to);
 
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+			  struct inode *inode, unsigned int from,
+			  unsigned int to, int new);
+
+int walk_page_buffers(	handle_t *handle,
+			struct buffer_head *head,
+			unsigned from,
+			unsigned to,
+			int *partial,
+			int (*fn)(	handle_t *handle,
+					struct buffer_head *bh));
+
 struct ocfs2_write_ctxt;
 typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
 				u64 *, unsigned int *, unsigned int *);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 667e5a869bf..5fd49ec169d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 {
 	int status;
 	handle_t *handle;
+	struct ocfs2_dinode *di;
 
 	mlog_entry_void();
 
@@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	/*
+	 * Do this before setting i_size.
+	 */
+	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	di = (struct ocfs2_dinode *) fe_bh->b_data;
+	di->i_size = cpu_to_le64(new_i_size);
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0)
 		mlog_errno(status);
 
+out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
+
 	mlog_exit(status);
 	return status;
 }
@@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 		mlog_errno(status);
 		goto bail;
 	}
-	ocfs2_data_unlock(inode, 1);
 
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
@@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
 	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 
 	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 
 	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 
 	/* TODO: orphan dir cleanup here. */
+bail_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+
 bail:
 
 	mlog_exit(status);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0bd86a13759..78c99b5050d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -489,12 +489,38 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 	int status = 0;
 	struct ocfs2_truncate_context *tc = NULL;
 	struct ocfs2_dinode *fe;
+	handle_t *handle = NULL;
 
 	mlog_entry_void();
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	if (fe->i_clusters) {
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto out;
+		}
+
+		status = ocfs2_journal_access(handle, inode, fe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		i_size_write(inode, 0);
+
+		status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
+
 		status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 		if (status < 0) {
 			mlog_errno(status);
@@ -507,8 +533,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 			goto out;
 		}
 	}
-out:
 
+out:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 2699f7cac21..82cc92dcf8a 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -495,6 +495,17 @@ static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_bloc
 	return index;
 }
 
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int pages_per_cluster = 1;
+
+	if (PAGE_CACHE_SHIFT < cbits)
+		pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+
+	return pages_per_cluster;
+}
+
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
-- 
cgit v1.2.3-70-g09d2


From e48edee2d8eab812f31f0ff62c6ba635ca2e1e21 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 7 Mar 2007 16:46:57 -0800
Subject: ocfs2: make room for unwritten extents flag

Due to the size of our group bitmaps, we'll never have a leaf node extent
record with more than 16 bits worth of clusters. Split e_clusters up so that
leaf nodes can get a flags field where we can mark unwritten extents.
Interior nodes whose length references all the child nodes beneath them can't
split their e_clusters field, so we use a union to preserve sizing there.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c      | 155 +++++++++++++++++++++++++++++++-------------------
 fs/ocfs2/alloc.h      |  19 +++++++
 fs/ocfs2/extent_map.c |  19 +++++--
 fs/ocfs2/file.c       |   6 +-
 fs/ocfs2/journal.h    |   2 +-
 fs/ocfs2/ocfs2_fs.h   |  19 ++++++-
 6 files changed, 151 insertions(+), 69 deletions(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 027cf5d05ff..0eab0d32828 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -218,20 +218,32 @@ enum ocfs2_contig_type {
 	CONTIG_RIGHT
 };
 
+
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_contig only work properly against leaf nodes!
+ */
 static int ocfs2_block_extent_contig(struct super_block *sb,
 				     struct ocfs2_extent_rec *ext,
 				     u64 blkno)
 {
-	return blkno == (le64_to_cpu(ext->e_blkno) +
-			 ocfs2_clusters_to_blocks(sb,
-						  le32_to_cpu(ext->e_clusters)));
+	u64 blk_end = le64_to_cpu(ext->e_blkno);
+
+	blk_end += ocfs2_clusters_to_blocks(sb,
+				    le16_to_cpu(ext->e_leaf_clusters));
+
+	return blkno == blk_end;
 }
 
 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
 				  struct ocfs2_extent_rec *right)
 {
-	return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
-		le32_to_cpu(right->e_cpos));
+	u32 left_range;
+
+	left_range = le32_to_cpu(left->e_cpos) +
+		le16_to_cpu(left->e_leaf_clusters);
+
+	return (left_range == le32_to_cpu(right->e_cpos));
 }
 
 static enum ocfs2_contig_type
@@ -430,7 +442,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
 
 	return le32_to_cpu(el->l_recs[i].e_cpos) +
-		le32_to_cpu(el->l_recs[i].e_clusters);
+		ocfs2_rec_clusters(el, &el->l_recs[i]);
 }
 
 /*
@@ -442,7 +454,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
  * for the new last extent block.
  *
  * the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
  */
 static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    handle_t *handle,
@@ -532,7 +544,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		 */
 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
-		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		/*
+		 * eb_el isn't always an interior node, but even leaf
+		 * nodes want a zero'd flags and reserved field so
+		 * this gets the whole 32 bits regardless of use.
+		 */
+		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
 		if (!eb_el->l_tree_depth)
 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 
@@ -577,7 +594,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	i = le16_to_cpu(el->l_next_free_rec);
 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
-	el->l_recs[i].e_clusters = 0;
+	el->l_recs[i].e_int_clusters = 0;
 	le16_add_cpu(&el->l_next_free_rec, 1);
 
 	/* fe needs a new last extent block pointer, as does the
@@ -662,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	/* copy the fe data into the new extent block */
 	eb_el->l_tree_depth = fe_el->l_tree_depth;
 	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
-		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
-		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
-	}
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = fe_el->l_recs[i];
 
 	status = ocfs2_journal_dirty(handle, new_eb_bh);
 	if (status < 0) {
@@ -687,12 +701,9 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	le16_add_cpu(&fe_el->l_tree_depth, 1);
 	fe_el->l_recs[0].e_cpos = 0;
 	fe_el->l_recs[0].e_blkno = eb->h_blkno;
-	fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
-	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-		fe_el->l_recs[i].e_cpos = 0;
-		fe_el->l_recs[i].e_clusters = 0;
-		fe_el->l_recs[i].e_blkno = 0;
-	}
+	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
 	fe_el->l_next_free_rec = cpu_to_le16(1);
 
 	/* If this is our 1st tree depth shift, then last_eb_blk
@@ -817,9 +828,13 @@ bail:
 	return status;
 }
 
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
 static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
 {
-	return !rec->e_clusters;
+	return !rec->e_leaf_clusters;
 }
 
 /*
@@ -930,6 +945,8 @@ static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
 {
 	int next_free = le16_to_cpu(el->l_next_free_rec);
 
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
 	if (next_free == 0)
 		goto set_and_inc;
 
@@ -1034,7 +1051,7 @@ static int __ocfs2_find_path(struct inode *inode,
 			 * rightmost record.
 			 */
 			range = le32_to_cpu(rec->e_cpos) +
-				le32_to_cpu(rec->e_clusters);
+				ocfs2_rec_clusters(el, rec);
 			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
 			    break;
 		}
@@ -1195,21 +1212,21 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
-	left_rec->e_clusters = cpu_to_le32(left_clusters);
+	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
 
 	/*
 	 * Calculate the rightmost cluster count boundary before
-	 * moving cpos - we will need to adjust e_clusters after
+	 * moving cpos - we will need to adjust clusters after
 	 * updating e_cpos to keep the same highest cluster count.
 	 */
 	right_end = le32_to_cpu(right_rec->e_cpos);
-	right_end += le32_to_cpu(right_rec->e_clusters);
+	right_end += le32_to_cpu(right_rec->e_int_clusters);
 
 	right_rec->e_cpos = left_rec->e_cpos;
 	le32_add_cpu(&right_rec->e_cpos, left_clusters);
 
 	right_end -= le32_to_cpu(right_rec->e_cpos);
-	right_rec->e_clusters = cpu_to_le32(right_end);
+	right_rec->e_int_clusters = cpu_to_le32(right_end);
 }
 
 /*
@@ -1452,6 +1469,8 @@ static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
 	u64 blkno;
 	struct ocfs2_extent_list *el;
 
+	BUG_ON(path->p_tree_depth == 0);
+
 	*cpos = 0;
 
 	blkno = path_leaf_bh(path)->b_blocknr;
@@ -1486,7 +1505,9 @@ static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
 				}
 
 				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
-				*cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
+				*cpos = *cpos + ocfs2_rec_clusters(el,
+							   &el->l_recs[j - 1]);
+				*cpos = *cpos - 1;
 				goto out;
 			}
 		}
@@ -1715,7 +1736,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 	unsigned int range;
 	struct ocfs2_extent_rec *rec;
 
-	BUG_ON(el->l_tree_depth);
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
 
 	/*
 	 * Contiguous insert - either left or right.
@@ -1726,8 +1747,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 			rec->e_blkno = insert_rec->e_blkno;
 			rec->e_cpos = insert_rec->e_cpos;
 		}
-		le32_add_cpu(&rec->e_clusters,
-			     le32_to_cpu(insert_rec->e_clusters));
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
 		return;
 	}
 
@@ -1748,7 +1769,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 	if (insert->ins_appending == APPEND_TAIL) {
 		i = le16_to_cpu(el->l_next_free_rec) - 1;
 		rec = &el->l_recs[i];
-		range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
+		range = le32_to_cpu(rec->e_cpos)
+			+ le16_to_cpu(rec->e_leaf_clusters);
 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
 
 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
@@ -1761,9 +1783,9 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 				le16_to_cpu(el->l_count),
 				le16_to_cpu(el->l_next_free_rec),
 				le32_to_cpu(el->l_recs[i].e_cpos),
-				le32_to_cpu(el->l_recs[i].e_clusters),
+				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
 				le32_to_cpu(insert_rec->e_cpos),
-				le32_to_cpu(insert_rec->e_clusters));
+				le16_to_cpu(insert_rec->e_leaf_clusters));
 		i++;
 		el->l_recs[i] = *insert_rec;
 		le16_add_cpu(&el->l_next_free_rec, 1);
@@ -1805,6 +1827,12 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 
 	*ret_left_path = NULL;
 
+	/*
+	 * This shouldn't happen for non-trees. The extent rec cluster
+	 * count manipulation below only works for interior nodes.
+	 */
+	BUG_ON(right_path->p_tree_depth == 0);
+
 	/*
 	 * If our appending insert is at the leftmost edge of a leaf,
 	 * then we might need to update the rightmost records of the
@@ -1863,6 +1891,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 	bh = path_root_bh(right_path);
 	i = 0;
 	while (1) {
+		struct ocfs2_extent_rec *rec;
+
 		next_free = le16_to_cpu(el->l_next_free_rec);
 		if (next_free == 0) {
 			ocfs2_error(inode->i_sb,
@@ -1872,16 +1902,19 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 			goto out;
 		}
 
-		el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
-		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-			     le32_to_cpu(insert_rec->e_clusters));
-		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-			    -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));
+		rec = &el->l_recs[next_free - 1];
+
+		rec->e_int_clusters = insert_rec->e_cpos;
+		le32_add_cpu(&rec->e_int_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		le32_add_cpu(&rec->e_int_clusters,
+			     -le32_to_cpu(rec->e_cpos));
 
 		ret = ocfs2_journal_dirty(handle, bh);
 		if (ret)
 			mlog_errno(ret);
 
+		/* Don't touch the leaf node */
 		if (++i >= right_path->p_tree_depth)
 			break;
 
@@ -2068,7 +2101,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
 	ocfs2_update_dinode_clusters(inode, di,
-				     le32_to_cpu(insert_rec->e_clusters));
+				     le16_to_cpu(insert_rec->e_leaf_clusters));
 
 	ret = ocfs2_journal_dirty(handle, di_bh);
 	if (ret)
@@ -2089,6 +2122,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
 	int i;
 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
 
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
 						  insert_rec);
@@ -2120,7 +2155,7 @@ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
 
 	insert->ins_appending = APPEND_NONE;
 
-	BUG_ON(el->l_tree_depth);
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
 
 	if (!el->l_next_free_rec)
 		goto set_tail_append;
@@ -2134,7 +2169,8 @@ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
 	rec = &el->l_recs[i];
 
-	if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
+	if (cpos >=
+	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
 		goto set_tail_append;
 
 	return;
@@ -2242,7 +2278,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	 * The insert code isn't quite ready to deal with all cases of
 	 * left contiguousness. Specifically, if it's an insert into
 	 * the 1st record in a leaf, it will require the adjustment of
-	 * e_clusters on the last record of the path directly to it's
+	 * cluster count on the last record of the path directly to its
 	 * left. For now, just catch that case and fool the layers
 	 * above us. This works just fine for tree_depth == 0, which
 	 * is why we allow that above.
@@ -2310,9 +2346,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
 			OCFS2_I(inode)->ip_clusters);
 
+	memset(&rec, 0, sizeof(rec));
 	rec.e_cpos = cpu_to_le32(cpos);
 	rec.e_blkno = cpu_to_le64(start_blk);
-	rec.e_clusters = cpu_to_le32(new_clusters);
+	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
 
 	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
 					  &insert);
@@ -2981,7 +3018,7 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
 		 * Check it we'll only be trimming off the end of this
 		 * cluster.
 		 */
-		if (le16_to_cpu(rec->e_clusters) > clusters_to_del)
+		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
 			goto out;
 	}
 
@@ -3061,11 +3098,11 @@ find_tail_record:
 
 		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
 		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
-		     le32_to_cpu(rec->e_clusters),
+		     ocfs2_rec_clusters(el, rec),
 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
 		     le16_to_cpu(el->l_next_free_rec));
 
-		BUG_ON(le32_to_cpu(rec->e_clusters) < clusters_to_del);
+		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
 
 		if (le16_to_cpu(el->l_tree_depth) == 0) {
 			/*
@@ -3107,13 +3144,13 @@ find_tail_record:
 				goto find_tail_record;
 			}
 
-			le32_add_cpu(&rec->e_clusters, -clusters_to_del);
+			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
 
 			/*
 			 * We'll use "new_edge" on our way back up the
 			 * tree to know what our rightmost cpos is.
 			 */
-			new_edge = le32_to_cpu(rec->e_clusters);
+			new_edge = le16_to_cpu(rec->e_leaf_clusters);
 			new_edge += le32_to_cpu(rec->e_cpos);
 
 			/*
@@ -3121,12 +3158,12 @@ find_tail_record:
 			 */
 			*delete_start = le64_to_cpu(rec->e_blkno)
 				+ ocfs2_clusters_to_blocks(inode->i_sb,
-					le32_to_cpu(rec->e_clusters));
+					le16_to_cpu(rec->e_leaf_clusters));
 
 			/*
 			 * If it's now empty, remove this record.
 			 */
-			if (le32_to_cpu(rec->e_clusters) == 0) {
+			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
 				memset(rec, 0,
 				       sizeof(struct ocfs2_extent_rec));
 				le16_add_cpu(&el->l_next_free_rec, -1);
@@ -3152,15 +3189,15 @@ find_tail_record:
 			if (new_edge == 0)
 				goto delete;
 
-			rec->e_clusters = cpu_to_le32(new_edge);
-			le32_add_cpu(&rec->e_clusters,
+			rec->e_int_clusters = cpu_to_le32(new_edge);
+			le32_add_cpu(&rec->e_int_clusters,
 				     -le32_to_cpu(rec->e_cpos));
 
 			 /*
 			  * A deleted child record should have been
 			  * caught above.
 			  */
-			 BUG_ON(le32_to_cpu(rec->e_clusters) == 0);
+			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
 		}
 
 delete:
@@ -3173,7 +3210,7 @@ delete:
 		mlog(0, "extent list container %llu, after: record %d: "
 		     "(%u, %u, %llu), next = %u.\n",
 		     (unsigned long long)bh->b_blocknr, i,
-		     le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
+		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
 		     le16_to_cpu(el->l_next_free_rec));
 
@@ -3195,7 +3232,7 @@ delete:
 
 			ocfs2_remove_from_cache(inode, bh);
 
-			BUG_ON(le32_to_cpu(el->l_recs[0].e_clusters));
+			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
 
@@ -3283,7 +3320,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	 * Lower levels depend on this never happening, but it's best
 	 * to check it up here before changing the tree.
 	 */
-	if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
+	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %lu has an empty extent record, depth %u\n",
 			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
@@ -3644,13 +3681,13 @@ start:
 
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
 	range = le32_to_cpu(el->l_recs[i].e_cpos) +
-		le32_to_cpu(el->l_recs[i].e_clusters);
+		ocfs2_rec_clusters(el, &el->l_recs[i]);
 	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
 		clusters_to_del = 0;
 	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
-		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
 	} else if (range > new_highest_cpos) {
-		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
 				  new_highest_cpos;
 	} else {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3cb39cd5e47..fbcb5934a08 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -85,4 +85,23 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
 		    u32 cpos, struct buffer_head **leaf_bh);
 
+/*
+ * Return the cluster count of rec, picking the field by el's tree depth.
+ */
+static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
+					      struct ocfs2_extent_rec *rec)
+{
+	/*
+	 * The count is stored differently by level: records in a list
+	 * with a non-zero l_tree_depth (interior node) use the 32-bit
+	 * e_int_clusters, while depth 0 (leaf) records use the 16-bit
+	 * e_leaf_clusters, which leaves room for the flags field that
+	 * unwritten extents need. Both are little endian on disk.
+	 */
+	if (el->l_tree_depth)
+		return le32_to_cpu(rec->e_int_clusters);
+	else
+		return le16_to_cpu(rec->e_leaf_clusters);
+}
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 937c2722b75..ea0ce41d419 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -50,13 +50,15 @@ static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
 	int ret = -1;
 	int i;
 	struct ocfs2_extent_rec *rec;
-	u32 rec_end, rec_start;
+	u32 rec_end, rec_start, clusters;
 
 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
 
 		rec_start = le32_to_cpu(rec->e_cpos);
-		rec_end = rec_start + le32_to_cpu(rec->e_clusters);
+		clusters = ocfs2_rec_clusters(el, rec);
+
+		rec_end = rec_start + clusters;
 
 		if (v_cluster >= rec_start && v_cluster < rec_end) {
 			ret = i;
@@ -98,6 +100,15 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
 	}
 
 	i = ocfs2_search_extent_list(el, v_cluster);
@@ -118,7 +129,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
 				    "record (%u, %u, 0)", inode->i_ino,
 				    le32_to_cpu(rec->e_cpos),
-				    le32_to_cpu(rec->e_clusters));
+				    ocfs2_rec_clusters(el, rec));
 			ret = -EROFS;
 			goto out;
 		}
@@ -130,7 +141,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		*p_cluster = *p_cluster + coff;
 
 		if (num_clusters)
-			*num_clusters = le32_to_cpu(rec->e_clusters) - coff;
+			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
 	}
 
 out:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f516619a374..36176018b4b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1127,7 +1127,6 @@ static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
 				       size_t count)
 {
 	int ret = 0;
-	unsigned int extent_flags;
 	u32 cpos, clusters, extent_len, phys_cpos;
 	struct super_block *sb = inode->i_sb;
 
@@ -1135,14 +1134,13 @@ static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
 	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
 
 	while (clusters) {
-		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
-					 &extent_flags);
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+		if (phys_cpos == 0) {
 			ret = 1;
 			break;
 		}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f2775..3db5de4506d 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	/* We may be deleting metadata blocks, so metadata alloc dinode +
 	   one desc. block for each possible delete. */
 	if (tree_depth && next_free == 1 &&
-	    le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+	    ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
 		credits += 1 + tree_depth;
 
 	/* update to the truncate log. */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0101974f4f..71306479c68 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -155,6 +155,12 @@
 #define OCFS2_FL_VISIBLE	(0x000100FF)	/* User visible flags */
 #define OCFS2_FL_MODIFIABLE	(0x000100FF)	/* User modifiable flags */
 
+/*
+ * Extent record flags (e_flags, present in leaf node records only)
+ */
+#define OCFS2_EXT_UNWRITTEN	(0x01)	/* Extent is allocated but
+					 * unwritten */
+
 /*
  * ioctl commands
  */
@@ -283,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
 /*
  * On disk extent record for OCFS2
  * It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
  */
 struct ocfs2_extent_rec {
 /*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
-	__le32 e_clusters;	/* Clusters covered by this extent */
+	union {
+		__le32 e_int_clusters; /* Clusters covered by all children */
+		struct {
+			__le16 e_leaf_clusters; /* Clusters covered by this
+						   extent */
+			__u8 e_reserved1;
+			__u8 e_flags; /* Extent flags */
+		};
+	};
 	__le64 e_blkno;		/* Physical disk offset, in blocks */
 /*10*/
 };
-- 
cgit v1.2.3-70-g09d2