From 363041a5f74b953ab6b705ac9c88e5eda218a24b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 17 Jan 2007 12:31:35 -0800
Subject: ocfs2: temporarily remove extent map caching

The code in extent_map.c is not prepared to deal with a subtree being
rotated between lookups. This can happen when filling holes in sparse files.
Instead of a lengthy patch to update the code (which would likely lose the
benefit of caching subtree roots), we remove most of the algorithms and
implement a simple path based lookup. A less ambitious extent caching scheme
will be added in a later patch.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/extent_map.c | 1024 ++++---------------------------------------------
 1 file changed, 81 insertions(+), 943 deletions(-)

(limited to 'fs/ocfs2/extent_map.c')

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9..3b4322fd369 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
  *
  * extent_map.c
  *
- * In-memory extent map for OCFS2.  Man, this code was prettier in
- * the library.
+ * Block/Cluster mapping functions
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
@@ -26,1016 +25,155 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
 
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
 
+#include "alloc.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
 
 #include "buffer_head_io.h"
 
-
-/*
- * SUCK SUCK SUCK
- * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
- */
-
-struct ocfs2_extent_map_entry {
-	struct rb_node e_node;
-	int e_tree_depth;
-	struct ocfs2_extent_rec e_rec;
-};
-
-struct ocfs2_em_insert_context {
-	int need_left;
-	int need_right;
-	struct ocfs2_extent_map_entry *new_ent;
-	struct ocfs2_extent_map_entry *old_ent;
-	struct ocfs2_extent_map_entry *left_ent;
-	struct ocfs2_extent_map_entry *right_ent;
-};
-
-static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
-
-
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
-			u32 cpos, u32 clusters,
-			struct rb_node ***ret_p,
-			struct rb_node **ret_parent);
-static int ocfs2_extent_map_insert(struct inode *inode,
-				   struct ocfs2_extent_rec *rec,
-				   int tree_depth);
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
-					 struct ocfs2_extent_map_entry *ent);
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
-				      u32 cpos, u32 clusters,
-				      struct ocfs2_extent_list *el);
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
-					u32 cpos, u32 clusters,
-					struct ocfs2_extent_map_entry **ret_ent);
-static int ocfs2_extent_map_try_insert(struct inode *inode,
-				       struct ocfs2_extent_rec *rec,
-				       int tree_depth,
-				       struct ocfs2_em_insert_context *ctxt);
-
-/* returns 1 only if the rec contains all the given clusters -- that is that
- * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
- * clusters) is >= the argument's endpoint */
-static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
-					      u32 cpos, u32 clusters)
-{
-	if (le32_to_cpu(rec->e_cpos) > cpos)
-		return 0;
-	if (cpos + clusters > le32_to_cpu(rec->e_cpos) + 
-			      le32_to_cpu(rec->e_clusters))
-		return 0;
-	return 1;
-}
-
-
 /*
- * Find an entry in the tree that intersects the region passed in.
- * Note that this will find straddled intervals, it is up to the
- * callers to enforce any boundary conditions.
- *
- * Callers must hold ip_lock.  This lookup is not guaranteed to return
- * a tree_depth 0 match, and as such can race inserts if the lock
- * were not held.
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
  *
- * The rb_node garbage lets insertion share the search.  Trivial
- * callers pass NULL.
+ * Should work fine on interior and exterior nodes.
  */
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
-			u32 cpos, u32 clusters,
-			struct rb_node ***ret_p,
-			struct rb_node **ret_parent)
+static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
+				    u32 v_cluster)
 {
-	struct rb_node **p = &em->em_extents.rb_node;
-	struct rb_node *parent = NULL;
-	struct ocfs2_extent_map_entry *ent = NULL;
-
-	while (*p)
-	{
-		parent = *p;
-		ent = rb_entry(parent, struct ocfs2_extent_map_entry,
-			       e_node);
-		if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
-			p = &(*p)->rb_left;
-			ent = NULL;
-		} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
-				    le32_to_cpu(ent->e_rec.e_clusters))) {
-			p = &(*p)->rb_right;
-			ent = NULL;
-		} else
-			break;
-	}
-
-	if (ret_p != NULL)
-		*ret_p = p;
-	if (ret_parent != NULL)
-		*ret_parent = parent;
-	return ent;
-}
-
-/*
- * Find the leaf containing the interval we want.  While we're on our
- * way down the tree, fill in every record we see at any depth, because
- * we might want it later.
- *
- * Note that this code is run without ip_lock.  That's because it
- * sleeps while reading.  If someone is also filling the extent list at
- * the same time we are, we might have to restart.
- */
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
-				      u32 cpos, u32 clusters,
-				      struct ocfs2_extent_list *el)
-{
-	int i, ret;
-	struct buffer_head *eb_bh = NULL;
-	u64 blkno;
-	u32 rec_end;
-	struct ocfs2_extent_block *eb;
+	int ret = -1;
+	int i;
 	struct ocfs2_extent_rec *rec;
+	u32 rec_end, rec_start;
 
-	/*
-	 * The bh data containing the el cannot change here, because
-	 * we hold alloc_sem.  So we can do this without other
-	 * locks.
-	 */
-	while (el->l_tree_depth)
-	{
-		blkno = 0;
-		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-			rec = &el->l_recs[i];
-			rec_end = (le32_to_cpu(rec->e_cpos) +
-				   le32_to_cpu(rec->e_clusters));
-
-			ret = -EBADR;
-			if (rec_end > OCFS2_I(inode)->ip_clusters) {
-				mlog_errno(ret);
-				ocfs2_error(inode->i_sb,
-					    "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
-					    i,
-					    (unsigned long long)le64_to_cpu(rec->e_blkno),
-					    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-					    OCFS2_I(inode)->ip_clusters);
-				goto out_free;
-			}
-
-			if (rec_end <= cpos) {
-				ret = ocfs2_extent_map_insert(inode, rec,
-						le16_to_cpu(el->l_tree_depth));
-				if (ret && (ret != -EEXIST)) {
-					mlog_errno(ret);
-					goto out_free;
-				}
-				continue;
-			}
-			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
-				ret = ocfs2_extent_map_insert(inode, rec,
-						le16_to_cpu(el->l_tree_depth));
-				if (ret && (ret != -EEXIST)) {
-					mlog_errno(ret);
-					goto out_free;
-				}
-				continue;
-			}
-
-			/*
-			 * We've found a record that matches our
-			 * interval.  We don't insert it because we're
-			 * about to traverse it.
-			 */
-
-			/* Check to see if we're stradling */
-			ret = -ESRCH;
-			if (!ocfs2_extent_rec_contains_clusters(rec,
-							        cpos,
-								clusters)) {
-				mlog_errno(ret);
-				goto out_free;
-			}
-
-			/*
-			 * If we've already found a record, the el has
-			 * two records covering the same interval.
-			 * EEEK!
-			 */
-			ret = -EBADR;
-			if (blkno) {
-				mlog_errno(ret);
-				ocfs2_error(inode->i_sb,
-					    "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
-					    cpos, clusters,
-					    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-					    (unsigned long long)blkno, i,
-					    (unsigned long long)le64_to_cpu(rec->e_blkno));
-				goto out_free;
-			}
-
-			blkno = le64_to_cpu(rec->e_blkno);
-		}
-
-		/*
-		 * We don't support holes, and we're still up
-		 * in the branches, so we'd better have found someone
-		 */
-		ret = -EBADR;
-		if (!blkno) {
-			ocfs2_error(inode->i_sb,
-				    "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
-				    cpos, clusters,
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			mlog_errno(ret);
-			goto out_free;
-		}
-
-		if (eb_bh) {
-			brelse(eb_bh);
-			eb_bh = NULL;
-		}
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       blkno, &eb_bh, OCFS2_BH_CACHED,
-				       inode);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_free;
-		}
-		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			ret = -EIO;
-			goto out_free;
-		}
-		el = &eb->h_list;
-	}
-
-	BUG_ON(el->l_tree_depth);
-
-	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
 
-		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
-		    OCFS2_I(inode)->ip_clusters) {
-			ret = -EBADR;
-			mlog_errno(ret);
-			ocfs2_error(inode->i_sb,
-				    "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
-				    i,
-				    (unsigned long long)le64_to_cpu(rec->e_blkno),
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-				    OCFS2_I(inode)->ip_clusters);
-			return ret;
-		}
+		rec_start = le32_to_cpu(rec->e_cpos);
+		rec_end = rec_start + le32_to_cpu(rec->e_clusters);
 
-		ret = ocfs2_extent_map_insert(inode, rec,
-					      le16_to_cpu(el->l_tree_depth));
-		if (ret && (ret != -EEXIST)) {
-			mlog_errno(ret);
-			goto out_free;
+		if (v_cluster >= rec_start && v_cluster < rec_end) {
+			ret = i;
+			break;
 		}
 	}
 
-	ret = 0;
-
-out_free:
-	if (eb_bh)
-		brelse(eb_bh);
-
 	return ret;
 }
 
-/*
- * This lookup actually will read from disk.  It has one invariant:
- * It will never re-traverse blocks.  This means that all inserts should
- * be new regions or more granular regions (both allowed by insert).
- */
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
-					u32 cpos,
-					u32 clusters,
-					struct ocfs2_extent_map_entry **ret_ent)
+static int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+			      u32 *p_cluster, u32 *num_clusters)
 {
-	int ret;
-	u64 blkno;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_extent_block *eb;
+	int ret, i;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_dinode *di;
+	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+	u32 coff;
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-	if (ent) {
-		if (!ent->e_tree_depth) {
-			spin_unlock(&OCFS2_I(inode)->ip_lock);
-			*ret_ent = ent;
-			return 0;
-		}
-		blkno = le64_to_cpu(ent->e_rec.e_blkno);
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
-				       OCFS2_BH_CACHED, inode);
-		if (ret) {
-			mlog_errno(ret);
-			if (bh)
-				brelse(bh);
-			return ret;
-		}
-		eb = (struct ocfs2_extent_block *)bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			brelse(bh);
-			return -EIO;
-		}
-		el = &eb->h_list;
-	} else {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       OCFS2_I(inode)->ip_blkno, &bh,
-				       OCFS2_BH_CACHED, inode);
-		if (ret) {
-			mlog_errno(ret);
-			if (bh)
-				brelse(bh);
-			return ret;
-		}
-		di = (struct ocfs2_dinode *)bh->b_data;
-		if (!OCFS2_IS_VALID_DINODE(di)) {
-			brelse(bh);
-			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
-			return -EIO;
-		}
-		el = &di->id2.i_list;
-	}
-
-	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
-	brelse(bh);
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, inode);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
+		goto out;
 	}
 
-	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-	if (!ent) {
-		ret = -ESRCH;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	/* FIXME: Make sure this isn't a corruption */
-	BUG_ON(ent->e_tree_depth);
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	el = &di->id2.i_list;
 
-	*ret_ent = ent;
-
-	return 0;
-}
-
-/*
- * Callers must hold ip_lock.  This can insert pieces of the tree,
- * thus racing lookup if the lock weren't held.
- */
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
-					 struct ocfs2_extent_map_entry *ent)
-{
-	struct rb_node **p, *parent;
-	struct ocfs2_extent_map_entry *old_ent;
-
-	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
-					  le32_to_cpu(ent->e_rec.e_clusters),
-					  &p, &parent);
-	if (old_ent)
-		return -EEXIST;
-
-	rb_link_node(&ent->e_node, parent, p);
-	rb_insert_color(&ent->e_node, &em->em_extents);
-
-	return 0;
-}
-
-
-/*
- * Simple rule: on any return code other than -EAGAIN, anything left
- * in the insert_context will be freed.
- *
- * Simple rule #2: A return code of -EEXIST from this function or
- * its calls to ocfs2_extent_map_insert_entry() signifies that another
- * thread beat us to the insert.  It is not an actual error, but it
- * tells the caller we have no more work to do.
- */
-static int ocfs2_extent_map_try_insert(struct inode *inode,
-				       struct ocfs2_extent_rec *rec,
-				       int tree_depth,
-				       struct ocfs2_em_insert_context *ctxt)
-{
-	int ret;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *old_ent;
-
-	ctxt->need_left = 0;
-	ctxt->need_right = 0;
-	ctxt->old_ent = NULL;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-	if (!ret) {
-		ctxt->new_ent = NULL;
-		goto out_unlock;
-	}
-
-	/* Since insert_entry failed, the map MUST have old_ent */
-	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
-					  le32_to_cpu(rec->e_clusters),
-					  NULL, NULL);
-
-	BUG_ON(!old_ent);
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
-	if (old_ent->e_tree_depth < tree_depth) {
-		/* Another thread beat us to the lower tree_depth */
-		ret = -EEXIST;
-		goto out_unlock;
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
 	}
 
-	if (old_ent->e_tree_depth == tree_depth) {
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
 		/*
-		 * Another thread beat us to this tree_depth.
-		 * Let's make sure we agree with that thread (the
-		 * extent_rec should be identical).
+		 * A hole was found. Return some canned values that
+		 * callers can key on.
 		 */
-		if (!memcmp(rec, &old_ent->e_rec,
-			    sizeof(struct ocfs2_extent_rec)))
-			ret = 0;
-		else
-			/* FIXME: Should this be ESRCH/EBADR??? */
-			ret = -EEXIST;
+		*p_cluster = 0;
+		if (num_clusters)
+			*num_clusters = 1;
+	} else {
+		rec = &el->l_recs[i];
 
-		goto out_unlock;
-	}
+		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
 
-	/*
-	 * We do it in this order specifically so that no actual tree
-	 * changes occur until we have all the pieces we need.  We
-	 * don't want malloc failures to leave an inconsistent tree.
-	 * Whenever we drop the lock, another process could be
-	 * inserting.  Also note that, if another process just beat us
-	 * to an insert, we might not need the same pieces we needed
-	 * the first go round.  In the end, the pieces we need will
-	 * be used, and the pieces we don't will be freed.
-	 */
-	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
-			     le32_to_cpu(old_ent->e_rec.e_cpos));
-	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
-			       le32_to_cpu(old_ent->e_rec.e_clusters)) >
-			      (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
-	ret = -EAGAIN;
-	if (ctxt->need_left) {
-		if (!ctxt->left_ent)
-			goto out_unlock;
-		*(ctxt->left_ent) = *old_ent;
-		ctxt->left_ent->e_rec.e_clusters =
-			cpu_to_le32(le32_to_cpu(rec->e_cpos) -
-				    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
-	}
-	if (ctxt->need_right) {
-		if (!ctxt->right_ent)
-			goto out_unlock;
-		*(ctxt->right_ent) = *old_ent;
-		ctxt->right_ent->e_rec.e_cpos =
-			cpu_to_le32(le32_to_cpu(rec->e_cpos) +
+		if (!rec->e_blkno) {
+			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+				    "record (%u, %u, 0)", inode->i_ino,
+				    le32_to_cpu(rec->e_cpos),
 				    le32_to_cpu(rec->e_clusters));
-		ctxt->right_ent->e_rec.e_clusters =
-			cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
-				     le32_to_cpu(old_ent->e_rec.e_clusters)) -
-				    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
-	}
-
-	rb_erase(&old_ent->e_node, &em->em_extents);
-	/* Now that he's erased, set him up for deletion */
-	ctxt->old_ent = old_ent;
-
-	if (ctxt->need_left) {
-		ret = ocfs2_extent_map_insert_entry(em,
-						    ctxt->left_ent);
-		if (ret)
-			goto out_unlock;
-		ctxt->left_ent = NULL;
-	}
-
-	if (ctxt->need_right) {
-		ret = ocfs2_extent_map_insert_entry(em,
-						    ctxt->right_ent);
-		if (ret)
-			goto out_unlock;
-		ctxt->right_ent = NULL;
-	}
-
-	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-
-	if (!ret)
-		ctxt->new_ent = NULL;
-
-out_unlock:
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	return ret;
-}
-
-
-static int ocfs2_extent_map_insert(struct inode *inode,
-				   struct ocfs2_extent_rec *rec,
-				   int tree_depth)
-{
-	int ret;
-	struct ocfs2_em_insert_context ctxt = {0, };
-
-	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
-	    OCFS2_I(inode)->ip_map.em_clusters) {
-		ret = -EBADR;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	/* Zero e_clusters means a truncated tail record.  It better be EOF */
-	if (!rec->e_clusters) {
-		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
-		    OCFS2_I(inode)->ip_map.em_clusters) {
-			ret = -EBADR;
-			mlog_errno(ret);
-			ocfs2_error(inode->i_sb,
-				    "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
-				    (unsigned long long)le64_to_cpu(rec->e_blkno),
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			return ret;
-		}
-
-		/* Ignore the truncated tail */
-		return 0;
-	}
-
-	ret = -ENOMEM;
-	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
-					GFP_NOFS);
-	if (!ctxt.new_ent) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	ctxt.new_ent->e_rec = *rec;
-	ctxt.new_ent->e_tree_depth = tree_depth;
-
-	do {
-		ret = -ENOMEM;
-		if (ctxt.need_left && !ctxt.left_ent) {
-			ctxt.left_ent =
-				kmem_cache_alloc(ocfs2_em_ent_cachep,
-						 GFP_NOFS);
-			if (!ctxt.left_ent)
-				break;
-		}
-		if (ctxt.need_right && !ctxt.right_ent) {
-			ctxt.right_ent =
-				kmem_cache_alloc(ocfs2_em_ent_cachep,
-						 GFP_NOFS);
-			if (!ctxt.right_ent)
-				break;
+			ret = -EROFS;
+			goto out;
 		}
 
-		ret = ocfs2_extent_map_try_insert(inode, rec,
-						  tree_depth, &ctxt);
-	} while (ret == -EAGAIN);
+		coff = v_cluster - le32_to_cpu(rec->e_cpos);
 
-	if ((ret < 0) && (ret != -EEXIST))
-		mlog_errno(ret);
-
-	if (ctxt.left_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
-	if (ctxt.right_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
-	if (ctxt.old_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
-	if (ctxt.new_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
-
-	return ret;
-}
+		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+						    le64_to_cpu(rec->e_blkno));
+		*p_cluster = *p_cluster + coff;
 
-/*
- * Append this record to the tail of the extent map.  It must be
- * tree_depth 0.  The record might be an extension of an existing
- * record, and as such that needs to be handled.  eg:
- *
- * Existing record in the extent map:
- *
- *	cpos = 10, len = 10
- *	|---------|
- *
- * New Record:
- *
- *	cpos = 10, len = 20
- *	|------------------|
- *
- * The passed record is the new on-disk record.  The new_clusters value
- * is how many clusters were added to the file.  If the append is a
- * contiguous append, the new_clusters has been added to
- * rec->e_clusters.  If the append is an entirely new extent, then
- * rec->e_clusters is == new_clusters.
- */
-int ocfs2_extent_map_append(struct inode *inode,
-			    struct ocfs2_extent_rec *rec,
-			    u32 new_clusters)
-{
-	int ret;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-	struct ocfs2_extent_rec *old;
-
-	BUG_ON(!new_clusters);
-	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
-
-	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+		if (num_clusters)
+			*num_clusters = le32_to_cpu(rec->e_clusters) - coff;
 	}
 
-	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
-			 le32_to_cpu(rec->e_clusters)) !=
-			(em->em_clusters + new_clusters),
-			"Inode %llu:\n"
-			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
-			"em->em_clusters = %u + new_clusters = %u = %u\n",
-			(unsigned long long)OCFS2_I(inode)->ip_blkno,
-			le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
-			le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
-			em->em_clusters, new_clusters,
-			em->em_clusters + new_clusters);
-
-	em->em_clusters += new_clusters;
-
-	ret = -ENOENT;
-	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
-		/* This is a contiguous append */
-		ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
-					      NULL, NULL);
-		if (ent) {
-			old = &ent->e_rec;
-			BUG_ON((le32_to_cpu(rec->e_cpos) +
-				le32_to_cpu(rec->e_clusters)) !=
-				 (le32_to_cpu(old->e_cpos) +
-				  le32_to_cpu(old->e_clusters) +
-				  new_clusters));
-			if (ent->e_tree_depth == 0) {
-				BUG_ON(le32_to_cpu(old->e_cpos) !=
-				       le32_to_cpu(rec->e_cpos));
-				BUG_ON(le64_to_cpu(old->e_blkno) !=
-				       le64_to_cpu(rec->e_blkno));
-				ret = 0;
-			}
-			/*
-			 * Let non-leafs fall through as -ENOENT to
-			 * force insertion of the new leaf.
-			 */
-			le32_add_cpu(&old->e_clusters, new_clusters);
-		}
-	}
-
-	if (ret == -ENOENT)
-		ret = ocfs2_extent_map_insert(inode, rec, 0);
-	if (ret < 0)
-		mlog_errno(ret);
+out:
+	brelse(di_bh);
+	brelse(eb_bh);
 	return ret;
 }
 
-#if 0
-/* Code here is included but defined out as it completes the extent
- * map api and may be used in the future. */
-
 /*
- * Look up the record containing this cluster offset.  This record is
- * part of the extent map.  Do not free it.  Any changes you make to
- * it will reflect in the extent map.  So, if your last extent
- * is (cpos = 10, clusters = 10) and you truncate the file by 5
- * clusters, you can do:
- *
- * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
- * rec->e_clusters -= 5;
- *
- * The lookup does not read from disk.  If the map isn't filled in for
- * an entry, you won't find it.
- *
- * Also note that the returned record is valid until alloc_sem is
- * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
+ * This expects alloc_sem to be held. The allocation cannot change at
+ * all while the map is in the process of being updated.
  */
-int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
-			     struct ocfs2_extent_rec **rec,
-			     int *tree_depth)
-{
-	int ret = -ENOENT;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-
-	*rec = NULL;
-
-	if (cpos >= OCFS2_I(inode)->ip_clusters)
-		return -EINVAL;
-
-	if (cpos >= em->em_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters ;
-	}
-
-	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
-				      NULL, NULL);
-
-	if (ent) {
-		*rec = &ent->e_rec;
-		if (tree_depth)
-			*tree_depth = ent->e_tree_depth;
-		ret = 0;
-	}
-
-	return ret;
-}
-
-int ocfs2_extent_map_get_clusters(struct inode *inode,
-				  u32 v_cpos, int count,
-				  u32 *p_cpos, int *ret_count)
-{
-	int ret;
-	u32 coff, ccount;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent = NULL;
-
-	*p_cpos = ccount = 0;
-
-	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
-		return -EINVAL;
-
-	if ((v_cpos + count) > em->em_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
-	}
-
-
-	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
-	if (ret)
-		return ret;
-
-	if (ent) {
-		/* We should never find ourselves straddling an interval */
-		if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
-							v_cpos,
-							count))
-			return -ESRCH;
-
-		coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
-		*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
-				le64_to_cpu(ent->e_rec.e_blkno)) +
-			  coff;
-
-		if (ret_count)
-			*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
-
-		return 0;
-	}
-
-
-	return -ENOENT;
-}
-
-#endif  /*  0  */
-
-int ocfs2_extent_map_get_blocks(struct inode *inode,
-				u64 v_blkno, int count,
-				u64 *p_blkno, int *ret_count)
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+				int *ret_count)
 {
 	int ret;
-	u64 boff;
-	u32 cpos, clusters;
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
-	struct ocfs2_extent_map_entry *ent = NULL;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_rec *rec;
-
-	*p_blkno = 0;
+	u32 cpos, num_clusters, p_cluster;
+	u64 boff = 0;
 
 	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
-	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
-					    (u64)count + bpc - 1);
-	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
-		ret = -EINVAL;
-		mlog_errno(ret);
-		return ret;
-	}
 
-	if ((cpos + clusters) > em->em_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
-	}
-
-	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+	ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
+		goto out;
 	}
 
-	if (ent)
-	{
-		rec = &ent->e_rec;
-
-		/* We should never find ourselves straddling an interval */
-		if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
-			ret = -ESRCH;
-			mlog_errno(ret);
-			return ret;
-		}
-
-		boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
-						le32_to_cpu(rec->e_cpos));
+	/*
+	 * p_cluster == 0 indicates a hole.
+	 */
+	if (p_cluster) {
+		boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 		boff += (v_blkno & (u64)(bpc - 1));
-		*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
-
-		if (ret_count) {
-			*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
-					le32_to_cpu(rec->e_clusters)) - boff;
-		}
-
-		return 0;
 	}
 
-	return -ENOENT;
-}
-
-int ocfs2_extent_map_init(struct inode *inode)
-{
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	*p_blkno = boff;
 
-	em->em_extents = RB_ROOT;
-	em->em_clusters = 0;
-
-	return 0;
-}
-
-/* Needs the lock */
-static void __ocfs2_extent_map_drop(struct inode *inode,
-				    u32 new_clusters,
-				    struct rb_node **free_head,
-				    struct ocfs2_extent_map_entry **tail_ent)
-{
-	struct rb_node *node, *next;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-
-	*free_head = NULL;
-
-	ent = NULL;
-	node = rb_last(&em->em_extents);
-	while (node)
-	{
-		next = rb_prev(node);
-
-		ent = rb_entry(node, struct ocfs2_extent_map_entry,
-			       e_node);
-		if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
-			break;
-
-		rb_erase(&ent->e_node, &em->em_extents);
-
-		node->rb_right = *free_head;
-		*free_head = node;
-
-		ent = NULL;
-		node = next;
-	}
-
-	/* Do we have an entry straddling new_clusters? */
-	if (tail_ent) {
-		if (ent &&
-		    ((le32_to_cpu(ent->e_rec.e_cpos) +
-		      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
-			*tail_ent = ent;
-		else
-			*tail_ent = NULL;
+	if (ret_count) {
+		*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		*ret_count -= v_blkno & (u64)(bpc - 1);
 	}
-}
-
-static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
-{
-	struct rb_node *node;
-	struct ocfs2_extent_map_entry *ent;
-
-	while (free_head) {
-		node = free_head;
-		free_head = node->rb_right;
 
-		ent = rb_entry(node, struct ocfs2_extent_map_entry,
-			       e_node);
-		kmem_cache_free(ocfs2_em_ent_cachep, ent);
-	}
-}
-
-/*
- * Remove all entries past new_clusters, inclusive of an entry that
- * contains new_clusters.  This is effectively a cache forget.
- *
- * If you want to also clip the last extent by some number of clusters,
- * you need to call ocfs2_extent_map_trunc().
- * This code does not check or modify ip_clusters.
- */
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
-{
-	struct rb_node *free_head = NULL;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-
-	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
-
-	if (ent) {
-		rb_erase(&ent->e_node, &em->em_extents);
-		ent->e_node.rb_right = free_head;
-		free_head = &ent->e_node;
-	}
-
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	if (free_head)
-		__ocfs2_extent_map_drop_cleanup(free_head);
-
-	return 0;
-}
-
-/*
- * Remove all entries past new_clusters and also clip any extent
- * straddling new_clusters, if there is one.  This does not check
- * or modify ip_clusters
- */
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
-{
-	struct rb_node *free_head = NULL;
-	struct ocfs2_extent_map_entry *ent = NULL;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-
-	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
-
-	if (ent)
-		ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
-					       le32_to_cpu(ent->e_rec.e_cpos));
-
-	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
-
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	if (free_head)
-		__ocfs2_extent_map_drop_cleanup(free_head);
-
-	return 0;
-}
-
-int __init init_ocfs2_extent_maps(void)
-{
-	ocfs2_em_ent_cachep =
-		kmem_cache_create("ocfs2_em_ent",
-				  sizeof(struct ocfs2_extent_map_entry),
-				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
-	if (!ocfs2_em_ent_cachep)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void exit_ocfs2_extent_maps(void)
-{
-	kmem_cache_destroy(ocfs2_em_ent_cachep);
+out:
+	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 9517bac6cc7a7aa4fee63cb38a32cb6014e264c7 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 9 Feb 2007 20:24:12 -0800
Subject: ocfs2: teach ocfs2_file_aio_write() about sparse files

Unfortunately, ocfs2 can no longer make use of generic_file_aio_write_nlock()
because allocating writes will require zeroing of pages adjacent to the I/O
for cluster sizes greater than page size.

Implement a custom file write here, which can order page locks for zeroing.
This also has the advantage that cluster locks can easily be ordered outside
of the page locks.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c       | 679 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/aops.h       |  38 +++
 fs/ocfs2/extent_map.c |   4 +-
 fs/ocfs2/extent_map.h |   2 +
 fs/ocfs2/file.c       | 374 ++++++++++++++++++++++++---
 fs/ocfs2/file.h       |   4 +
 fs/ocfs2/ocfs2.h      |  32 +++
 7 files changed, 1076 insertions(+), 57 deletions(-)

(limited to 'fs/ocfs2/extent_map.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f3b0cc5cba1..5ffb3702b5e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <asm/byteorder.h>
+#include <linux/swap.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -37,6 +38,7 @@
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
+#include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
 
@@ -645,23 +647,27 @@ static ssize_t ocfs2_direct_IO(int rw,
 
 	mlog_entry_void();
 
-	/*
-	 * We get PR data locks even for O_DIRECT.  This allows
-	 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
-	 * extending and buffered zeroing writes race.  If they did
-	 * race then the buffered zeroing could be written back after
-	 * the O_DIRECT I/O.  It's one thing to tell people not to mix
-	 * buffered and O_DIRECT writes, but expecting them to
-	 * understand that file extension is also an implicit buffered
-	 * write is too much.  By getting the PR we force writeback of
-	 * the buffered zeroing before proceeding.
-	 */
-	ret = ocfs2_data_lock(inode, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+		/*
+		 * We get PR data locks even for O_DIRECT.  This
+		 * allows concurrent O_DIRECT I/O but doesn't let
+		 * O_DIRECT with extending and buffered zeroing writes
+		 * race.  If they did race then the buffered zeroing
+		 * could be written back after the O_DIRECT I/O.  It's
+		 * one thing to tell people not to mix buffered and
+		 * O_DIRECT writes, but expecting them to understand
+		 * that file extension is also an implicit buffered
+		 * write is too much.  By getting the PR we force
+		 * writeback of the buffered zeroing before
+		 * proceeding.
+		 */
+		ret = ocfs2_data_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		ocfs2_data_unlock(inode, 0);
 	}
-	ocfs2_data_unlock(inode, 0);
 
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
@@ -673,6 +679,647 @@ out:
 	return ret;
 }
 
+static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
+					    u32 cpos,
+					    unsigned int *start,
+					    unsigned int *end)
+{
+	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+		unsigned int cpp;
+
+		cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+
+		cluster_start = cpos % cpp;
+		cluster_start = cluster_start << osb->s_clustersize_bits;
+
+		cluster_end = cluster_start + osb->s_clustersize;
+	}
+
+	BUG_ON(cluster_start > PAGE_SIZE);
+	BUG_ON(cluster_end > PAGE_SIZE);
+
+	if (start)
+		*start = cluster_start;
+	if (end)
+		*end = cluster_end;
+}
+
+/*
+ * 'from' and 'to' are the region in the page to avoid zeroing.
+ *
+ * If pagesize > clustersize, this function will avoid zeroing outside
+ * of the cluster boundary.
+ *
+ * from == to == 0 is code for "zero the entire cluster region"
+ */
+static void ocfs2_clear_page_regions(struct page *page,
+				     struct ocfs2_super *osb, u32 cpos,
+				     unsigned from, unsigned to)
+{
+	void *kaddr;
+	unsigned int cluster_start, cluster_end;
+
+	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
+
+	kaddr = kmap_atomic(page, KM_USER0);
+
+	if (from || to) {
+		if (from > cluster_start)
+			memset(kaddr + cluster_start, 0, from - cluster_start);
+		if (to < cluster_end)
+			memset(kaddr + to, 0, cluster_end - to);
+	} else {
+		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
+	}
+
+	kunmap_atomic(kaddr, KM_USER0);
+}
+
+/*
+ * Some of this taken from block_prepare_write(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+ *
+ * This will also skip zeroing, which is handled externally.
+ */
+static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+				 struct inode *inode, unsigned int from,
+				 unsigned int to, int new)
+{
+	int ret = 0;
+	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
+	unsigned int block_end, block_start;
+	unsigned int bsize = 1 << inode->i_blkbits;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, bsize, 0);
+
+	head = page_buffers(page);
+	for (bh = head, block_start = 0; bh != head || !block_start;
+	     bh = bh->b_this_page, block_start += bsize) {
+		block_end = block_start + bsize;
+
+		/*
+		 * Ignore blocks outside of our i/o range -
+		 * they may belong to unallocated clusters.
+		 */
+		if (block_start >= to ||
+		    (block_start + bsize) <= from) {
+			if (PageUptodate(page))
+				set_buffer_uptodate(bh);
+			continue;
+		}
+
+		/*
+		 * For an allocating write with cluster size >= page
+		 * size, we always write the entire page.
+		 */
+
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+
+		if (!buffer_mapped(bh)) {
+			map_bh(bh, inode->i_sb, *p_blkno);
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		}
+
+		if (PageUptodate(page)) {
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+		     (block_start < from || block_end > to)) {
+			ll_rw_block(READ, 1, &bh);
+			*wait_bh++=bh;
+		}
+
+		*p_blkno = *p_blkno + 1;
+	}
+
+	/*
+	 * If we issued read requests - let them complete.
+	 */
+	while(wait_bh > wait) {
+		wait_on_buffer(*--wait_bh);
+		if (!buffer_uptodate(*wait_bh))
+			ret = -EIO;
+	}
+
+	if (ret == 0 || !new)
+		return ret;
+
+	/*
+	 * If we get -EIO above, zero out any newly allocated blocks
+	 * to avoid exposing stale data.
+	 */
+	bh = head;
+	block_start = 0;
+	do {
+		void *kaddr;
+
+		block_end = block_start + bsize;
+		if (block_end <= from)
+			goto next_bh;
+		if (block_start >= to)
+			break;
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr+block_start, 0, bh->b_size);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+
+next_bh:
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	return ret;
+}
+
+/*
+ * This will copy user data from the iovec in the buffered write
+ * context.
+ */
+int ocfs2_map_and_write_user_data(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+				  unsigned int *ret_from, unsigned int *ret_to)
+{
+	int ret;
+	unsigned int to, from, cluster_start, cluster_end;
+	unsigned long bytes, src_from;
+	char *dst;
+	struct ocfs2_buffered_write_priv *bp = wc->w_private;
+	const struct iovec *cur_iov = bp->b_cur_iov;
+	char __user *buf;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+					&cluster_end);
+
+	buf = cur_iov->iov_base + bp->b_cur_off;
+	src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+
+	from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+
+	/*
+	 * This is a lot of comparisons, but it reads quite
+	 * easily, which is important here.
+	 */
+	/* Stay within the src page */
+	bytes = PAGE_SIZE - src_from;
+	/* Stay within the vector */
+	bytes = min(bytes,
+		    (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
+	/* Stay within count */
+	bytes = min(bytes, (unsigned long)wc->w_count);
+	/*
+	 * For clustersize > page size, just stay within
+	 * target page, otherwise we have to calculate pos
+	 * within the cluster and obey the rightmost
+	 * boundary.
+	 */
+	if (wc->w_large_pages) {
+		/*
+		 * For cluster size < page size, we have to
+		 * calculate pos within the cluster and obey
+		 * the rightmost boundary.
+		 */
+		bytes = min(bytes, (unsigned long)(osb->s_clustersize
+				   - (wc->w_pos & (osb->s_clustersize - 1))));
+	} else {
+		/*
+		 * cluster size > page size is the most common
+		 * case - we just stay within the target page
+		 * boundary.
+		 */
+		bytes = min(bytes, PAGE_CACHE_SIZE - from);
+	}
+
+	to = from + bytes;
+
+	if (wc->w_this_page_new)
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    cluster_start, cluster_end, 1);
+	else
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    from, to, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(from > PAGE_CACHE_SIZE);
+	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > osb->s_clustersize);
+	BUG_ON(to > osb->s_clustersize);
+
+	dst = kmap(wc->w_this_page);
+	memcpy(dst + from, bp->b_src_buf + src_from, bytes);
+	kunmap(wc->w_this_page);
+
+	/*
+	 * XXX: This is slow, but simple. The caller of
+	 * ocfs2_buffered_write_cluster() is responsible for
+	 * passing through the iovecs, so it's difficult to
+	 * predict what our next step is in here after our
+	 * initial write. A future version should be pushing
+	 * that iovec manipulation further down.
+	 *
+	 * By setting this, we indicate that a copy from user
+	 * data was done, and subsequent calls for this
+	 * cluster will skip copying more data.
+	 */
+	wc->w_finished_copy = 1;
+
+	*ret_from = from;
+	*ret_to = to;
+out:
+
+	return bytes ? (unsigned int)bytes : ret;
+}
+
+/*
+ * Map, fill and write a page to disk.
+ *
+ * The work of copying data is done via callback.  Newly allocated
+ * pages which don't take user data will be zero'd (set 'new' to
+ * indicate an allocating write)
+ *
+ * Returns a negative error code or the number of bytes copied into
+ * the page.
+ */
+int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
+			  u64 *p_blkno, struct page *page,
+			  struct ocfs2_write_ctxt *wc, int new)
+{
+	int ret, copied = 0;
+	unsigned int from = 0, to = 0;
+	unsigned int cluster_start, cluster_end;
+	unsigned int zero_from = 0, zero_to = 0;
+
+	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+					&cluster_start, &cluster_end);
+
+	if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
+	    && !wc->w_finished_copy) {
+
+		wc->w_this_page = page;
+		wc->w_this_page_new = new;
+		ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		copied = ret;
+
+		zero_from = from;
+		zero_to = to;
+		if (new) {
+			from = cluster_start;
+			to = cluster_end;
+		}
+	} else {
+		/*
+		 * If we haven't allocated the new page yet, we
+		 * shouldn't be writing it out without copying user
+		 * data. This is likely a math error from the caller.
+		 */
+		BUG_ON(!new);
+
+		from = cluster_start;
+		to = cluster_end;
+
+		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+					    cluster_start, cluster_end, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Parts of newly allocated pages need to be zero'd.
+	 *
+	 * Above, we have also rewritten 'to' and 'from' - as far as
+	 * the rest of the function is concerned, the entire cluster
+	 * range inside of a page needs to be written.
+	 *
+	 * We can skip this if the page is up to date - it's already
+	 * been zero'd from being read in as a hole.
+	 */
+	if (new && !PageUptodate(page))
+		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+					 wc->w_cpos, zero_from, zero_to);
+
+	flush_dcache_page(page);
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	/*
+	 * We don't use generic_commit_write() because we need to
+	 * handle our own i_size update.
+	 */
+	ret = block_commit_write(page, from, to);
+	if (ret)
+		mlog_errno(ret);
+out:
+
+	return copied ? copied : ret;
+}
+
+/*
+ * Do the actual write of some data into an inode. Optionally allocate
+ * in order to fulfill the write.
+ *
+ * cpos is the logical cluster offset within the file to write at
+ *
+ * 'phys' is the physical mapping of that offset. a 'phys' value of
+ * zero indicates that allocation is required. In this case, data_ac
+ * and meta_ac should be valid (meta_ac can be null if metadata
+ * allocation isn't required).
+ */
+static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
+			   struct buffer_head *di_bh,
+			   struct ocfs2_alloc_context *data_ac,
+			   struct ocfs2_alloc_context *meta_ac,
+			   struct ocfs2_write_ctxt *wc)
+{
+	int ret, i, numpages = 1, new;
+	unsigned int copied = 0;
+	u32 tmp_pos;
+	u64 v_blkno, p_blkno;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	unsigned long index, start;
+	struct page **cpages;
+
+	new = phys == 0 ? 1 : 0;
+
+	/*
+	 * Figure out how many pages we'll be manipulating here. For
+	 * non-allocating write, or any writes where cluster size is
+	 * less than page size, we only need one page. Otherwise,
+	 * allocating writes of cluster size larger than page size
+	 * need cluster size pages.
+	 */
+	if (new && !wc->w_large_pages)
+		numpages = (1 << cbits) / PAGE_SIZE;
+
+	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
+	if (!cpages) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * Fill our page array first. That way we've grabbed enough so
+	 * that we can zero and flush if we error after adding the
+	 * extent.
+	 */
+	if (new) {
+		start = ocfs2_align_clusters_to_page_index(inode->i_sb,
+							   wc->w_cpos);
+		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+	} else {
+		start = wc->w_pos >> PAGE_CACHE_SHIFT;
+		v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+	}
+
+	for(i = 0; i < numpages; i++) {
+		index = start + i;
+
+		cpages[i] = grab_cache_page(mapping, index);
+		if (!cpages[i]) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (new) {
+		/*
+		 * This is safe to call with the page locks - it won't take
+		 * any additional semaphores or cluster locks.
+		 */
+		tmp_pos = wc->w_cpos;
+		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
+						 &tmp_pos, 1, di_bh, handle,
+						 data_ac, meta_ac, NULL);
+		/*
+		 * This shouldn't happen because we must have already
+		 * calculated the correct meta data allocation required. The
+		 * internal tree allocation code should know how to increase
+		 * transaction credits itself.
+		 *
+		 * If need be, we could handle -EAGAIN for a
+		 * RESTART_TRANS here.
+		 */
+		mlog_bug_on_msg(ret == -EAGAIN,
+				"Inode %llu: EAGAIN return during allocation.\n",
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL);
+	if (ret < 0) {
+
+		/*
+		 * XXX: Should we go readonly here?
+		 */
+
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0);
+
+	for(i = 0; i < numpages; i++) {
+		ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
+					    wc, new);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		copied += ret;
+	}
+
+out:
+	for(i = 0; i < numpages; i++) {
+		unlock_page(cpages[i]);
+		mark_page_accessed(cpages[i]);
+		page_cache_release(cpages[i]);
+	}
+	kfree(cpages);
+
+	return copied ? copied : ret;
+}
+
+static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
+				  struct ocfs2_super *osb, loff_t pos,
+				  size_t count, ocfs2_page_writer *cb,
+				  void *cb_priv)
+{
+	wc->w_count = count;
+	wc->w_pos = pos;
+	wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
+	wc->w_finished_copy = 0;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+		wc->w_large_pages = 1;
+	else
+		wc->w_large_pages = 0;
+
+	wc->w_write_data_page = cb;
+	wc->w_private = cb_priv;
+}
+
+/*
+ * Write a cluster to an inode. The cluster may not be allocated yet,
+ * in which case it will be. This only exists for buffered writes -
+ * O_DIRECT takes a more "traditional" path through the kernel.
+ *
+ * The caller is responsible for incrementing pos, written counts, etc
+ *
+ * For file systems that don't support sparse files, pre-allocation
+ * and page zeroing up until cpos should be done prior to this
+ * function call.
+ *
+ * Callers should be holding i_sem, and the rw cluster lock.
+ *
+ * Returns the number of user bytes written, or less than zero for
+ * error.
+ */
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+				     size_t count, ocfs2_page_writer *actor,
+				     void *priv)
+{
+	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	ssize_t written = 0;
+	u32 phys;
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle;
+	struct ocfs2_write_ctxt wc;
+
+	ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
+
+	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	/*
+	 * Take alloc sem here to prevent concurrent lookups. That way
+	 * the mapping, zeroing and tree manipulation within
+	 * ocfs2_write() will be safe against ->readpage(). This
+	 * should also serve to lock out allocation from a shared
+	 * writeable region.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_meta;
+	}
+
+	/* phys == 0 means that allocation is required. */
+	if (phys == 0) {
+		ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_meta;
+		}
+
+		credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
+	}
+
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_meta;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_data;
+	}
+
+	written = ocfs2_write(file, phys, handle, di_bh, data_ac,
+			      meta_ac, &wc);
+	if (written < 0) {
+		ret = written;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	pos += written;
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_data:
+	ocfs2_data_unlock(inode, 1);
+
+out_meta:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 1);
+
+out:
+	brelse(di_bh);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return written ? written : ret;
+}
+
 const struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
 	.writepage	= ocfs2_writepage,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f446a15eab8..eeb2c42483e 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,6 +30,44 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 unsigned from,
 							 unsigned to);
 
+struct ocfs2_write_ctxt;
+typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
+				u64 *, unsigned int *, unsigned int *);
+
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+				     size_t count, ocfs2_page_writer *actor,
+				     void *priv);
+
+struct ocfs2_write_ctxt {
+	size_t				w_count;
+	loff_t				w_pos;
+	u32				w_cpos;
+	unsigned int			w_finished_copy;
+
+	/* This is true if page_size > cluster_size */
+	unsigned int			w_large_pages;
+
+	/* Filler callback and private data */
+	ocfs2_page_writer		*w_write_data_page;
+	void				*w_private;
+
+	/* Only valid for the filler callback */
+	struct page			*w_this_page;
+	unsigned int			w_this_page_new;
+};
+
+struct ocfs2_buffered_write_priv {
+	char				*b_src_buf;
+	const struct iovec		*b_cur_iov; /* Current iovec */
+	size_t				b_cur_off; /* Offset in the
+						    * current iovec */
+};
+int ocfs2_map_and_write_user_data(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc,
+				  u64 *p_blkno,
+				  unsigned int *ret_from,
+				  unsigned int *ret_to);
+
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
 	test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 3b4322fd369..937c2722b75 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -67,8 +67,8 @@ static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
 	return ret;
 }
 
-static int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
-			      u32 *p_cluster, u32 *num_clusters)
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+		       u32 *p_cluster, u32 *num_clusters)
 {
 	int ret, i;
 	struct buffer_head *di_bh = NULL;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 036e2325144..625d0ee5e04 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,6 +25,8 @@
 #ifndef _EXTENT_MAP_H
 #define _EXTENT_MAP_H
 
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
+		       u32 *num_clusters);
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 				int *ret_count);
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3bcf3629265..667e5a869bf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mount.h>
+#include <linux/writeback.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -485,13 +486,13 @@ leave:
  * accessed, and lock them, reserving the appropriate number of bits.
  *
  * Called from ocfs2_extend_allocation() for file systems which don't
- * support holes, and from ocfs2_prepare_write() for file systems
- * which understand sparse inodes.
+ * support holes, and from ocfs2_write() for file systems which
+ * understand sparse inodes.
  */
-static int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-				 u32 clusters_to_add,
-				 struct ocfs2_alloc_context **data_ac,
-				 struct ocfs2_alloc_context **meta_ac)
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+			  u32 clusters_to_add,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac)
 {
 	int ret, num_free_extents;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -518,7 +519,7 @@ static int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 	 * a cluster lock (because we ran out of room for another
 	 * extent) will violate ordering rules.
 	 *
-	 * Most of the time we'll only be seeing this 1 page at a time
+	 * Most of the time we'll only be seeing this 1 cluster at a time
 	 * anyway.
 	 */
 	if (!num_free_extents ||
@@ -596,13 +597,6 @@ static int ocfs2_extend_allocation(struct inode *inode,
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
-				       &meta_ac);
-	if (status) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 	/* blocks peope in read/write from reading our allocation
 	 * until we're done changing it. We depend on i_mutex to block
 	 * other extend/truncate calls while we're here. Ordering wrt
@@ -610,6 +604,13 @@ restart_all:
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 
+	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+				       &meta_ac);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -1088,10 +1089,49 @@ out:
 	return ret;
 }
 
+/*
+ * Will look for holes and unwritten extents in the range starting at
+ * pos for count bytes (inclusive).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+				       size_t count)
+{
+	int ret = 0;
+	unsigned int extent_flags;
+	u32 cpos, clusters, extent_len, phys_cpos;
+	struct super_block *sb = inode->i_sb;
+
+	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+			ret = 1;
+			break;
+		}
+
+		if (extent_len > clusters)
+			extent_len = clusters;
+
+		clusters -= extent_len;
+		cpos += extent_len;
+	}
+out:
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
-					 int appending)
+					 int appending,
+					 int *direct_io)
 {
 	int ret = 0, meta_level = appending;
 	struct inode *inode = dentry->d_inode;
@@ -1143,12 +1183,47 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 			saved_pos = *ppos;
 		}
 
+		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+			loff_t end = saved_pos + count;
+
+			/*
+			 * Skip the O_DIRECT checks if we don't need
+			 * them.
+			 */
+			if (!direct_io || !(*direct_io))
+				break;
+
+			/*
+			 * Allowing concurrent direct writes means
+			 * i_size changes wouldn't be synchronized, so
+			 * one node could wind up truncating another
+			 * nodes writes.
+			 */
+			if (end > i_size_read(inode)) {
+				*direct_io = 0;
+				break;
+			}
+
+			/*
+			 * We don't fill holes during direct io, so
+			 * check for them here. If any are found, the
+			 * caller will have to retake some cluster
+			 * locks and initiate the io as buffered.
+			 */
+			ret = ocfs2_check_range_for_holes(inode, saved_pos,
+							  count);
+			if (ret == 1) {
+				*direct_io = 0;
+				ret = 0;
+			} else if (ret < 0)
+				mlog_errno(ret);
+			break;
+		}
+
 		/*
 		 * The rest of this loop is concerned with legacy file
 		 * systems which don't support sparse files.
 		 */
-		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-			break;
 
 		newsize = count + saved_pos;
 
@@ -1202,55 +1277,264 @@ out:
 	return ret;
 }
 
+static inline void
+ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+	const struct iovec *iov = *iovp;
+	size_t base = *basep;
+
+	do {
+		int copy = min(bytes, iov->iov_len - base);
+
+		bytes -= copy;
+		base += copy;
+		if (iov->iov_len == base) {
+			iov++;
+			base = 0;
+		}
+	} while (bytes);
+	*iovp = iov;
+	*basep = base;
+}
+
+static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+					    const struct iovec *cur_iov,
+					    size_t iov_offset)
+{
+	int ret;
+	char *buf;
+	struct page *src_page = NULL;
+
+	buf = cur_iov->iov_base + iov_offset;
+
+	if (!segment_eq(get_fs(), KERNEL_DS)) {
+		/*
+		 * Pull in the user page. We want to do this outside
+		 * of the meta data locks in order to preserve locking
+		 * order in case of page fault.
+		 */
+		ret = get_user_pages(current, current->mm,
+				     (unsigned long)buf & PAGE_CACHE_MASK, 1,
+				     0, 0, &src_page, NULL);
+		if (ret == 1)
+			bp->b_src_buf = kmap(src_page);
+		else
+			src_page = ERR_PTR(-EFAULT);
+	} else {
+		bp->b_src_buf = buf;
+	}
+
+	return src_page;
+}
+
+static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
+				   struct page *page)
+{
+	if (page) {
+		kunmap(page);
+		page_cache_release(page);
+	}
+}
+
+static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
+					 const struct iovec *iov,
+					 unsigned long nr_segs,
+					 size_t count,
+					 ssize_t o_direct_written)
+{
+	int ret = 0;
+	ssize_t copied, total = 0;
+	size_t iov_offset = 0;
+	const struct iovec *cur_iov = iov;
+	struct ocfs2_buffered_write_priv bp;
+	struct page *page;
+
+	/*
+	 * handle partial DIO write.  Adjust cur_iov if needed.
+	 */
+	ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
+
+	do {
+		bp.b_cur_off = iov_offset;
+		bp.b_cur_iov = cur_iov;
+
+		page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			goto out;
+		}
+
+		copied = ocfs2_buffered_write_cluster(file, *ppos, count,
+						      ocfs2_map_and_write_user_data,
+						      &bp);
+
+		ocfs2_put_write_source(&bp, page);
+
+		if (copied < 0) {
+			mlog_errno(copied);
+			ret = copied;
+			goto out;
+		}
+
+		total += copied;
+		*ppos = *ppos + copied;
+		count -= copied;
+
+		ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
+	} while(count);
+
+out:
+	return total ? total : ret;
+}
+
+static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
+			     unsigned long *nr_segs)
+{
+	size_t ocount;		/* original count */
+	unsigned long seg;
+
+	ocount = 0;
+	for (seg = 0; seg < *nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		ocount += iv->iov_len;
+		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+			continue;
+		if (seg == 0)
+			return -EFAULT;
+		*nr_segs = seg;
+		ocount -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+
+	*counted = ocount;
+	return 0;
+}
+
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs,
 				    loff_t pos)
 {
-	int ret, rw_level, have_alloc_sem = 0;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int appending = filp->f_flags & O_APPEND ? 1 : 0;
-
-	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+	int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
+	int can_do_direct, sync = 0;
+	ssize_t written = 0;
+	size_t ocount;		/* original count */
+	size_t count;		/* after file limit checks */
+	loff_t *ppos = &iocb->ki_pos;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	mlog_entry("(0x%p, %u, '%.*s')\n", file,
 		   (unsigned int)nr_segs,
-		   filp->f_path.dentry->d_name.len,
-		   filp->f_path.dentry->d_name.name);
+		   file->f_path.dentry->d_name.len,
+		   file->f_path.dentry->d_name.name);
 
-	/* happy write of zero bytes */
 	if (iocb->ki_left == 0)
 		return 0;
 
+	ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
+	if (ret)
+		return ret;
+
+	count = ocount;
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	appending = file->f_flags & O_APPEND ? 1 : 0;
+	direct_io = file->f_flags & O_DIRECT ? 1 : 0;
+
 	mutex_lock(&inode->i_mutex);
+
+relock:
 	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
-	if (filp->f_flags & O_DIRECT) {
-		have_alloc_sem = 1;
+	if (direct_io) {
 		down_read(&inode->i_alloc_sem);
+		have_alloc_sem = 1;
 	}
 
 	/* concurrent O_DIRECT writes are allowed */
-	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	rw_level = !direct_io;
 	ret = ocfs2_rw_lock(inode, rw_level);
 	if (ret < 0) {
-		rw_level = -1;
 		mlog_errno(ret);
-		goto out;
+		goto out_sems;
 	}
 
-	ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
-					    iocb->ki_left, appending);
+	can_do_direct = direct_io;
+	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+					    iocb->ki_left, appending,
+					    &can_do_direct);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
+	/*
+	 * We can't complete the direct I/O as requested, fall back to
+	 * buffered I/O.
+	 */
+	if (direct_io && !can_do_direct) {
+		ocfs2_rw_unlock(inode, rw_level);
+		up_read(&inode->i_alloc_sem);
+
+		have_alloc_sem = 0;
+		rw_level = -1;
+
+		direct_io = 0;
+		sync = 1;
+		goto relock;
+	}
+
+	if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
+		sync = 1;
+
+	/*
+	 * XXX: Is it ok to execute these checks a second time?
+	 */
+	ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
+	if (ret)
+		goto out;
+
+	/*
+	 * Set pos so that sync_page_range_nolock() below understands
+	 * where to start from. We might've moved it around via the
+	 * calls above. The range we want to actually sync starts from
+	 * *ppos here.
+	 *
+	 */
+	pos = *ppos;
+
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb);
 
-	ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
+	if (direct_io) {
+		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
+						    ppos, count, ocount);
+		if (written < 0) {
+			ret = written;
+			goto out_dio;
+		}
+	} else {
+		written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
+						    count, written);
+		if (written < 0) {
+			ret = written;
+			if (ret != -EFAULT || ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
 
+out_dio:
 	/* buffered aio wouldn't have proper lock coverage today */
-	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
 	/* 
 	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1268,14 +1552,25 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	}
 
 out:
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+
+out_sems:
 	if (have_alloc_sem)
 		up_read(&inode->i_alloc_sem);
-	if (rw_level != -1) 
-		ocfs2_rw_unlock(inode, rw_level);
+
+	if (written > 0 && sync) {
+		ssize_t err;
+
+		err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
+		if (err < 0)
+			written = err;
+	}
+
 	mutex_unlock(&inode->i_mutex);
 
 	mlog_exit(ret);
-	return ret;
+	return written ? written : ret;
 }
 
 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
@@ -1300,7 +1595,8 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		goto out;
 	}
 
-	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
+	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
+					    NULL);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e2f6551604d..2c4460fced5 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -46,6 +46,10 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       enum ocfs2_alloc_restarted *reason);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+			  u32 clusters_to_add,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index faeb53f2eec..2699f7cac21 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -463,6 +463,38 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
 	return (unsigned long)((bytes + 511) >> 9);
 }
 
+static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
+							unsigned long pg_index)
+{
+	u32 clusters = pg_index;
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+	if (unlikely(PAGE_CACHE_SHIFT > cbits))
+		clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
+	else if (PAGE_CACHE_SHIFT < cbits)
+		clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+
+	return clusters;
+}
+
+/*
+ * Find the 1st page index which covers the given clusters.
+ */
+static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
+							u32 clusters)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned long index = clusters;
+
+	if (PAGE_CACHE_SHIFT > cbits) {
+		index = clusters >> (PAGE_CACHE_SHIFT - cbits);
+	} else if (PAGE_CACHE_SHIFT < cbits) {
+		index = clusters << (cbits - PAGE_CACHE_SHIFT);
+	}
+
+	return index;
+}
+
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
-- 
cgit v1.2.3-70-g09d2


From e48edee2d8eab812f31f0ff62c6ba635ca2e1e21 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 7 Mar 2007 16:46:57 -0800
Subject: ocfs2: make room for unwritten extents flag

Due to the size of our group bitmaps, we'll never have a leaf node extent
record with more than 16 bits worth of clusters. Split e_clusters up so that
leaf nodes can get a flags field where we can mark unwritten extents.
Interior nodes whose length references all the child nodes beneath it can't
split their e_clusters field, so we use a union to preserve sizing there.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c      | 155 +++++++++++++++++++++++++++++++-------------------
 fs/ocfs2/alloc.h      |  19 +++++++
 fs/ocfs2/extent_map.c |  19 +++++--
 fs/ocfs2/file.c       |   6 +-
 fs/ocfs2/journal.h    |   2 +-
 fs/ocfs2/ocfs2_fs.h   |  19 ++++++-
 6 files changed, 151 insertions(+), 69 deletions(-)

(limited to 'fs/ocfs2/extent_map.c')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 027cf5d05ff..0eab0d32828 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -218,20 +218,32 @@ enum ocfs2_contig_type {
 	CONTIG_RIGHT
 };
 
+
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_contig only work properly against leaf nodes!
+ */
 static int ocfs2_block_extent_contig(struct super_block *sb,
 				     struct ocfs2_extent_rec *ext,
 				     u64 blkno)
 {
-	return blkno == (le64_to_cpu(ext->e_blkno) +
-			 ocfs2_clusters_to_blocks(sb,
-						  le32_to_cpu(ext->e_clusters)));
+	u64 blk_end = le64_to_cpu(ext->e_blkno);
+
+	blk_end += ocfs2_clusters_to_blocks(sb,
+				    le16_to_cpu(ext->e_leaf_clusters));
+
+	return blkno == blk_end;
 }
 
 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
 				  struct ocfs2_extent_rec *right)
 {
-	return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
-		le32_to_cpu(right->e_cpos));
+	u32 left_range;
+
+	left_range = le32_to_cpu(left->e_cpos) +
+		le16_to_cpu(left->e_leaf_clusters);
+
+	return (left_range == le32_to_cpu(right->e_cpos));
 }
 
 static enum ocfs2_contig_type
@@ -430,7 +442,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
 
 	return le32_to_cpu(el->l_recs[i].e_cpos) +
-		le32_to_cpu(el->l_recs[i].e_clusters);
+		ocfs2_rec_clusters(el, &el->l_recs[i]);
 }
 
 /*
@@ -442,7 +454,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
  * for the new last extent block.
  *
  * the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
  */
 static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    handle_t *handle,
@@ -532,7 +544,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		 */
 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
-		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		/*
+		 * eb_el isn't always an interior node, but even leaf
+		 * nodes want a zero'd flags and reserved field so
+		 * this gets the whole 32 bits regardless of use.
+		 */
+		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
 		if (!eb_el->l_tree_depth)
 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 
@@ -577,7 +594,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	i = le16_to_cpu(el->l_next_free_rec);
 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
-	el->l_recs[i].e_clusters = 0;
+	el->l_recs[i].e_int_clusters = 0;
 	le16_add_cpu(&el->l_next_free_rec, 1);
 
 	/* fe needs a new last extent block pointer, as does the
@@ -662,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	/* copy the fe data into the new extent block */
 	eb_el->l_tree_depth = fe_el->l_tree_depth;
 	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
-		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
-		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
-	}
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = fe_el->l_recs[i];
 
 	status = ocfs2_journal_dirty(handle, new_eb_bh);
 	if (status < 0) {
@@ -687,12 +701,9 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	le16_add_cpu(&fe_el->l_tree_depth, 1);
 	fe_el->l_recs[0].e_cpos = 0;
 	fe_el->l_recs[0].e_blkno = eb->h_blkno;
-	fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
-	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-		fe_el->l_recs[i].e_cpos = 0;
-		fe_el->l_recs[i].e_clusters = 0;
-		fe_el->l_recs[i].e_blkno = 0;
-	}
+	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
 	fe_el->l_next_free_rec = cpu_to_le16(1);
 
 	/* If this is our 1st tree depth shift, then last_eb_blk
@@ -817,9 +828,13 @@ bail:
 	return status;
 }
 
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
 static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
 {
-	return !rec->e_clusters;
+	return !rec->e_leaf_clusters;
 }
 
 /*
@@ -930,6 +945,8 @@ static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
 {
 	int next_free = le16_to_cpu(el->l_next_free_rec);
 
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
 	if (next_free == 0)
 		goto set_and_inc;
 
@@ -1034,7 +1051,7 @@ static int __ocfs2_find_path(struct inode *inode,
 			 * rightmost record.
 			 */
 			range = le32_to_cpu(rec->e_cpos) +
-				le32_to_cpu(rec->e_clusters);
+				ocfs2_rec_clusters(el, rec);
 			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
 			    break;
 		}
@@ -1195,21 +1212,21 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
-	left_rec->e_clusters = cpu_to_le32(left_clusters);
+	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
 
 	/*
 	 * Calculate the rightmost cluster count boundary before
-	 * moving cpos - we will need to adjust e_clusters after
+	 * moving cpos - we will need to adjust clusters after
 	 * updating e_cpos to keep the same highest cluster count.
 	 */
 	right_end = le32_to_cpu(right_rec->e_cpos);
-	right_end += le32_to_cpu(right_rec->e_clusters);
+	right_end += le32_to_cpu(right_rec->e_int_clusters);
 
 	right_rec->e_cpos = left_rec->e_cpos;
 	le32_add_cpu(&right_rec->e_cpos, left_clusters);
 
 	right_end -= le32_to_cpu(right_rec->e_cpos);
-	right_rec->e_clusters = cpu_to_le32(right_end);
+	right_rec->e_int_clusters = cpu_to_le32(right_end);
 }
 
 /*
@@ -1452,6 +1469,8 @@ static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
 	u64 blkno;
 	struct ocfs2_extent_list *el;
 
+	BUG_ON(path->p_tree_depth == 0);
+
 	*cpos = 0;
 
 	blkno = path_leaf_bh(path)->b_blocknr;
@@ -1486,7 +1505,9 @@ static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
 				}
 
 				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
-				*cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
+				*cpos = *cpos + ocfs2_rec_clusters(el,
+							   &el->l_recs[j - 1]);
+				*cpos = *cpos - 1;
 				goto out;
 			}
 		}
@@ -1715,7 +1736,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 	unsigned int range;
 	struct ocfs2_extent_rec *rec;
 
-	BUG_ON(el->l_tree_depth);
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
 
 	/*
 	 * Contiguous insert - either left or right.
@@ -1726,8 +1747,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 			rec->e_blkno = insert_rec->e_blkno;
 			rec->e_cpos = insert_rec->e_cpos;
 		}
-		le32_add_cpu(&rec->e_clusters,
-			     le32_to_cpu(insert_rec->e_clusters));
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
 		return;
 	}
 
@@ -1748,7 +1769,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 	if (insert->ins_appending == APPEND_TAIL) {
 		i = le16_to_cpu(el->l_next_free_rec) - 1;
 		rec = &el->l_recs[i];
-		range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
+		range = le32_to_cpu(rec->e_cpos)
+			+ le16_to_cpu(rec->e_leaf_clusters);
 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
 
 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
@@ -1761,9 +1783,9 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 				le16_to_cpu(el->l_count),
 				le16_to_cpu(el->l_next_free_rec),
 				le32_to_cpu(el->l_recs[i].e_cpos),
-				le32_to_cpu(el->l_recs[i].e_clusters),
+				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
 				le32_to_cpu(insert_rec->e_cpos),
-				le32_to_cpu(insert_rec->e_clusters));
+				le16_to_cpu(insert_rec->e_leaf_clusters));
 		i++;
 		el->l_recs[i] = *insert_rec;
 		le16_add_cpu(&el->l_next_free_rec, 1);
@@ -1805,6 +1827,12 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 
 	*ret_left_path = NULL;
 
+	/*
+	 * This shouldn't happen for non-trees. The extent rec cluster
+	 * count manipulation below only works for interior nodes.
+	 */
+	BUG_ON(right_path->p_tree_depth == 0);
+
 	/*
 	 * If our appending insert is at the leftmost edge of a leaf,
 	 * then we might need to update the rightmost records of the
@@ -1863,6 +1891,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 	bh = path_root_bh(right_path);
 	i = 0;
 	while (1) {
+		struct ocfs2_extent_rec *rec;
+
 		next_free = le16_to_cpu(el->l_next_free_rec);
 		if (next_free == 0) {
 			ocfs2_error(inode->i_sb,
@@ -1872,16 +1902,19 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 			goto out;
 		}
 
-		el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
-		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-			     le32_to_cpu(insert_rec->e_clusters));
-		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-			    -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));
+		rec = &el->l_recs[next_free - 1];
+
+		rec->e_int_clusters = insert_rec->e_cpos;
+		le32_add_cpu(&rec->e_int_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		le32_add_cpu(&rec->e_int_clusters,
+			     -le32_to_cpu(rec->e_cpos));
 
 		ret = ocfs2_journal_dirty(handle, bh);
 		if (ret)
 			mlog_errno(ret);
 
+		/* Don't touch the leaf node */
 		if (++i >= right_path->p_tree_depth)
 			break;
 
@@ -2068,7 +2101,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
 	ocfs2_update_dinode_clusters(inode, di,
-				     le32_to_cpu(insert_rec->e_clusters));
+				     le16_to_cpu(insert_rec->e_leaf_clusters));
 
 	ret = ocfs2_journal_dirty(handle, di_bh);
 	if (ret)
@@ -2089,6 +2122,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
 	int i;
 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
 
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
 						  insert_rec);
@@ -2120,7 +2155,7 @@ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
 
 	insert->ins_appending = APPEND_NONE;
 
-	BUG_ON(el->l_tree_depth);
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
 
 	if (!el->l_next_free_rec)
 		goto set_tail_append;
@@ -2134,7 +2169,8 @@ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
 	rec = &el->l_recs[i];
 
-	if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
+	if (cpos >=
+	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
 		goto set_tail_append;
 
 	return;
@@ -2242,7 +2278,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	 * The insert code isn't quite ready to deal with all cases of
 	 * left contiguousness. Specifically, if it's an insert into
 	 * the 1st record in a leaf, it will require the adjustment of
-	 * e_clusters on the last record of the path directly to it's
+	 * cluster count on the last record of the path directly to it's
 	 * left. For now, just catch that case and fool the layers
 	 * above us. This works just fine for tree_depth == 0, which
 	 * is why we allow that above.
@@ -2310,9 +2346,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
 			OCFS2_I(inode)->ip_clusters);
 
+	memset(&rec, 0, sizeof(rec));
 	rec.e_cpos = cpu_to_le32(cpos);
 	rec.e_blkno = cpu_to_le64(start_blk);
-	rec.e_clusters = cpu_to_le32(new_clusters);
+	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
 
 	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
 					  &insert);
@@ -2981,7 +3018,7 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
 		 * Check it we'll only be trimming off the end of this
 		 * cluster.
 		 */
-		if (le16_to_cpu(rec->e_clusters) > clusters_to_del)
+		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
 			goto out;
 	}
 
@@ -3061,11 +3098,11 @@ find_tail_record:
 
 		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
 		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
-		     le32_to_cpu(rec->e_clusters),
+		     ocfs2_rec_clusters(el, rec),
 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
 		     le16_to_cpu(el->l_next_free_rec));
 
-		BUG_ON(le32_to_cpu(rec->e_clusters) < clusters_to_del);
+		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
 
 		if (le16_to_cpu(el->l_tree_depth) == 0) {
 			/*
@@ -3107,13 +3144,13 @@ find_tail_record:
 				goto find_tail_record;
 			}
 
-			le32_add_cpu(&rec->e_clusters, -clusters_to_del);
+			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
 
 			/*
 			 * We'll use "new_edge" on our way back up the
 			 * tree to know what our rightmost cpos is.
 			 */
-			new_edge = le32_to_cpu(rec->e_clusters);
+			new_edge = le16_to_cpu(rec->e_leaf_clusters);
 			new_edge += le32_to_cpu(rec->e_cpos);
 
 			/*
@@ -3121,12 +3158,12 @@ find_tail_record:
 			 */
 			*delete_start = le64_to_cpu(rec->e_blkno)
 				+ ocfs2_clusters_to_blocks(inode->i_sb,
-					le32_to_cpu(rec->e_clusters));
+					le16_to_cpu(rec->e_leaf_clusters));
 
 			/*
 			 * If it's now empty, remove this record.
 			 */
-			if (le32_to_cpu(rec->e_clusters) == 0) {
+			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
 				memset(rec, 0,
 				       sizeof(struct ocfs2_extent_rec));
 				le16_add_cpu(&el->l_next_free_rec, -1);
@@ -3152,15 +3189,15 @@ find_tail_record:
 			if (new_edge == 0)
 				goto delete;
 
-			rec->e_clusters = cpu_to_le32(new_edge);
-			le32_add_cpu(&rec->e_clusters,
+			rec->e_int_clusters = cpu_to_le32(new_edge);
+			le32_add_cpu(&rec->e_int_clusters,
 				     -le32_to_cpu(rec->e_cpos));
 
 			 /*
 			  * A deleted child record should have been
 			  * caught above.
 			  */
-			 BUG_ON(le32_to_cpu(rec->e_clusters) == 0);
+			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
 		}
 
 delete:
@@ -3173,7 +3210,7 @@ delete:
 		mlog(0, "extent list container %llu, after: record %d: "
 		     "(%u, %u, %llu), next = %u.\n",
 		     (unsigned long long)bh->b_blocknr, i,
-		     le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
+		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
 		     le16_to_cpu(el->l_next_free_rec));
 
@@ -3195,7 +3232,7 @@ delete:
 
 			ocfs2_remove_from_cache(inode, bh);
 
-			BUG_ON(le32_to_cpu(el->l_recs[0].e_clusters));
+			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
 
@@ -3283,7 +3320,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	 * Lower levels depend on this never happening, but it's best
 	 * to check it up here before changing the tree.
 	 */
-	if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
+	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %lu has an empty extent record, depth %u\n",
 			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
@@ -3644,13 +3681,13 @@ start:
 
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
 	range = le32_to_cpu(el->l_recs[i].e_cpos) +
-		le32_to_cpu(el->l_recs[i].e_clusters);
+		ocfs2_rec_clusters(el, &el->l_recs[i]);
 	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
 		clusters_to_del = 0;
 	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
-		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
 	} else if (range > new_highest_cpos) {
-		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
 				  new_highest_cpos;
 	} else {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3cb39cd5e47..fbcb5934a08 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -85,4 +85,23 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
 		    u32 cpos, struct buffer_head **leaf_bh);
 
+/*
+ * Helper function to look at the # of clusters in an extent record.
+ */
+static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
+					      struct ocfs2_extent_rec *rec)
+{
+	/*
+	 * Cluster count in extent records is slightly different
+	 * between interior nodes and leaf nodes. This is to support
+	 * unwritten extents which need a flags field in leaf node
+	 * records, thus shrinking the available space for a clusters
+	 * field.
+	 */
+	if (el->l_tree_depth)
+		return le32_to_cpu(rec->e_int_clusters);
+	else
+		return le16_to_cpu(rec->e_leaf_clusters);
+}
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 937c2722b75..ea0ce41d419 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -50,13 +50,15 @@ static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
 	int ret = -1;
 	int i;
 	struct ocfs2_extent_rec *rec;
-	u32 rec_end, rec_start;
+	u32 rec_end, rec_start, clusters;
 
 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
 
 		rec_start = le32_to_cpu(rec->e_cpos);
-		rec_end = rec_start + le32_to_cpu(rec->e_clusters);
+		clusters = ocfs2_rec_clusters(el, rec);
+
+		rec_end = rec_start + clusters;
 
 		if (v_cluster >= rec_start && v_cluster < rec_end) {
 			ret = i;
@@ -98,6 +100,15 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
 	}
 
 	i = ocfs2_search_extent_list(el, v_cluster);
@@ -118,7 +129,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
 				    "record (%u, %u, 0)", inode->i_ino,
 				    le32_to_cpu(rec->e_cpos),
-				    le32_to_cpu(rec->e_clusters));
+				    ocfs2_rec_clusters(el, rec));
 			ret = -EROFS;
 			goto out;
 		}
@@ -130,7 +141,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		*p_cluster = *p_cluster + coff;
 
 		if (num_clusters)
-			*num_clusters = le32_to_cpu(rec->e_clusters) - coff;
+			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
 	}
 
 out:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f516619a374..36176018b4b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1127,7 +1127,6 @@ static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
 				       size_t count)
 {
 	int ret = 0;
-	unsigned int extent_flags;
 	u32 cpos, clusters, extent_len, phys_cpos;
 	struct super_block *sb = inode->i_sb;
 
@@ -1135,14 +1134,13 @@ static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
 	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
 
 	while (clusters) {
-		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
-					 &extent_flags);
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+		if (phys_cpos == 0) {
 			ret = 1;
 			break;
 		}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f2775..3db5de4506d 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	/* We may be deleting metadata blocks, so metadata alloc dinode +
 	   one desc. block for each possible delete. */
 	if (tree_depth && next_free == 1 &&
-	    le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+	    ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
 		credits += 1 + tree_depth;
 
 	/* update to the truncate log. */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0101974f4f..71306479c68 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -155,6 +155,12 @@
 #define OCFS2_FL_VISIBLE	(0x000100FF)	/* User visible flags */
 #define OCFS2_FL_MODIFIABLE	(0x000100FF)	/* User modifiable flags */
 
+/*
+ * Extent record flags (e_node.leaf.flags)
+ */
+#define OCFS2_EXT_UNWRITTEN	(0x01)	/* Extent is allocated but
+					 * unwritten */
+
 /*
  * ioctl commands
  */
@@ -283,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
 /*
  * On disk extent record for OCFS2
  * It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
  */
 struct ocfs2_extent_rec {
 /*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
-	__le32 e_clusters;	/* Clusters covered by this extent */
+	union {
+		__le32 e_int_clusters; /* Clusters covered by all children */
+		struct {
+			__le16 e_leaf_clusters; /* Clusters covered by this
+						   extent */
+			__u8 e_reserved1;
+			__u8 e_flags; /* Extent flags */
+		};
+	};
 	__le64 e_blkno;		/* Physical disk offset, in blocks */
 /*10*/
 };
-- 
cgit v1.2.3-70-g09d2


From 49cb8d2d496ce06869ccca2ab368ed6b0b5b979d Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 9 Mar 2007 16:21:46 -0800
Subject: ocfs2: Read from an unwritten extent returns zeros

Return an optional extent flags field from our lookup functions and wire up
callers to treat unwritten regions as holes for the purpose of returning
zeros to the user.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c      | 11 +++++++----
 fs/ocfs2/aops.c       | 21 ++++++++++++++-------
 fs/ocfs2/dir.c        |  2 +-
 fs/ocfs2/extent_map.c | 14 +++++++++++---
 fs/ocfs2/extent_map.h |  4 ++--
 fs/ocfs2/file.c       |  6 ++++--
 fs/ocfs2/inode.c      |  3 ++-
 fs/ocfs2/journal.c    |  2 +-
 fs/ocfs2/namei.c      |  3 ++-
 fs/ocfs2/slot_map.c   |  2 +-
 10 files changed, 45 insertions(+), 23 deletions(-)

(limited to 'fs/ocfs2/extent_map.c')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0eab0d32828..412a2888a3e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3487,6 +3487,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
 {
 	int i, numpages = 0, ret = 0;
 	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+	unsigned int ext_flags;
 	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index;
@@ -3499,7 +3500,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
 		goto out;
 
 	ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
-					  phys, NULL);
+					  phys, NULL, &ext_flags);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3509,6 +3510,11 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
 	if (*phys == 0)
 		goto out;
 
+	/* Tail is marked as unwritten, we can count on write to zero
+	 * in that case. */
+	if (ext_flags & OCFS2_EXT_UNWRITTEN)
+		goto out;
+
 	next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
 	index = isize >> PAGE_CACHE_SHIFT;
 	do {
@@ -3579,9 +3585,6 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
 		goto out;
 	}
 
-	/*
-	 * Truncate on an i_size boundary - nothing more to do.
-	 */
 	if (numpages == 0)
 		goto out;
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 014f4f52809..eb67c902b00 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -137,6 +137,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh_result, int create)
 {
 	int err = 0;
+	unsigned int ext_flags;
 	u64 p_blkno, past_eof;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
@@ -153,7 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL);
+	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
+					  &ext_flags);
 	if (err) {
 		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -171,7 +173,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			"ino %lu, iblock %llu\n", inode->i_ino,
 			(unsigned long long)iblock);
 
-	if (p_blkno)
+	/* Treat the unwritten extent as a hole for zeroing purposes. */
+	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 		map_bh(bh_result, inode->i_sb, p_blkno);
 
 	if (!ocfs2_sparse_alloc(osb)) {
@@ -396,7 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 
-	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
+	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
 
 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -438,6 +441,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	int ret;
 	u64 p_blkno, inode_blocks;
 	int contig_blocks;
+	unsigned int ext_flags;
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 
@@ -458,7 +462,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	/* This figures out the size of the next contiguous block, and
 	 * our logical offset */
 	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
-					  &contig_blocks);
+					  &contig_blocks, &ext_flags);
 	if (ret) {
 		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
 		     (unsigned long long)iblock);
@@ -478,8 +482,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	/*
 	 * get_more_blocks() expects us to describe a hole by clearing
 	 * the mapped bit on bh_result().
+	 *
+	 * Consider an unwritten extent as a hole.
 	 */
-	if (p_blkno)
+	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 		map_bh(bh_result, inode->i_sb, p_blkno);
 	else {
 		/*
@@ -1111,7 +1117,8 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 		}
 	}
 
-	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL);
+	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
+					  NULL);
 	if (ret < 0) {
 
 		/*
@@ -1215,7 +1222,7 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL);
+	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_meta;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c91490670ff..8d22e1e4a88 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -379,7 +379,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 
 	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
 						   (sb->s_blocksize_bits - 9)),
-					     &p_blkno, NULL);
+					     &p_blkno, NULL, NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ea0ce41d419..eef6c188770 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -70,9 +70,11 @@ static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
 }
 
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
-		       u32 *p_cluster, u32 *num_clusters)
+		       u32 *p_cluster, u32 *num_clusters,
+		       unsigned int *extent_flags)
 {
 	int ret, i;
+	unsigned int flags = 0;
 	struct buffer_head *di_bh = NULL;
 	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_dinode *di;
@@ -142,8 +144,13 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 
 		if (num_clusters)
 			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+
+		flags = rec->e_flags;
 	}
 
+	if (extent_flags)
+		*extent_flags = flags;
+
 out:
 	brelse(di_bh);
 	brelse(eb_bh);
@@ -155,7 +162,7 @@ out:
  * all while the map is in the process of being updated.
  */
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
-				int *ret_count)
+				int *ret_count, unsigned int *extent_flags)
 {
 	int ret;
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
@@ -164,7 +171,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 
 	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
 
-	ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters);
+	ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
+				 extent_flags);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 625d0ee5e04..0031c59c347 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -26,8 +26,8 @@
 #define _EXTENT_MAP_H
 
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
-		       u32 *num_clusters);
+		       u32 *num_clusters, unsigned int *extent_flags);
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
-				int *ret_count);
+				int *ret_count, unsigned int *extent_flags);
 
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 36176018b4b..f516619a374 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1127,6 +1127,7 @@ static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
 				       size_t count)
 {
 	int ret = 0;
+	unsigned int extent_flags;
 	u32 cpos, clusters, extent_len, phys_cpos;
 	struct super_block *sb = inode->i_sb;
 
@@ -1134,13 +1135,14 @@ static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
 	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
 
 	while (clusters) {
-		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len);
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		if (phys_cpos == 0) {
+		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
 			ret = 1;
 			break;
 		}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 78c99b5050d..310049bf7f6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1105,7 +1105,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 		return NULL;
 	}
 
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+					     NULL);
 	if (tmperr < 0) {
 		mlog_errno(tmperr);
 		goto fail;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 2e2e04fe973..db77e0996bb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -670,7 +670,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
 	       (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
 
 		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
-						     &p_blkno, &p_blocks);
+						     &p_blkno, &p_blocks, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 5755e074825..395859edb51 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1511,7 +1511,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks);
+	status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
+					     NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index f4416e7330e..d921a28329d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL);
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
-- 
cgit v1.2.3-70-g09d2


From 4f902c37727bbedbc0508a1477874c58ddcc9af8 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 9 Mar 2007 16:26:50 -0800
Subject: ocfs2: Fix extent lookup to return true size of holes

Initially, we had wired things to return a size '1' of holes. Cook up a
small amount of code to find the next extent and calculate the number of
clusters between the virtual offset and the next allocated extent.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c       |   3 +-
 fs/ocfs2/extent_map.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/extent_map.h |   2 +-
 fs/ocfs2/journal.c    |   6 +--
 fs/ocfs2/namei.c      |   3 +-
 5 files changed, 109 insertions(+), 12 deletions(-)

(limited to 'fs/ocfs2/extent_map.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index eb67c902b00..ff71e0b430c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -439,8 +439,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 				     struct buffer_head *bh_result, int create)
 {
 	int ret;
-	u64 p_blkno, inode_blocks;
-	int contig_blocks;
+	u64 p_blkno, inode_blocks, contig_blocks;
 	unsigned int ext_flags;
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index eef6c188770..f35e04f27f3 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -38,6 +38,97 @@
 
 #include "buffer_head_io.h"
 
+/*
+ * Return the 1st index within el which contains an extent start
+ * larger than v_cluster.
+ */
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
+				       u32 v_cluster)
+{
+	int i;
+	struct ocfs2_extent_rec *rec;
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+
+		if (v_cluster < le32_to_cpu(rec->e_cpos))
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Figure out the size of a hole which starts at v_cluster within the given
+ * extent list.
+ *
+ * If there is no more allocation past v_cluster, we return the maximum
+ * cluster size minus v_cluster.
+ *
+ * If we have in-inode extents, then el points to the dinode list and
+ * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
+ * containing el.
+ */
+static int ocfs2_figure_hole_clusters(struct inode *inode,
+				      struct ocfs2_extent_list *el,
+				      struct buffer_head *eb_bh,
+				      u32 v_cluster,
+				      u32 *num_clusters)
+{
+	int ret, i;
+	struct buffer_head *next_eb_bh = NULL;
+	struct ocfs2_extent_block *eb, *next_eb;
+
+	i = ocfs2_search_for_hole_index(el, v_cluster);
+
+	if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+
+		/*
+		 * Check the next leaf for any extents.
+		 */
+
+		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
+			goto no_more_extents;
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       le64_to_cpu(eb->h_next_leaf_blk),
+				       &next_eb_bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
+
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
+			ret = -EROFS;
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
+			goto out;
+		}
+
+		el = &next_eb->h_list;
+
+		i = ocfs2_search_for_hole_index(el, v_cluster);
+	}
+
+no_more_extents:
+	if (i == le16_to_cpu(el->l_next_free_rec)) {
+		/*
+		 * We're at the end of our existing allocation. Just
+		 * return the maximum number of clusters we could
+		 * possibly allocate.
+		 */
+		*num_clusters = UINT_MAX - v_cluster;
+	} else {
+		*num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
+	}
+
+	ret = 0;
+out:
+	brelse(next_eb_bh);
+	return ret;
+}
+
 /*
  * Return the index of the extent record which contains cluster #v_cluster.
  * -1 is returned if it was not found.
@@ -117,11 +208,19 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	if (i == -1) {
 		/*
 		 * A hole was found. Return some canned values that
-		 * callers can key on.
+		 * callers can key on. If asked for, num_clusters will
+		 * be populated with the size of the hole.
 		 */
 		*p_cluster = 0;
-		if (num_clusters)
-			*num_clusters = 1;
+		if (num_clusters) {
+			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+							 v_cluster,
+							 num_clusters);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
 	} else {
 		rec = &el->l_recs[i];
 
@@ -162,7 +261,7 @@ out:
  * all while the map is in the process of being updated.
  */
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
-				int *ret_count, unsigned int *extent_flags)
+				u64 *ret_count, unsigned int *extent_flags)
 {
 	int ret;
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 0031c59c347..1d745e174af 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -28,6 +28,6 @@
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
 		       u32 *num_clusters, unsigned int *extent_flags);
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
-				int *ret_count, unsigned int *extent_flags);
+				u64 *ret_count, unsigned int *extent_flags);
 
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index db77e0996bb..12d2340eee2 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -649,9 +649,9 @@ bail:
 static int ocfs2_force_read_journal(struct inode *inode)
 {
 	int status = 0;
-	int i, p_blocks;
-	u64 v_blkno, p_blkno;
-#define CONCURRENT_JOURNAL_FILL 32
+	int i;
+	u64 v_blkno, p_blkno, p_blocks;
+#define CONCURRENT_JOURNAL_FILL 32ULL
 	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
 
 	mlog_entry_void();
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 395859edb51..9bdbe4ae92f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1483,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 	struct buffer_head **bhs = NULL;
 	const char *c;
 	struct super_block *sb = osb->sb;
-	u64 p_blkno;
-	int p_blocks;
+	u64 p_blkno, p_blocks;
 	int virtual, blocks, status, i, bytes_left;
 
 	bytes_left = i_size_read(inode) + 1;
-- 
cgit v1.2.3-70-g09d2


From 83418978827324918a8cd25ce5227312de1d4468 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 23 Apr 2007 18:53:12 -0700
Subject: ocfs2: Cache extent records

The extent map code was ripped out earlier because of an inability to deal
with holes. This patch adds back a simpler caching scheme requiring far less
code.

Our old extent map caching was designed back when meta data block caching in
Ocfs2 didn't work very well, resulting in many disk reads. These days our
metadata caching is much better, resulting in no un-necessary disk reads. As
a result, extent caching doesn't have to be as fancy, nor does it have to
cache as many extents. Keeping the last 3 extents seen should be sufficient
to give us a small performance boost on some streaming workloads.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c      |   5 +
 fs/ocfs2/dlmglue.c    |   2 +
 fs/ocfs2/extent_map.c | 255 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/extent_map.h |  20 ++++
 fs/ocfs2/inode.c      |   2 +
 fs/ocfs2/inode.h      |   4 +
 fs/ocfs2/super.c      |   1 +
 7 files changed, 289 insertions(+)

(limited to 'fs/ocfs2/extent_map.c')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 412a2888a3e..a0c8667caa7 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2417,6 +2417,8 @@ out_add:
 	status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
 	if (status < 0)
 		mlog_errno(status);
+	else
+		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
 	if (bh)
@@ -3640,6 +3642,9 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
+
+	ocfs2_extent_map_trunc(inode, new_highest_cpos);
+
 start:
 	/*
 	 * Check that we still have allocation to delete.
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 43267eea353..27e43b0c0ea 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1613,6 +1613,8 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 	 * for the inode metadata. */
 	ocfs2_metadata_cache_purge(inode);
 
+	ocfs2_extent_map_trunc(inode, 0);
+
 	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
 		mlog(0, "Trusting LVB on inode %llu\n",
 		     (unsigned long long)oi->ip_blkno);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f35e04f27f3..ba2b2ab1c6e 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -38,6 +38,254 @@
 
 #include "buffer_head_io.h"
 
+/*
+ * The extent caching implementation is intentionally trivial.
+ *
+ * We only cache a small number of extents stored directly on the
+ * inode, so linear order operations are acceptable. If we ever want
+ * to increase the size of the extent map, then these algorithms must
+ * get smarter.
+ */
+
+void ocfs2_extent_map_init(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	oi->ip_extent_map.em_num_items = 0;
+	INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
+}
+
+static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+				      unsigned int cpos,
+				      struct ocfs2_extent_map_item **ret_emi)
+{
+	unsigned int range;
+	struct ocfs2_extent_map_item *emi;
+
+	*ret_emi = NULL;
+
+	list_for_each_entry(emi, &em->em_list, ei_list) {
+		range = emi->ei_cpos + emi->ei_clusters;
+
+		if (cpos >= emi->ei_cpos && cpos < range) {
+			list_move(&emi->ei_list, &em->em_list);
+
+			*ret_emi = emi;
+			break;
+		}
+	}
+}
+
+static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
+				   unsigned int *phys, unsigned int *len,
+				   unsigned int *flags)
+{
+	unsigned int coff;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map_item *emi;
+
+	spin_lock(&oi->ip_lock);
+
+	__ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
+	if (emi) {
+		coff = cpos - emi->ei_cpos;
+		*phys = emi->ei_phys + coff;
+		if (len)
+			*len = emi->ei_clusters - coff;
+		if (flags)
+			*flags = emi->ei_flags;
+	}
+
+	spin_unlock(&oi->ip_lock);
+
+	if (emi == NULL)
+		return -ENOENT;
+
+	return 0;
+}
+
+/*
+ * Forget about all clusters equal to or greater than cpos.
+ */
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
+{
+	struct list_head *p, *n;
+	struct ocfs2_extent_map_item *emi;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map *em = &oi->ip_extent_map;
+	LIST_HEAD(tmp_list);
+	unsigned int range;
+
+	spin_lock(&oi->ip_lock);
+	list_for_each_safe(p, n, &em->em_list) {
+		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+
+		if (emi->ei_cpos >= cpos) {
+			/* Full truncate of this record. */
+			list_move(&emi->ei_list, &tmp_list);
+			BUG_ON(em->em_num_items == 0);
+			em->em_num_items--;
+			continue;
+		}
+
+		range = emi->ei_cpos + emi->ei_clusters;
+		if (range > cpos) {
+			/* Partial truncate */
+			emi->ei_clusters = cpos - emi->ei_cpos;
+		}
+	}
+	spin_unlock(&oi->ip_lock);
+
+	list_for_each_safe(p, n, &tmp_list) {
+		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+		list_del(&emi->ei_list);
+		kfree(emi);
+	}
+}
+
+/*
+ * Is any part of emi2 contained within emi1
+ */
+static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
+				 struct ocfs2_extent_map_item *emi2)
+{
+	unsigned int range1, range2;
+
+	/*
+	 * Check if logical start of emi2 is inside emi1
+	 */
+	range1 = emi1->ei_cpos + emi1->ei_clusters;
+	if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
+		return 1;
+
+	/*
+	 * Check if logical end of emi2 is inside emi1
+	 */
+	range2 = emi2->ei_cpos + emi2->ei_clusters;
+	if (range2 > emi1->ei_cpos && range2 <= range1)
+		return 1;
+
+	return 0;
+}
+
+static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
+				  struct ocfs2_extent_map_item *src)
+{
+	dest->ei_cpos = src->ei_cpos;
+	dest->ei_phys = src->ei_phys;
+	dest->ei_clusters = src->ei_clusters;
+	dest->ei_flags = src->ei_flags;
+}
+
+/*
+ * Try to merge emi with ins. Returns 1 if merge succeeds, zero
+ * otherwise.
+ */
+static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
+					 struct ocfs2_extent_map_item *ins)
+{
+	/*
+	 * Handle contiguousness
+	 */
+	if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
+	    ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
+	    ins->ei_flags == emi->ei_flags) {
+		emi->ei_clusters += ins->ei_clusters;
+		return 1;
+	} else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
+		   (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys &&
+		   ins->ei_flags == emi->ei_flags) {
+		emi->ei_phys = ins->ei_phys;
+		emi->ei_cpos = ins->ei_cpos;
+		emi->ei_clusters += ins->ei_clusters;
+		return 1;
+	}
+
+	/*
+	 * Overlapping extents - this shouldn't happen unless we've
+	 * split an extent to change it's flags. That is exceedingly
+	 * rare, so there's no sense in trying to optimize it yet.
+	 */
+	if (ocfs2_ei_is_contained(emi, ins) ||
+	    ocfs2_ei_is_contained(ins, emi)) {
+		ocfs2_copy_emi_fields(emi, ins);
+		return 1;
+	}
+
+	/* No merge was possible. */
+	return 0;
+}
+
+/*
+ * In order to reduce complexity on the caller, this insert function
+ * is intentionally liberal in what it will accept.
+ *
+ * The only rule is that the truncate call *must* be used whenever
+ * records have been deleted. This avoids inserting overlapping
+ * records with different physical mappings.
+ */
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+				 struct ocfs2_extent_rec *rec)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map *em = &oi->ip_extent_map;
+	struct ocfs2_extent_map_item *emi, *new_emi = NULL;
+	struct ocfs2_extent_map_item ins;
+
+	ins.ei_cpos = le32_to_cpu(rec->e_cpos);
+	ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
+					       le64_to_cpu(rec->e_blkno));
+	ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
+	ins.ei_flags = rec->e_flags;
+
+search:
+	spin_lock(&oi->ip_lock);
+
+	list_for_each_entry(emi, &em->em_list, ei_list) {
+		if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
+			list_move(&emi->ei_list, &em->em_list);
+			spin_unlock(&oi->ip_lock);
+			goto out;
+		}
+	}
+
+	/*
+	 * No item could be merged.
+	 *
+	 * Either allocate and add a new item, or overwrite the last recently
+	 * inserted.
+	 */
+
+	if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
+		if (new_emi == NULL) {
+			spin_unlock(&oi->ip_lock);
+
+			new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
+			if (new_emi == NULL)
+				goto out;
+
+			goto search;
+		}
+
+		ocfs2_copy_emi_fields(new_emi, &ins);
+		list_add(&new_emi->ei_list, &em->em_list);
+		em->em_num_items++;
+		new_emi = NULL;
+	} else {
+		BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
+		emi = list_entry(em->em_list.prev,
+				 struct ocfs2_extent_map_item, ei_list);
+		list_move(&emi->ei_list, &em->em_list);
+		ocfs2_copy_emi_fields(emi, &ins);
+	}
+
+	spin_unlock(&oi->ip_lock);
+
+out:
+	if (new_emi)
+		kfree(new_emi);
+}
+
 /*
  * Return the 1st index within el which contains an extent start
  * larger than v_cluster.
@@ -174,6 +422,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	struct ocfs2_extent_rec *rec;
 	u32 coff;
 
+	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+				      num_clusters, extent_flags);
+	if (ret == 0)
+		goto out;
+
 	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
 			       &di_bh, OCFS2_BH_CACHED, inode);
 	if (ret) {
@@ -245,6 +498,8 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
 
 		flags = rec->e_flags;
+
+		ocfs2_extent_map_insert_rec(inode, rec);
 	}
 
 	if (extent_flags)
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1d745e174af..de91e3e41a2 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,6 +25,26 @@
 #ifndef _EXTENT_MAP_H
 #define _EXTENT_MAP_H
 
+struct ocfs2_extent_map_item {
+	unsigned int			ei_cpos;
+	unsigned int			ei_phys;
+	unsigned int			ei_clusters;
+	unsigned int			ei_flags;
+
+	struct list_head		ei_list;
+};
+
+#define OCFS2_MAX_EXTENT_MAP_ITEMS			3
+struct ocfs2_extent_map {
+	unsigned int			em_num_items;
+	struct list_head		em_list;
+};
+
+void ocfs2_extent_map_init(struct inode *inode);
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+				 struct ocfs2_extent_rec *rec);
+
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
 		       u32 *num_clusters, unsigned int *extent_flags);
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4bfc98c7013..21a605079c6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1008,6 +1008,8 @@ void ocfs2_clear_inode(struct inode *inode)
 			"Clear inode of %llu, inode has io markers\n",
 			(unsigned long long)oi->ip_blkno);
 
+	ocfs2_extent_map_trunc(inode, 0);
+
 	status = ocfs2_drop_inode_locks(inode);
 	if (status < 0)
 		mlog_errno(status);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aa84353d0d1..03ae075869e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,6 +26,8 @@
 #ifndef OCFS2_INODE_H
 #define OCFS2_INODE_H
 
+#include "extent_map.h"
+
 /* OCFS2 Inode Private Data */
 struct ocfs2_inode_info
 {
@@ -63,6 +65,8 @@ struct ocfs2_inode_info
 
 	struct ocfs2_caching_info	ip_metadata_cache;
 
+	struct ocfs2_extent_map		ip_extent_map;
+
 	struct inode			vfs_inode;
 };
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6ab52351943..5c9e8243691 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -942,6 +942,7 @@ static void ocfs2_inode_init_once(void *data,
 		oi->ip_flags = 0;
 		oi->ip_open_count = 0;
 		spin_lock_init(&oi->ip_lock);
+		ocfs2_extent_map_init(&oi->vfs_inode);
 		INIT_LIST_HEAD(&oi->ip_io_markers);
 		oi->ip_created_trans = 0;
 		oi->ip_last_trans = 0;
-- 
cgit v1.2.3-70-g09d2