summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Gouriou <egouriou@google.com>2011-10-27 11:43:23 -0400
committerTheodore Ts'o <tytso@mit.edu>2011-10-27 11:43:23 -0400
commit6f91bc5fda82d2c49b4f7fb29424cf6a3c7574bc (patch)
treee5670070f007c4ad5300e93a4e36fa9a802e2bd9
parent446066724c3629664e29942a00b0aee0d6b1663a (diff)
ext4: optimize ext4_ext_convert_to_initialized()
This patch introduces a fast path in ext4_ext_convert_to_initialized() for the case when the conversion can be performed by transferring the newly initialized blocks from the uninitialized extent into an adjacent initialized extent. Doing so removes the expensive invocations of memmove() which occur during extent insertion and the subsequent merge. In practice this should be the common case for clients performing append writes into files pre-allocated via fallocate(FALLOC_FL_KEEP_SIZE). In such a workload performed via direct IO and when using a suboptimal implementation of memmove() (x86_64 prior to the 2.6.39 rewrite), this patch reduces kernel CPU consumption by 32%. Two new trace points are added to ext4_ext_convert_to_initialized() to offer visibility into its operations. No exit trace point has been added due to the multiplicity of return points. This can be revisited once the upstream cleanup is backported. Signed-off-by: Eric Gouriou <egouriou@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--fs/ext4/extents.c93
-rw-r--r--fs/ext4/super.c1
-rw-r--r--include/trace/events/ext4.h82
3 files changed, 176 insertions, 0 deletions
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c2ac06cb2d4..8b6a17b6097 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2919,12 +2919,23 @@ out:
* a> There is no split required: Entire extent should be initialized
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ * - The extent pointed to by 'path' is uninitialized.
+ * - The extent pointed to by 'path' contains a superset
+ * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ * - the returned value is the number of blocks beyond map->l_lblk
+ * that are allocated and initialized.
+ * It is guaranteed to be >= map->m_len.
*/
static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
+ struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
struct ext4_extent *ex;
@@ -2944,11 +2955,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block);
+ trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+ /* Pre-conditions */
+ BUG_ON(!ext4_ext_is_uninitialized(ex));
+ BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+ BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
+
+ /*
+ * Attempt to transfer newly initialized blocks from the currently
+ * uninitialized extent to its left neighbor. This is much cheaper
+ * than an insertion followed by a merge as those involve costly
+ * memmove() calls. This is the common case in steady state for
+ * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+ * writes.
+ *
+ * Limitations of the current logic:
+ * - L1: we only deal with writes at the start of the extent.
+ * The approach could be extended to writes at the end
+ * of the extent but this scenario was deemed less common.
+ * - L2: we do not deal with writes covering the whole extent.
+ * This would require removing the extent if the transfer
+ * is possible.
+ * - L3: we only attempt to merge with an extent stored in the
+ * same extent tree node.
+ */
+ if ((map->m_lblk == ee_block) && /*L1*/
+ (map->m_len < ee_len) && /*L2*/
+ (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
+ struct ext4_extent *prev_ex;
+ ext4_lblk_t prev_lblk;
+ ext4_fsblk_t prev_pblk, ee_pblk;
+ unsigned int prev_len, write_len;
+
+ prev_ex = ex - 1;
+ prev_lblk = le32_to_cpu(prev_ex->ee_block);
+ prev_len = ext4_ext_get_actual_len(prev_ex);
+ prev_pblk = ext4_ext_pblock(prev_ex);
+ ee_pblk = ext4_ext_pblock(ex);
+ write_len = map->m_len;
+
+ /*
+ * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+ * upon those conditions:
+ * - C1: prev_ex is initialized,
+ * - C2: prev_ex is logically abutting ex,
+ * - C3: prev_ex is physically abutting ex,
+ * - C4: prev_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
+ */
+ if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
+ ((prev_lblk + prev_len) == ee_block) && /*C2*/
+ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
+ (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, prev_ex);
+
+ /* Shift the start of ex by 'write_len' blocks */
+ ex->ee_block = cpu_to_le32(ee_block + write_len);
+ ext4_ext_store_pblock(ex, ee_pblk + write_len);
+ ex->ee_len = cpu_to_le16(ee_len - write_len);
+ ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+ /* Extend prev_ex by 'write_len' blocks */
+ prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+ /* Mark the block containing both extents as dirty */
+ ext4_ext_dirty(handle, inode, path + depth);
+
+ /* Update path to point to the right extent */
+ path[depth].p_ext = prev_ex;
+
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = write_len;
+ goto out;
+ }
+ }
+
WARN_ON(map->m_lblk < ee_block);
/*
* It is safe to convert extent to initialized via explicit
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dcc460537bc..9953d80145a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
#include <linux/freezer.h>
#include "ext4.h"
+#include "ext4_extents.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index c9a341e385a..748ff7cbe55 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -9,6 +9,7 @@
struct ext4_allocation_context;
struct ext4_allocation_request;
+struct ext4_extent;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
@@ -1394,6 +1395,87 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
TP_ARGS(inode)
);
+/* 'ux' is the uninitialized extent. */
+TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
+ TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+ struct ext4_extent *ux),
+
+ TP_ARGS(inode, map, ux),
+
+ TP_STRUCT__entry(
+ __field( ino_t, ino )
+ __field( dev_t, dev )
+ __field( ext4_lblk_t, m_lblk )
+ __field( unsigned, m_len )
+ __field( ext4_lblk_t, u_lblk )
+ __field( unsigned, u_len )
+ __field( ext4_fsblk_t, u_pblk )
+ ),
+
+ TP_fast_assign(
+ __entry->ino = inode->i_ino;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->m_lblk = map->m_lblk;
+ __entry->m_len = map->m_len;
+ __entry->u_lblk = le32_to_cpu(ux->ee_block);
+ __entry->u_len = ext4_ext_get_actual_len(ux);
+ __entry->u_pblk = ext4_ext_pblock(ux);
+ ),
+
+ TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
+ "u_pblk %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->m_lblk, __entry->m_len,
+ __entry->u_lblk, __entry->u_len, __entry->u_pblk)
+);
+
+/*
+ * 'ux' is the uninitialized extent.
+ * 'ix' is the initialized extent to which blocks are transferred.
+ */
+TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
+ TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+ struct ext4_extent *ux, struct ext4_extent *ix),
+
+ TP_ARGS(inode, map, ux, ix),
+
+ TP_STRUCT__entry(
+ __field( ino_t, ino )
+ __field( dev_t, dev )
+ __field( ext4_lblk_t, m_lblk )
+ __field( unsigned, m_len )
+ __field( ext4_lblk_t, u_lblk )
+ __field( unsigned, u_len )
+ __field( ext4_fsblk_t, u_pblk )
+ __field( ext4_lblk_t, i_lblk )
+ __field( unsigned, i_len )
+ __field( ext4_fsblk_t, i_pblk )
+ ),
+
+ TP_fast_assign(
+ __entry->ino = inode->i_ino;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->m_lblk = map->m_lblk;
+ __entry->m_len = map->m_len;
+ __entry->u_lblk = le32_to_cpu(ux->ee_block);
+ __entry->u_len = ext4_ext_get_actual_len(ux);
+ __entry->u_pblk = ext4_ext_pblock(ux);
+ __entry->i_lblk = le32_to_cpu(ix->ee_block);
+ __entry->i_len = ext4_ext_get_actual_len(ix);
+ __entry->i_pblk = ext4_ext_pblock(ix);
+ ),
+
+ TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
+ "u_lblk %u u_len %u u_pblk %llu "
+ "i_lblk %u i_len %u i_pblk %llu ",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->m_lblk, __entry->m_len,
+ __entry->u_lblk, __entry->u_len, __entry->u_pblk,
+ __entry->i_lblk, __entry->i_len, __entry->i_pblk)
+);
+
DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
unsigned int len, unsigned int flags),