From 4d555e3de4624e6c211c4e1e46aa52f7632a149c Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Sat, 28 Sep 2013 20:32:14 -0300 Subject: 9p: remove unused 'p9_client' struct pointer Get rid of the useless '*clnt' variable. Signed-off-by: Geyslan G. Bem Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a0df3e73c2b..27782bb7f4f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -461,14 +461,12 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, int n; loff_t i_size; size_t total = 0; - struct p9_client *clnt; loff_t origin = *offset; unsigned long pg_start, pg_end; p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); - clnt = fid->clnt; do { n = p9_client_write(fid, NULL, data+total, origin+total, count); if (n <= 0) -- cgit v1.2.3-70-g09d2 From bd126e5e58bd5e6561e39c1364d57497cf7e118e Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Sat, 28 Sep 2013 20:32:13 -0300 Subject: 9p: remove unused 'p9_fid' struct pointer Get rid of the useless '*fid' variable. Signed-off-by: Geyslan G. Bem Signed-off-by: Eric Van Hensbergen --- fs/9p/cache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 2b7a032c37b..a69260f2755 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -239,13 +239,12 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) { struct v9fs_inode *v9inode = V9FS_I(inode); - struct p9_fid *fid; if (!v9inode->fscache) return; spin_lock(&v9inode->fscache_lock); - fid = filp->private_data; + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) v9fs_cache_inode_flush_cookie(inode); else -- cgit v1.2.3-70-g09d2 From 5d62a378ffff85ac9e581c228a9b505fb486e697 Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Sat, 28 Sep 2013 20:32:18 -0300 Subject: 9p: remove never used return variable Get rid of the useless 'err' variable, since the return is treated farther down without the use of it. Signed-off-by: Geyslan G. Bem Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode_dotl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4c10edec26a..589ef6afc16 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -473,13 +473,11 @@ static int v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); - err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { generic_fillattr(dentry->d_inode, stat); -- cgit v1.2.3-70-g09d2 From 08e15f2df3496f8b2f83bc0a9c10dff89f044358 Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Sat, 28 Sep 2013 20:32:16 -0300 Subject: 9p: remove unused 'super_block' struct pointer Get rid of the useless '*sb' variable. Signed-off-by: Geyslan G. 
Bem Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4e65aa90334..5bbb5a02116 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -779,7 +779,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *res; - struct super_block *sb; struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; struct inode *inode; @@ -791,7 +790,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); /* We can walk d_parent because we hold the dir->i_mutex */ dfid = v9fs_fid_lookup(dentry->d_parent); -- cgit v1.2.3-70-g09d2 From fae7469609aa4229aee20dd68453a0a6df5bd52e Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Sat, 28 Sep 2013 20:32:15 -0300 Subject: 9p: remove useless assignment There is no use of pointer 'fid' before the next assignment. Signed-off-by: Geyslan G. Bem Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 5bbb5a02116..af7d531bdec 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -861,7 +861,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, return finish_no_open(file, res); err = 0; - fid = NULL; + v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); fid = v9fs_create(v9ses, dir, dentry, NULL, perm, -- cgit v1.2.3-70-g09d2 From 72fe18c9c71c1911b02d4765661ea1522053b5df Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Sat, 28 Sep 2013 20:32:19 -0300 Subject: 9p: remove useless variable and assignment There is no use of pointer 'v9ses'. Get rid of useless 'retval' assignment. Signed-off-by: Geyslan G. Bem Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode_dotl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 589ef6afc16..ecacec098fb 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -554,7 +554,6 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) { int retval; - struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_iattr_dotl p9attr; struct inode *inode = dentry->d_inode; @@ -575,8 +574,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) p9attr.mtime_sec = iattr->ia_mtime.tv_sec; p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; - retval = -EPERM; - v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); -- cgit v1.2.3-70-g09d2 From bdd5c28dcb8330b9074404cc92a0b83aae5606a9 Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Mon, 21 Oct 2013 16:47:58 -0300 Subject: 9p: fix return value in case in v9fs_fid_xattr_set() In case of error in p9_client_write, the function v9fs_fid_xattr_set should return its negative value, which was never done. In case of success it only returned 0. Now it returns the 'offset' variable (write_count total). Signed-off-by: Geyslan G.
Bem Signed-off-by: Eric Van Hensbergen --- fs/9p/xattr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 3c28cdfb8c4..04133a1fd9c 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -138,8 +138,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, if (retval < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", retval); - p9_client_clunk(fid); - return retval; + goto err; } msize = fid->clnt->msize; while (value_len) { @@ -152,12 +151,15 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, if (write_count < 0) { /* error in xattr write */ retval = write_count; - break; + goto err; } offset += write_count; value_len -= write_count; } - return p9_client_clunk(fid); + retval = offset; +err: + p9_client_clunk(fid); + return retval; } ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) -- cgit v1.2.3-70-g09d2 From dd2a0a35c3121a88af3c2228a0bc497b5e4a7f56 Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Mon, 21 Oct 2013 15:53:45 -0300 Subject: 9p: remove useless 'name' variable and assignment There is no use of 'name' pointer. Get rid of its useless assignment. Signed-off-by: Geyslan G. Bem Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode_dotl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index ecacec098fb..98013068f35 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -763,7 +763,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; - char *name; struct dentry *dir_dentry; struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; @@ -781,8 +780,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - name = (char *) dentry->d_name.name; - err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); if (err < 0) { -- cgit v1.2.3-70-g09d2 From 33879d4512c021ae65be9706608dacb36b4687b1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2013 22:33:32 -0800 Subject: block: submit_bio_wait() conversions It was being open coded in a few places. 
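For readers who have not seen the helper, the sketch below shows the shape of the conversion: the completion/BIO_UPTODATE dance on top is what each call site open coded, and submit_bio_wait() collapses it into one call that returns 0 on success or a negative error. The example_* names are hypothetical and only illustrate the pattern; the kernel interfaces used (submit_bio, submit_bio_wait, DECLARE_COMPLETION_ONSTACK, BIO_UPTODATE) are the ones appearing in the hunks that follow.

#include <linux/bio.h>
#include <linux/completion.h>

/* Hypothetical end_io used by the open-coded variant below. */
static void example_end_io(struct bio *bio, int err)
{
        if (err)
                clear_bit(BIO_UPTODATE, &bio->bi_flags);
        complete(bio->bi_private);
}

/* Before: every caller open codes submit + wait + error check. */
static int example_sync_io_open_coded(int rw, struct bio *bio)
{
        DECLARE_COMPLETION_ONSTACK(done);

        bio->bi_private = &done;
        bio->bi_end_io = example_end_io;
        submit_bio(rw, bio);
        wait_for_completion(&done);

        return bio_flagged(bio, BIO_UPTODATE) ? 0 : -EIO;
}

/* After: the helper performs the same work and returns 0 or a negative error. */
static int example_sync_io_helper(int rw, struct bio *bio)
{
        return submit_bio_wait(rw, bio);
}

The blkdev_issue_flush() and sync_page_io() hunks below are direct instances of this rewrite.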
Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Joern Engel Cc: Prasad Joshi Cc: Neil Brown Cc: Chris Mason Acked-by: NeilBrown --- block/blk-flush.c | 19 +------------------ drivers/md/md.c | 14 +------------- fs/btrfs/check-integrity.c | 32 +++++++++++++------------------- fs/btrfs/check-integrity.h | 2 ++ fs/btrfs/extent_io.c | 12 +----------- fs/btrfs/scrub.c | 33 ++++----------------------------- fs/hfsplus/wrapper.c | 17 +---------------- fs/logfs/dev_bdev.c | 13 +------------ 8 files changed, 24 insertions(+), 118 deletions(-) (limited to 'fs') diff --git a/block/blk-flush.c b/block/blk-flush.c index 331e627301e..fb6f3c0ffa4 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -502,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q) } } -static void bio_end_flush(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); -} - /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for @@ -526,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err) int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; int ret = 0; @@ -548,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_flush; bio->bi_bdev = bdev; - bio->bi_private = &wait; - bio_get(bio); - submit_bio(WRITE_FLUSH, bio); - wait_for_completion_io(&wait); + ret = submit_bio_wait(WRITE_FLUSH, bio); /* * The driver must store the error location in ->bi_sector, if @@ -564,9 +550,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (error_sector) *error_sector = bio->bi_sector; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); return ret; } diff --git a/drivers/md/md.c b/drivers/md/md.c index b6b7a2866c9..739b1ec54e2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -776,20 +776,12 @@ void md_super_wait(struct mddev *mddev) finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); - struct completion event; int ret; - rw |= REQ_SYNC; - bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 
rdev->meta_bdev : rdev->bdev; if (metadata_op) @@ -801,11 +793,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, else bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, &bio->bi_flags); bio_put(bio); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b50764bef14..131d82800b3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -333,7 +333,6 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); -static void btrfsic_complete_bio_end_io(struct bio *bio, int err); static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, @@ -1687,7 +1686,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, for (i = 0; i < num_pages;) { struct bio *bio; unsigned int j; - DECLARE_COMPLETION_ONSTACK(complete); bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { @@ -1698,8 +1696,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, } bio->bi_bdev = block_ctx->dev->bdev; bio->bi_sector = dev_bytenr >> 9; - bio->bi_end_io = btrfsic_complete_bio_end_io; - bio->bi_private = &complete; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -1712,12 +1708,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, "btrfsic: error, failed to add a single page!\n"); return -1; } - submit_bio(READ, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (submit_bio_wait(READ, bio)) { printk(KERN_INFO "btrfsic: read error at logical %llu dev %s!\n", block_ctx->start, block_ctx->dev->name); @@ -1740,11 +1731,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, return block_ctx->len; } -static void btrfsic_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static void btrfsic_dump_database(struct btrfsic_state *state) { struct list_head *elem_all; @@ -3008,14 +2994,12 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) return submit_bh(rw, bh); } -void btrfsic_submit_bio(int rw, struct bio *bio) +static void __btrfsic_submit_bio(int rw, struct bio *bio) { struct btrfsic_dev_state *dev_state; - if (!btrfsic_is_initialized) { - submit_bio(rw, bio); + if (!btrfsic_is_initialized) return; - } mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before @@ -3106,10 +3090,20 @@ void btrfsic_submit_bio(int rw, struct bio *bio) } leave: mutex_unlock(&btrfsic_mutex); +} +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); submit_bio(rw, bio); } +int btrfsic_submit_bio_wait(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); + return submit_bio_wait(rw, bio); +} + int btrfsic_mount(struct btrfs_root *root, struct btrfs_fs_devices *fs_devices, int including_extent_data, u32 print_mask) diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 8b59175cc50..13b8566c97a 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ 
-22,9 +22,11 @@ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY int btrfsic_submit_bh(int rw, struct buffer_head *bh); void btrfsic_submit_bio(int rw, struct bio *bio); +int btrfsic_submit_bio_wait(int rw, struct bio *bio); #else #define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio +#define btrfsic_submit_bio_wait submit_bio_wait #endif int btrfsic_mount(struct btrfs_root *root, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8e457fca0a0..ff43802a7c8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1952,11 +1952,6 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, return err; } -static void repair_io_failure_callback(struct bio *bio, int err) -{ - complete(bio->bi_private); -} - /* * this bypasses the standard btrfs submit functions deliberately, as * the standard behavior is to write all copies in a raid setup. here we only @@ -1973,7 +1968,6 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, { struct bio *bio; struct btrfs_device *dev; - DECLARE_COMPLETION_ONSTACK(compl); u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; @@ -1990,8 +1984,6 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_private = &compl; - bio->bi_end_io = repair_io_failure_callback; bio->bi_size = 0; map_length = length; @@ -2012,10 +2004,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start - page_offset(page)); - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { /* try to remap that extent elsewhere? 
*/ bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 561e2f16ba3..1fd3f33c330 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -208,7 +208,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int is_metadata, int have_csum, const u8 *csum, u64 generation, u16 csum_size); -static void scrub_complete_bio_end_io(struct bio *bio, int err); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write); @@ -1294,7 +1293,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; struct scrub_page *page = sblock->pagev[page_num]; - DECLARE_COMPLETION_ONSTACK(complete); if (page->dev->bdev == NULL) { page->io_error = 1; @@ -1311,18 +1309,11 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } bio->bi_bdev = page->dev->bdev; bio->bi_sector = page->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; bio_add_page(bio, page->page, PAGE_SIZE, 0); - btrfsic_submit_bio(READ, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - - page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (btrfsic_submit_bio_wait(READ, bio)) sblock->no_io_error_seen = 0; + bio_put(bio); } @@ -1391,11 +1382,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, sblock->checksum_error = 1; } -static void scrub_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write) @@ -1430,7 +1416,6 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, sblock_bad->checksum_error || page_bad->io_error) { struct bio *bio; int ret; - DECLARE_COMPLETION_ONSTACK(complete); if (!page_bad->dev->bdev) { printk_ratelimited(KERN_WARNING @@ -1443,19 +1428,14 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; bio->bi_bdev = page_bad->dev->bdev; bio->bi_sector = page_bad->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { bio_put(bio); return -EIO; } - btrfsic_submit_bio(WRITE, bio); - /* this will also unplug the queue */ - wait_for_completion(&complete); - if (!bio_flagged(bio, BIO_UPTODATE)) { + if (btrfsic_submit_bio_wait(WRITE, bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -3375,7 +3355,6 @@ static int write_page_nocow(struct scrub_ctx *sctx, struct bio *bio; struct btrfs_device *dev; int ret; - DECLARE_COMPLETION_ONSTACK(compl); dev = sctx->wr_ctx.tgtdev; if (!dev) @@ -3392,8 +3371,6 @@ static int write_page_nocow(struct scrub_ctx *sctx, spin_unlock(&sctx->stat_lock); return -ENOMEM; } - bio->bi_private = &compl; - bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_size = 0; bio->bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; @@ -3404,10 +3381,8 @@ leave_with_eio: btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if 
(btrfsic_submit_bio_wait(WRITE_SYNC, bio)) goto leave_with_eio; bio_put(bio); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b51a6079108..e9a97a0d431 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -24,13 +24,6 @@ struct hfsplus_wd { u16 embed_count; }; -static void hfsplus_end_io_sync(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - complete(bio->bi_private); -} - /* * hfsplus_submit_bio - Perfrom block I/O * @sb: super block of volume for I/O @@ -53,7 +46,6 @@ static void hfsplus_end_io_sync(struct bio *bio, int err) int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, int rw) { - DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; int ret = 0; u64 io_size; @@ -73,8 +65,6 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = sector; bio->bi_bdev = sb->s_bdev; - bio->bi_end_io = hfsplus_end_io_sync; - bio->bi_private = &wait; if (!(rw & WRITE) && data) *data = (u8 *)buf + offset; @@ -93,12 +83,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, buf = (u8 *)buf + len; } - submit_bio(rw, bio); - wait_for_completion(&wait); - - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - + ret = submit_bio_wait(rw, bio); out: bio_put(bio); return ret < 0 ? ret : 0; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 550475ca6a0..0f95f0d0b31 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -14,16 +14,10 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static void request_complete(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static int sync_request(struct page *page, struct block_device *bdev, int rw) { struct bio bio; struct bio_vec bio_vec; - struct completion complete; bio_init(&bio); bio.bi_max_vecs = 1; @@ -35,13 +29,8 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; bio.bi_sector = page->index * (PAGE_SIZE >> 9); - init_completion(&complete); - bio.bi_private = &complete; - bio.bi_end_io = request_complete; - submit_bio(rw, &bio); - wait_for_completion(&complete); - return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; + return submit_bio_wait(rw, &bio); } static int bdev_readpage(void *_sb, struct page *page) -- cgit v1.2.3-70-g09d2 From 2c30c71bd653afcbed7f6754e8fe3d16e0e708a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2013 12:20:26 -0800 Subject: block: Convert various code to bio_for_each_segment() With immutable biovecs we don't want code accessing bi_io_vec directly - the uses this patch changes weren't incorrect since they all own the bio, but it makes the code harder to audit for no good reason - also, this will help with multipage bvecs later. 
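Concretely, the conversion replaces the hand-rolled backwards walk over bi_io_vec with the bio_for_each_segment_all() iterator. The sketch below uses hypothetical write-completion handlers condensed from the patterns in the hunks that follow; only the shape of the before/after code is meant to be authoritative.

#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/prefetch.h>

/* Before: index bi_io_vec directly and walk it backwards by hand. */
static void example_write_end_io_open_coded(struct bio *bio, int err)
{
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

        do {
                struct page *page = bvec->bv_page;

                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);

                if (!uptodate)
                        SetPageError(page);
                end_page_writeback(page);
        } while (bvec >= bio->bi_io_vec);

        bio_put(bio);
}

/* After: the iterator hides the bvec array layout; err is checked directly. */
static void example_write_end_io_iterator(struct bio *bio, int err)
{
        struct bio_vec *bvec;
        int i;

        bio_for_each_segment_all(bvec, bio, i) {
                struct page *page = bvec->bv_page;

                if (err)
                        SetPageError(page);
                end_page_writeback(page);
        }

        bio_put(bio);
}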
Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Alexander Viro Cc: Chris Mason Cc: Jaegeuk Kim Cc: Joern Engel Cc: Prasad Joshi Cc: Trond Myklebust --- fs/btrfs/compression.c | 10 ++++------ fs/btrfs/disk-io.c | 11 ++++------- fs/btrfs/extent_io.c | 35 +++++++++++++---------------------- fs/btrfs/inode.c | 15 ++++++--------- fs/ext4/page-io.c | 4 ++-- fs/f2fs/data.c | 13 +++++-------- fs/f2fs/segment.c | 12 +++++------- fs/logfs/dev_bdev.c | 18 +++++++----------- fs/mpage.c | 17 ++++++++--------- fs/nfs/blocklayout/blocklayout.c | 34 +++++++++++++--------------------- 10 files changed, 67 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1499b27b418..eac6784e43d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -201,18 +201,16 @@ csum_failed: if (cb->errors) { bio_io_error(cb->orig_bio); } else { - int bio_index = 0; - struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + int i; + struct bio_vec *bvec; /* * we have verified the checksum already, set page * checked so the end_io handlers know about it */ - while (bio_index < cb->orig_bio->bi_vcnt) { + bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); - bvec++; - bio_index++; - } + bio_endio(cb->orig_bio, 0); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8072cfa8a3b..5a10c61adaf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -842,20 +842,17 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, static int btree_csum_one_bio(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct bio_vec *bvec; struct btrfs_root *root; - int ret = 0; + int i, ret = 0; - WARN_ON(bio->bi_vcnt <= 0); - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root, bvec->bv_page); if (ret) break; - bio_index++; - bvec++; } + return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ff43802a7c8..8b5f9e1d1f0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2332,12 +2332,13 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio, int err) { - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; struct extent_io_tree *tree; u64 start; u64 end; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -2355,14 +2356,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (end_extent_writepage(page, err, start, end)) continue; end_page_writeback(page); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); } @@ -2392,9 +2390,8 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, */ static void end_bio_extent_readpage(struct bio *bio, int err) { + struct bio_vec *bvec; int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2405,11 +2402,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 extent_len = 0; int mirror; int ret; + int i; if (err) uptodate = 
0; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; @@ -2433,9 +2431,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) end = start + bvec->bv_offset + bvec->bv_len - 1; len = bvec->bv_len; - if (++bvec <= bvec_end) - prefetchw(&bvec->bv_page->flags); - mirror = io_bio->mirror_num; if (likely(uptodate && tree->ops && tree->ops->readpage_end_io_hook)) { @@ -2516,7 +2511,7 @@ readpage_ok: extent_start = start; extent_len = end + 1 - start; } - } while (bvec <= bvec_end); + } if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, @@ -2547,7 +2542,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, } if (bio) { - bio->bi_size = 0; bio->bi_bdev = bdev; bio->bi_sector = first_sector; btrfs_bio = btrfs_io_bio(bio); @@ -3410,20 +3404,18 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) static void end_bio_extent_buffer_writepage(struct bio *bio, int err) { - int uptodate = err == 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; struct extent_buffer *eb; - int done; + int i, done; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - bvec--; eb = (struct extent_buffer *)page->private; BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); ClearPageUptodate(page); SetPageError(page); @@ -3435,10 +3427,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) continue; end_extent_buffer_writeback(eb); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); - } static int write_one_eb(struct extent_buffer *eb, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d03..d6630dc130b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6779,17 +6779,16 @@ unlock_err: static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; u32 *csums = (u32 *)dip->csum; - int index = 0; u64 start; + int i; start = dip->logical_offset; - do { + bio_for_each_segment_all(bvec, bio, i) { if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { struct page *page = bvec->bv_page; char *kaddr; @@ -6805,18 +6804,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != csums[index]) { + if (csum != csums[i]) { btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, - csums[index]); + csums[i]); err = -EIO; } } start += bvec->bv_len; - bvec++; - index++; - } while (bvec <= bvec_end); + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d488f80ee32..a31e4da1450 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio) { int i; int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; - for (i = 0; i < bio->bi_vcnt; i++) { - struct bio_vec *bvec = &bio->bi_io_vec[i]; + bio_for_each_segment_all(bvec, bio, 
i) { struct page *page = bvec->bv_page; struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa3438c571f..a4949096cf4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -351,23 +351,20 @@ repeat: static void read_end_io(struct bio *bio, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate) { + if (!err) { SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); } unlock_page(page); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa284d39719..a90c6bc0d12 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -575,16 +575,14 @@ static const struct segment_allocation default_salloc_ops = { static void f2fs_end_io_write(struct bio *bio, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct bio_private *p = bio->bi_private; + struct bio_vec *bvec; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (!uptodate) { + if (err) { SetPageError(page); if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); @@ -593,7 +591,7 @@ static void f2fs_end_io_write(struct bio *bio, int err) } end_page_writeback(page); dec_page_count(p->sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); + } if (p->is_sync) complete(p->wait); diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 0f95f0d0b31..e6df3be3b31 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -56,22 +56,18 @@ static DECLARE_WAIT_QUEUE_HEAD(wq); static void writeseg_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; struct super_block *sb = bio->bi_private; struct logfs_super *super = logfs_super(sb); - struct page *page; BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ BUG_ON(err); - BUG_ON(bio->bi_vcnt == 0); - do { - page = bvec->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - end_page_writeback(page); - page_cache_release(page); - } while (bvec >= bio->bi_io_vec); + + bio_for_each_segment_all(bvec, bio, i) { + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } bio_put(bio); if (atomic_dec_and_test(&super->s_pending_writes)) wake_up(&wq); diff --git a/fs/mpage.c b/fs/mpage.c index 0face1c4d4c..dd6d5878f4d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -43,16 +43,14 @@ */ static void mpage_end_io(struct bio *bio, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bv; + int i; - do { - struct page *page = bvec->bv_page; + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); if (bio_data_dir(bio) == READ) { - if (uptodate) { + if (!err) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -60,14 +58,15 @@ static void mpage_end_io(struct bio *bio, int err) } unlock_page(page); } else { /* 
bio_data_dir(bio) == WRITE */ - if (!uptodate) { + if (err) { SetPageError(page); if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); } end_page_writeback(page); } - } while (bvec >= bio->bi_io_vec); + } + bio_put(bio); } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index e242bbf7297..da768923bf7 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -201,18 +201,14 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, static void bl_end_io_read(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; - do { - struct page *page = bvec->bv_page; + if (!err) + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (uptodate) - SetPageUptodate(page); - } while (bvec >= bio->bi_io_vec); - if (!uptodate) { + if (err) { struct nfs_read_data *rdata = par->data; struct nfs_pgio_header *header = rdata->header; @@ -383,20 +379,16 @@ static void mark_extents_written(struct pnfs_block_layout *bl, static void bl_end_io_write_zero(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; + struct bio_vec *bvec; + int i; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); + bio_for_each_segment_all(bvec, bio, i) { /* This is the zeroing page we added */ - end_page_writeback(page); - page_cache_release(page); - } while (bvec >= bio->bi_io_vec); + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } - if (unlikely(!uptodate)) { + if (unlikely(err)) { struct nfs_write_data *data = par->data; struct nfs_pgio_header *header = data->header; -- cgit v1.2.3-70-g09d2 From 4f024f3797c43cb4b73cd2c50cec728842d0e49e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2013 15:44:27 -0700 Subject: block: Abstract out bvec iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Geert Uytterhoeven Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Ed L. Cashin" Cc: Nick Piggin Cc: Lars Ellenberg Cc: Jiri Kosina Cc: Matthew Wilcox Cc: Geoff Levand Cc: Yehuda Sadeh Cc: Sage Weil Cc: Alex Elder Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris Cc: Philip Kelleher Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Cc: dm-devel@redhat.com Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux390@de.ibm.com Cc: Boaz Harrosh Cc: Benny Halevy Cc: "James E.J. Bottomley" Cc: Greg Kroah-Hartman Cc: "Nicholas A. 
Bellinger" Cc: Alexander Viro Cc: Chris Mason Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jaegeuk Kim Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: Prasad Joshi Cc: Trond Myklebust Cc: KONISHI Ryusuke Cc: Mark Fasheh Cc: Joel Becker Cc: Ben Myers Cc: xfs@oss.sgi.com Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Herton Ronaldo Krzesinski Cc: Ben Hutchings Cc: Andrew Morton Cc: Guo Chao Cc: Tejun Heo Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Wei Yongjun Cc: "Roger Pau Monné" Cc: Jan Beulich Cc: Stefano Stabellini Cc: Ian Campbell Cc: Sebastian Ott Cc: Christian Borntraeger Cc: Minchan Kim Cc: Jiang Liu Cc: Nitin Gupta Cc: Jerome Marchand Cc: Joe Perches Cc: Peng Tao Cc: Andy Adamson Cc: fanchaoting Cc: Jie Liu Cc: Sunil Mushran Cc: "Martin K. Petersen" Cc: Namjae Jeon Cc: Pankaj Kumar Cc: Dan Magenheimer Cc: Mel Gorman 6 --- Documentation/block/biodoc.txt | 7 +-- arch/m68k/emu/nfblock.c | 2 +- arch/powerpc/sysdev/axonram.c | 3 +- block/blk-core.c | 36 ++++++------ block/blk-flush.c | 2 +- block/blk-lib.c | 12 ++-- block/blk-map.c | 6 +- block/blk-merge.c | 4 +- block/blk-mq.c | 2 +- block/blk-throttle.c | 14 ++--- block/elevator.c | 2 +- drivers/block/aoe/aoecmd.c | 6 +- drivers/block/brd.c | 4 +- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 2 +- drivers/block/drbd/drbd_receiver.c | 6 +- drivers/block/drbd/drbd_req.c | 6 +- drivers/block/drbd/drbd_req.h | 2 +- drivers/block/floppy.c | 4 +- drivers/block/loop.c | 4 +- drivers/block/mtip32xx/mtip32xx.c | 7 ++- drivers/block/nvme-core.c | 25 ++++---- drivers/block/pktcdvd.c | 54 +++++++++-------- drivers/block/ps3disk.c | 2 +- drivers/block/ps3vram.c | 2 +- drivers/block/rbd.c | 21 +++---- drivers/block/rsxx/dev.c | 6 +- drivers/block/rsxx/dma.c | 4 +- drivers/block/umem.c | 9 +-- drivers/block/xen-blkback/blkback.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/md/bcache/btree.c | 4 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/io.c | 26 ++++----- drivers/md/bcache/journal.c | 12 ++-- drivers/md/bcache/movinggc.c | 4 +- drivers/md/bcache/request.c | 58 +++++++++--------- drivers/md/bcache/super.c | 16 ++--- drivers/md/bcache/util.c | 4 +- drivers/md/bcache/writeback.c | 6 +- drivers/md/bcache/writeback.h | 2 +- drivers/md/dm-bio-record.h | 12 ++-- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-cache-policy-mq.c | 4 +- drivers/md/dm-cache-target.c | 22 +++---- drivers/md/dm-crypt.c | 19 +++--- drivers/md/dm-delay.c | 7 ++- drivers/md/dm-flakey.c | 7 ++- drivers/md/dm-io.c | 6 +- drivers/md/dm-linear.c | 3 +- drivers/md/dm-raid1.c | 16 ++--- drivers/md/dm-region-hash.c | 3 +- drivers/md/dm-snap.c | 18 +++--- drivers/md/dm-stripe.c | 13 +++-- drivers/md/dm-switch.c | 4 +- drivers/md/dm-thin.c | 22 +++---- drivers/md/dm-verity.c | 8 +-- drivers/md/dm.c | 25 ++++---- drivers/md/faulty.c | 19 +++--- drivers/md/linear.c | 12 ++-- drivers/md/md.c | 10 ++-- drivers/md/multipath.c | 13 +++-- drivers/md/raid0.c | 16 ++--- drivers/md/raid1.c | 75 ++++++++++++------------ drivers/md/raid10.c | 91 ++++++++++++++++------------- drivers/md/raid5.c | 72 ++++++++++++----------- drivers/s390/block/dcssblk.c | 5 +- drivers/s390/block/xpram.c | 9 +-- drivers/scsi/osd/osd_initiator.c | 2 +- drivers/staging/lustre/lustre/llite/lloop.c | 12 ++-- drivers/staging/zram/zram_drv.c | 14 +++-- drivers/target/target_core_iblock.c | 2 +- fs/bio-integrity.c | 8 +-- fs/bio.c | 56 +++++++++--------- fs/btrfs/check-integrity.c | 8 +-- 
fs/btrfs/compression.c | 17 +++--- fs/btrfs/extent_io.c | 14 ++--- fs/btrfs/file-item.c | 19 +++--- fs/btrfs/inode.c | 22 +++---- fs/btrfs/raid56.c | 22 +++---- fs/btrfs/scrub.c | 12 ++-- fs/btrfs/volumes.c | 12 ++-- fs/buffer.c | 12 ++-- fs/direct-io.c | 4 +- fs/ext4/page-io.c | 4 +- fs/f2fs/data.c | 2 +- fs/f2fs/segment.c | 2 +- fs/gfs2/lops.c | 2 +- fs/gfs2/ops_fstype.c | 2 +- fs/hfsplus/wrapper.c | 2 +- fs/jfs/jfs_logmgr.c | 12 ++-- fs/jfs/jfs_metapage.c | 9 +-- fs/logfs/dev_bdev.c | 20 +++---- fs/mpage.c | 2 +- fs/nfs/blocklayout/blocklayout.c | 9 +-- fs/nilfs2/segbuf.c | 3 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_buf.c | 4 +- include/linux/bio.h | 16 ++--- include/linux/blk_types.h | 19 +++--- include/trace/events/bcache.h | 26 ++++----- include/trace/events/block.h | 26 ++++----- include/trace/events/f2fs.h | 4 +- kernel/power/block_io.c | 2 +- kernel/trace/blktrace.c | 15 ++--- mm/page_io.c | 10 ++-- 107 files changed, 700 insertions(+), 638 deletions(-) (limited to 'fs') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 8df5e8e6dce..2101e718670 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -447,14 +447,13 @@ struct bio_vec { * main unit of I/O for the block layer and lower layers (ie drivers) */ struct bio { - sector_t bi_sector; struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; /* target device */ unsigned long bi_flags; /* status, command, etc */ unsigned long bi_rw; /* low bits: r/w, high: priority */ unsigned int bi_vcnt; /* how may bio_vec's */ - unsigned int bi_idx; /* current index into bio_vec array */ + struct bvec_iter bi_iter; /* current index into bio_vec array */ unsigned int bi_size; /* total size in bytes */ unsigned short bi_phys_segments; /* segments after physaddr coalesce*/ @@ -480,7 +479,7 @@ With this multipage bio design: - Code that traverses the req list can find all the segments of a bio by using rq_for_each_segment. This handles the fact that a request has multiple bios, each of which can have multiple segments. -- Drivers which can't process a large bio in one shot can use the bi_idx +- Drivers which can't process a large bio in one shot can use the bi_iter field to keep track of the next bio_vec entry to process. (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE) [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying @@ -589,7 +588,7 @@ driver should not modify these values. The block layer sets up the nr_sectors and current_nr_sectors fields (based on the corresponding hard_xxx values and the number of bytes transferred) and updates it on every transfer that invokes end_that_request_first. It does the same for the -buffer, bio, bio->bi_idx fields too. +buffer, bio, bio->bi_iter fields too. The buffer field is just a virtual address mapping of the current segment of the i/o buffer in cases where the buffer resides in low-memory. 
For high diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 0721858fbd1..0a9d0b3c794 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -64,7 +64,7 @@ static void nfhd_make_request(struct request_queue *queue, struct bio *bio) struct nfhd_device *dev = queue->queuedata; struct bio_vec *bvec; int i, dir, len, shift; - sector_t sec = bio->bi_sector; + sector_t sec = bio->bi_iter.bi_sector; dir = bio_data_dir(bio); shift = dev->bshift; diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 1c16141c031..f33bcbaa6a0 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -113,7 +113,8 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) unsigned int transfered; unsigned short idx; - phys_mem = bank->io_addr + (bio->bi_sector << AXON_RAM_SECTOR_SHIFT); + phys_mem = bank->io_addr + (bio->bi_iter.bi_sector << + AXON_RAM_SECTOR_SHIFT); phys_end = bank->io_addr + bank->size; transfered = 0; bio_for_each_segment(vec, bio, idx) { diff --git a/block/blk-core.c b/block/blk-core.c index 8bdd0121212..5c2ab2c7406 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -130,7 +130,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) bio_endio(bio, error); } @@ -1326,7 +1326,7 @@ void blk_add_request_payload(struct request *rq, struct page *page, bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = len; - bio->bi_size = len; + bio->bi_iter.bi_size = len; bio->bi_vcnt = 1; bio->bi_phys_segments = 1; @@ -1351,7 +1351,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = bio; req->biotail = bio; - req->__data_len += bio->bi_size; + req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); blk_account_io_start(req, false); @@ -1380,8 +1380,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * not touch req->buffer either... */ req->buffer = bio_data(bio); - req->__sector = bio->bi_sector; - req->__data_len += bio->bi_size; + req->__sector = bio->bi_iter.bi_sector; + req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); blk_account_io_start(req, false); @@ -1459,7 +1459,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_FAILFAST_MASK; req->errors = 0; - req->__sector = bio->bi_sector; + req->__sector = bio->bi_iter.bi_sector; req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } @@ -1583,12 +1583,12 @@ static inline void blk_partition_remap(struct bio *bio) if (bio_sectors(bio) && bdev != bdev->bd_contains) { struct hd_struct *p = bdev->bd_part; - bio->bi_sector += p->start_sect; + bio->bi_iter.bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, bdev->bd_dev, - bio->bi_sector - p->start_sect); + bio->bi_iter.bi_sector - p->start_sect); } } @@ -1654,7 +1654,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) /* Test device or partition size, when known. 
*/ maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (maxsector) { - sector_t sector = bio->bi_sector; + sector_t sector = bio->bi_iter.bi_sector; if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { /* @@ -1690,7 +1690,7 @@ generic_make_request_checks(struct bio *bio) "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", bdevname(bio->bi_bdev, b), - (long long) bio->bi_sector); + (long long) bio->bi_iter.bi_sector); goto end_io; } @@ -1704,9 +1704,9 @@ generic_make_request_checks(struct bio *bio) } part = bio->bi_bdev->bd_part; - if (should_fail_request(part, bio->bi_size) || + if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, - bio->bi_size)) + bio->bi_iter.bi_size)) goto end_io; /* @@ -1865,7 +1865,7 @@ void submit_bio(int rw, struct bio *bio) if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { - task_io_account_read(bio->bi_size); + task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } @@ -1874,7 +1874,7 @@ void submit_bio(int rw, struct bio *bio) printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", - (unsigned long long)bio->bi_sector, + (unsigned long long)bio->bi_iter.bi_sector, bdevname(bio->bi_bdev, b), count); } @@ -2007,7 +2007,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) for (bio = rq->bio; bio; bio = bio->bi_next) { if ((bio->bi_rw & ff) != ff) break; - bytes += bio->bi_size; + bytes += bio->bi_iter.bi_size; } /* this could lead to infinite loop */ @@ -2378,9 +2378,9 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; - unsigned bio_bytes = min(bio->bi_size, nr_bytes); + unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - if (bio_bytes == bio->bi_size) + if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; req_bio_endio(req, bio, bio_bytes, error); @@ -2728,7 +2728,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->nr_phys_segments = bio_phys_segments(q, bio); rq->buffer = bio_data(bio); } - rq->__data_len = bio->bi_size; + rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; if (bio->bi_bdev) diff --git a/block/blk-flush.c b/block/blk-flush.c index fb6f3c0ffa4..9288aaf35c2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -548,7 +548,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, * copied from blk_rq_pos(rq). 
*/ if (error_sector) - *error_sector = bio->bi_sector; + *error_sector = bio->bi_iter.bi_sector; bio_put(bio); return ret; diff --git a/block/blk-lib.c b/block/blk-lib.c index 9b5b561cb92..2da76c999ef 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -108,12 +108,12 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, req_sects = end_sect - sector; } - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_batch_end_io; bio->bi_bdev = bdev; bio->bi_private = &bb; - bio->bi_size = req_sects << 9; + bio->bi_iter.bi_size = req_sects << 9; nr_sects -= req_sects; sector = end_sect; @@ -174,7 +174,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, break; } - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_batch_end_io; bio->bi_bdev = bdev; bio->bi_private = &bb; @@ -184,11 +184,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); if (nr_sects > max_write_same_sectors) { - bio->bi_size = max_write_same_sectors << 9; + bio->bi_iter.bi_size = max_write_same_sectors << 9; nr_sects -= max_write_same_sectors; sector += max_write_same_sectors; } else { - bio->bi_size = nr_sects << 9; + bio->bi_iter.bi_size = nr_sects << 9; nr_sects = 0; } @@ -240,7 +240,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, break; } - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio->bi_end_io = bio_batch_end_io; bio->bi_private = &bb; diff --git a/block/blk-map.c b/block/blk-map.c index 623e1cd4cff..ae4ae1047fd 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -20,7 +20,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, rq->biotail->bi_next = bio; rq->biotail = bio; - rq->__data_len += bio->bi_size; + rq->__data_len += bio->bi_iter.bi_size; } return 0; } @@ -76,7 +76,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, ret = blk_rq_append_bio(q, rq, bio); if (!ret) - return bio->bi_size; + return bio->bi_iter.bi_size; /* if it was boucned we must call the end io function */ bio_endio(bio, 0); @@ -220,7 +220,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (IS_ERR(bio)) return PTR_ERR(bio); - if (bio->bi_size != len) { + if (bio->bi_iter.bi_size != len) { /* * Grab an extra reference to this bio, as bio_unmap_user() * expects to be able to drop it twice as it happens on the diff --git a/block/blk-merge.c b/block/blk-merge.c index 1ffc5897783..03bc083c28c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -543,9 +543,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) int blk_try_merge(struct request *rq, struct bio *bio) { - if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector) + if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) return ELEVATOR_BACK_MERGE; - else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector) + else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) return ELEVATOR_FRONT_MERGE; return ELEVATOR_NO_MERGE; } diff --git a/block/blk-mq.c b/block/blk-mq.c index cdc629cf075..e4fbcc3fd2d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -301,7 +301,7 @@ void blk_mq_complete_request(struct request *rq, int error) struct bio *next = bio->bi_next; bio->bi_next = NULL; - bytes += bio->bi_size; + bytes += bio->bi_iter.bi_size; blk_mq_bio_endio(rq, bio, error); bio = next; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c 
index 06534049afb..20f82003777 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -877,14 +877,14 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, do_div(tmp, HZ); bytes_allowed = tmp; - if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { if (wait) *wait = 0; return 1; } /* Calc approx time to dispatch */ - extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); if (!jiffy_wait) @@ -987,7 +987,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) bool rw = bio_data_dir(bio); /* Charge the bio to the group */ - tg->bytes_disp[rw] += bio->bi_size; + tg->bytes_disp[rw] += bio->bi_iter.bi_size; tg->io_disp[rw]++; /* @@ -1003,8 +1003,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) */ if (!(bio->bi_rw & REQ_THROTTLED)) { bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, - bio->bi_rw); + throtl_update_dispatch_stats(tg_to_blkg(tg), + bio->bi_iter.bi_size, bio->bi_rw); } } @@ -1508,7 +1508,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (tg) { if (!tg->has_rules[rw]) { throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_size, bio->bi_rw); + bio->bi_iter.bi_size, bio->bi_rw); goto out_unlock_rcu; } } @@ -1564,7 +1564,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', - tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], tg->io_disp[rw], tg->iops[rw], sq->nr_queued[READ], sq->nr_queued[WRITE]); diff --git a/block/elevator.c b/block/elevator.c index b7ff2861b6b..42c45a7d671 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -440,7 +440,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) /* * See if our hash lookup can find a potential backmerge. */ - __rq = elv_rqhash_find(q, bio->bi_sector); + __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_rq_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_BACK_MERGE; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index d2515435e23..877ba119b3f 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -929,8 +929,8 @@ bufinit(struct buf *buf, struct request *rq, struct bio *bio) memset(buf, 0, sizeof(*buf)); buf->rq = rq; buf->bio = bio; - buf->resid = bio->bi_size; - buf->sector = bio->bi_sector; + buf->resid = bio->bi_iter.bi_size; + buf->sector = bio->bi_iter.bi_sector; bio_pageinc(bio); buf->bv = bio_iovec(bio); buf->bv_resid = buf->bv->bv_len; @@ -1152,7 +1152,7 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) do { bio = rq->bio; bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags); - } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size)); + } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size)); /* cf. 
http://lkml.org/lkml/2006/10/31/28 */ if (!fastfail) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index d91f1a56e86..66f5aaae15a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -333,13 +333,13 @@ static void brd_make_request(struct request_queue *q, struct bio *bio) int i; int err = -EIO; - sector = bio->bi_sector; + sector = bio->bi_iter.bi_sector; if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) goto out; if (unlikely(bio->bi_rw & REQ_DISCARD)) { err = 0; - discard_from_brd(brd, sector, bio->bi_size); + discard_from_brd(brd, sector, bio->bi_iter.bi_size); goto out; } diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 28c73ca320a..a9b13f2cc42 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -159,7 +159,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; err = -EIO; if (bio_add_page(bio, page, size, 0) != size) goto out; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index b12c11ec4bd..597f111df67 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1028,7 +1028,7 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must } else page = b->bm_pages[page_nr]; bio->bi_bdev = mdev->ldev->md_bdev; - bio->bi_sector = on_disk_sector; + bio->bi_iter.bi_sector = on_disk_sector; /* bio_add_page of a single page to an empty bio will always succeed, * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6fa6673b36b..5326c22cdb9 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1333,7 +1333,7 @@ next_bio: goto fail; } /* > peer_req->i.sector, unless this is the first bio */ - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = mdev->ldev->backing_bdev; bio->bi_rw = rw; bio->bi_private = peer_req; @@ -1353,7 +1353,7 @@ next_bio: dev_err(DEV, "bio_add_page failed for len=%u, " "bi_vcnt=0 (bi_sector=%llu)\n", - len, (unsigned long long)bio->bi_sector); + len, (uint64_t)bio->bi_iter.bi_sector); err = -ENOSPC; goto fail; } @@ -1615,7 +1615,7 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, mdev->recv_cnt += data_size>>9; bio = req->master_bio; - D_ASSERT(sector == bio->bi_sector); + D_ASSERT(sector == bio->bi_iter.bi_sector); bio_for_each_segment(bvec, bio, i) { void *mapped = kmap(bvec->bv_page) + bvec->bv_offset; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index fec7bef4499..104a040f24d 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -77,8 +77,8 @@ static struct drbd_request *drbd_req_new(struct drbd_conf *mdev, req->epoch = 0; drbd_clear_interval(&req->i); - req->i.sector = bio_src->bi_sector; - req->i.size = bio_src->bi_size; + req->i.sector = bio_src->bi_iter.bi_sector; + req->i.size = bio_src->bi_iter.bi_size; req->i.local = true; req->i.waiting = false; @@ -1280,7 +1280,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) /* * what we "blindly" assume: */ - D_ASSERT(IS_ALIGNED(bio->bi_size, 512)); + D_ASSERT(IS_ALIGNED(bio->bi_iter.bi_size, 512)); inc_ap_bio(mdev); __drbd_make_request(mdev, bio, start_time); diff --git a/drivers/block/drbd/drbd_req.h 
b/drivers/block/drbd/drbd_req.h index 978cb1addc9..28e15d91197 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -269,7 +269,7 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi /* Short lived temporary struct on the stack. * We could squirrel the error to be returned into - * bio->bi_size, or similar. But that would be too ugly. */ + * bio->bi_iter.bi_size, or similar. But that would be too ugly. */ struct bio_and_error { struct bio *bio; int error; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 000abe2f105..6a86fe7b730 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3775,9 +3775,9 @@ static int __floppy_read_block_0(struct block_device *bdev) bio_vec.bv_len = size; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_size = size; + bio.bi_iter.bi_size = size; bio.bi_bdev = bdev; - bio.bi_sector = 0; + bio.bi_iter.bi_sector = 0; bio.bi_flags = (1 << BIO_QUIET); init_completion(&complete); bio.bi_private = &complete; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c8dac730524..f5e39989add 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -415,7 +415,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) loff_t pos; int ret; - pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; + pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset; if (bio_rw(bio) == WRITE) { struct file *file = lo->lo_backing_file; @@ -444,7 +444,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) goto out; } ret = file->f_op->fallocate(file, mode, pos, - bio->bi_size); + bio->bi_iter.bi_size); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) ret = -EIO; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 050c71267f1..69e9eb5a6b3 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3993,7 +3993,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) } if (unlikely(bio->bi_rw & REQ_DISCARD)) { - bio_endio(bio, mtip_send_trim(dd, bio->bi_sector, + bio_endio(bio, mtip_send_trim(dd, bio->bi_iter.bi_sector, bio_sectors(bio))); return; } @@ -4006,7 +4006,8 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 && dd->unal_qdepth) { - if (bio->bi_sector % 8 != 0) /* Unaligned on 4k boundaries */ + if (bio->bi_iter.bi_sector % 8 != 0) + /* Unaligned on 4k boundaries */ unaligned = 1; else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */ unaligned = 1; @@ -4035,7 +4036,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) /* Issue the read/write. 
*/ mtip_hw_submit_io(dd, - bio->bi_sector, + bio->bi_iter.bi_sector, bio_sectors(bio), nents, tag, diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 26d03fa0bf2..53d21738187 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -468,7 +468,7 @@ static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx, { struct nvme_bio_pair *bp; - BUG_ON(len > bio->bi_size); + BUG_ON(len > bio->bi_iter.bi_size); BUG_ON(idx > bio->bi_vcnt); bp = kmalloc(sizeof(*bp), GFP_ATOMIC); @@ -479,11 +479,11 @@ static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx, bp->b1 = *bio; bp->b2 = *bio; - bp->b1.bi_size = len; - bp->b2.bi_size -= len; + bp->b1.bi_iter.bi_size = len; + bp->b2.bi_iter.bi_size -= len; bp->b1.bi_vcnt = idx; - bp->b2.bi_idx = idx; - bp->b2.bi_sector += len >> 9; + bp->b2.bi_iter.bi_idx = idx; + bp->b2.bi_iter.bi_sector += len >> 9; if (offset) { bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), @@ -552,11 +552,12 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, { struct bio_vec *bvec, *bvprv = NULL; struct scatterlist *sg = NULL; - int i, length = 0, nsegs = 0, split_len = bio->bi_size; + int i, length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; if (nvmeq->dev->stripe_size) split_len = nvmeq->dev->stripe_size - - ((bio->bi_sector << 9) & (nvmeq->dev->stripe_size - 1)); + ((bio->bi_iter.bi_sector << 9) & + (nvmeq->dev->stripe_size - 1)); sg_init_table(iod->sg, psegs); bio_for_each_segment(bvec, bio, i) { @@ -584,7 +585,7 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) return -ENOMEM; - BUG_ON(length != bio->bi_size); + BUG_ON(length != bio->bi_iter.bi_size); return length; } @@ -608,8 +609,8 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, iod->npages = 0; range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); + range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; @@ -674,7 +675,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, } result = -ENOMEM; - iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); if (!iod) goto nomem; iod->private = bio; @@ -723,7 +724,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.nsid = cpu_to_le32(ns->ns_id); length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, GFP_ATOMIC); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ff8668c5efb..ce986bacf7b 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -651,7 +651,7 @@ static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s for (;;) { tmp = rb_entry(n, struct pkt_rb_node, rb_node); - if (s <= tmp->bio->bi_sector) + if (s <= tmp->bio->bi_iter.bi_sector) next = n->rb_left; else next = n->rb_right; @@ -660,12 +660,12 @@ static struct pkt_rb_node 
*pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s n = next; } - if (s > tmp->bio->bi_sector) { + if (s > tmp->bio->bi_iter.bi_sector) { tmp = pkt_rbtree_next(tmp); if (!tmp) return NULL; } - BUG_ON(s > tmp->bio->bi_sector); + BUG_ON(s > tmp->bio->bi_iter.bi_sector); return tmp; } @@ -676,13 +676,13 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod { struct rb_node **p = &pd->bio_queue.rb_node; struct rb_node *parent = NULL; - sector_t s = node->bio->bi_sector; + sector_t s = node->bio->bi_iter.bi_sector; struct pkt_rb_node *tmp; while (*p) { parent = *p; tmp = rb_entry(parent, struct pkt_rb_node, rb_node); - if (s < tmp->bio->bi_sector) + if (s < tmp->bio->bi_iter.bi_sector) p = &(*p)->rb_left; else p = &(*p)->rb_right; @@ -857,7 +857,8 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) spin_lock(&pd->iosched.lock); bio = bio_list_peek(&pd->iosched.write_queue); spin_unlock(&pd->iosched.lock); - if (bio && (bio->bi_sector == pd->iosched.last_write)) + if (bio && (bio->bi_iter.bi_sector == + pd->iosched.last_write)) need_write_seek = 0; if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { @@ -888,7 +889,8 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) continue; if (bio_data_dir(bio) == READ) - pd->iosched.successive_reads += bio->bi_size >> 10; + pd->iosched.successive_reads += + bio->bi_iter.bi_size >> 10; else { pd->iosched.successive_reads = 0; pd->iosched.last_write = bio_end_sector(bio); @@ -978,7 +980,7 @@ static void pkt_end_io_read(struct bio *bio, int err) pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, - (unsigned long long)bio->bi_sector, err); + (unsigned long long)bio->bi_iter.bi_sector, err); if (err) atomic_inc(&pkt->io_errors); @@ -1026,8 +1028,9 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) memset(written, 0, sizeof(written)); spin_lock(&pkt->lock); bio_list_for_each(bio, &pkt->orig_bios) { - int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); - int num_frames = bio->bi_size / CD_FRAMESIZE; + int first_frame = (bio->bi_iter.bi_sector - pkt->sector) / + (CD_FRAMESIZE >> 9); + int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE; pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); BUG_ON(first_frame < 0); BUG_ON(first_frame + num_frames > pkt->frames); @@ -1053,7 +1056,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) bio = pkt->r_bios[f]; bio_reset(bio); - bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); + bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); bio->bi_bdev = pd->bdev; bio->bi_end_io = pkt_end_io_read; bio->bi_private = pkt; @@ -1150,8 +1153,8 @@ static int pkt_start_recovery(struct packet_data *pkt) bio_reset(pkt->bio); pkt->bio->bi_bdev = pd->bdev; pkt->bio->bi_rw = REQ_WRITE; - pkt->bio->bi_sector = new_sector; - pkt->bio->bi_size = pkt->frames * CD_FRAMESIZE; + pkt->bio->bi_iter.bi_sector = new_sector; + pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE; pkt->bio->bi_vcnt = pkt->frames; pkt->bio->bi_end_io = pkt_end_io_packet_write; @@ -1213,7 +1216,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) node = first_node; while (node) { bio = node->bio; - zone = get_zone(bio->bi_sector, pd); + zone = get_zone(bio->bi_iter.bi_sector, pd); list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { if (p->sector == zone) { bio = NULL; @@ -1252,14 +1255,14 @@ try_next_bio: pkt_dbg(2, pd, 
"looking for zone %llx\n", (unsigned long long)zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { bio = node->bio; - pkt_dbg(2, pd, "found zone=%llx\n", - (unsigned long long)get_zone(bio->bi_sector, pd)); - if (get_zone(bio->bi_sector, pd) != zone) + pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long) + get_zone(bio->bi_iter.bi_sector, pd)); + if (get_zone(bio->bi_iter.bi_sector, pd) != zone) break; pkt_rbtree_erase(pd, node); spin_lock(&pkt->lock); bio_list_add(&pkt->orig_bios, bio); - pkt->write_size += bio->bi_size / CD_FRAMESIZE; + pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE; spin_unlock(&pkt->lock); } /* check write congestion marks, and if bio_queue_size is @@ -1293,7 +1296,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) struct bio_vec *bvec = pkt->w_bio->bi_io_vec; bio_reset(pkt->w_bio); - pkt->w_bio->bi_sector = pkt->sector; + pkt->w_bio->bi_iter.bi_sector = pkt->sector; pkt->w_bio->bi_bdev = pd->bdev; pkt->w_bio->bi_end_io = pkt_end_io_packet_write; pkt->w_bio->bi_private = pkt; @@ -2370,20 +2373,20 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) if (!test_bit(PACKET_WRITABLE, &pd->flags)) { pkt_notice(pd, "WRITE for ro device (%llu)\n", - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); goto end_io; } - if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { + if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { pkt_err(pd, "wrong bio size\n"); goto end_io; } blk_queue_bounce(q, &bio); - zone = get_zone(bio->bi_sector, pd); + zone = get_zone(bio->bi_iter.bi_sector, pd); pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", - (unsigned long long)bio->bi_sector, + (unsigned long long)bio->bi_iter.bi_sector, (unsigned long long)bio_end_sector(bio)); /* Check if we have to split the bio */ @@ -2395,7 +2398,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) last_zone = get_zone(bio_end_sector(bio) - 1, pd); if (last_zone != zone) { BUG_ON(last_zone != zone + pd->settings.size); - first_sectors = last_zone - bio->bi_sector; + first_sectors = last_zone - bio->bi_iter.bi_sector; bp = bio_split(bio, first_sectors); BUG_ON(!bp); pkt_make_request(q, &bp->bio1); @@ -2417,7 +2420,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) if ((pkt->state == PACKET_WAITING_STATE) || (pkt->state == PACKET_READ_WAIT_STATE)) { bio_list_add(&pkt->orig_bios, bio); - pkt->write_size += bio->bi_size / CD_FRAMESIZE; + pkt->write_size += + bio->bi_iter.bi_size / CD_FRAMESIZE; if ((pkt->write_size >= pkt->frames) && (pkt->state == PACKET_WAITING_STATE)) { atomic_inc(&pkt->run_sm); diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index d754a88d758..464be78a083 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -104,7 +104,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u segs %u sectors from %lu\n", __func__, __LINE__, i, bio_segments(iter.bio), - bio_sectors(iter.bio), iter.bio->bi_sector); + bio_sectors(iter.bio), iter.bio->bi_iter.bi_sector); size = bvec->bv_len; buf = bvec_kmap_irq(bvec, &flags); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 06a2e53e5f3..320bbfc9b90 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -553,7 +553,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); int write = bio_data_dir(bio) == 
WRITE; const char *op = write ? "write" : "read"; - loff_t offset = bio->bi_sector << 9; + loff_t offset = bio->bi_iter.bi_sector << 9; int error = 0; struct bio_vec *bvec; unsigned int i; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cb1db2979d3..a8f4fe2d4d1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1183,14 +1183,14 @@ static struct bio *bio_clone_range(struct bio *bio_src, /* Handle the easy case for the caller */ - if (!offset && len == bio_src->bi_size) + if (!offset && len == bio_src->bi_iter.bi_size) return bio_clone(bio_src, gfpmask); if (WARN_ON_ONCE(!len)) return NULL; - if (WARN_ON_ONCE(len > bio_src->bi_size)) + if (WARN_ON_ONCE(len > bio_src->bi_iter.bi_size)) return NULL; - if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) + if (WARN_ON_ONCE(offset > bio_src->bi_iter.bi_size - len)) return NULL; /* Find first affected segment... */ @@ -1220,7 +1220,8 @@ static struct bio *bio_clone_range(struct bio *bio_src, return NULL; /* ENOMEM */ bio->bi_bdev = bio_src->bi_bdev; - bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector + + (offset >> SECTOR_SHIFT); bio->bi_rw = bio_src->bi_rw; bio->bi_flags |= 1 << BIO_CLONED; @@ -1239,8 +1240,7 @@ static struct bio *bio_clone_range(struct bio *bio_src, } bio->bi_vcnt = vcnt; - bio->bi_size = len; - bio->bi_idx = 0; + bio->bi_iter.bi_size = len; return bio; } @@ -1271,7 +1271,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, /* Build up a chain of clone bios up to the limit */ - if (!bi || off >= bi->bi_size || !len) + if (!bi || off >= bi->bi_iter.bi_size || !len) return NULL; /* Nothing to clone */ end = &chain; @@ -1283,7 +1283,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, rbd_warn(NULL, "bio_chain exhausted with %u left", len); goto out_err; /* EINVAL; ran out of bio's */ } - bi_size = min_t(unsigned int, bi->bi_size - off, len); + bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); bio = bio_clone_range(bi, off, bi_size, gfpmask); if (!bio) goto out_err; /* ENOMEM */ @@ -1292,7 +1292,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, end = &bio->bi_next; off += bi_size; - if (off == bi->bi_size) { + if (off == bi->bi_iter.bi_size) { bi = bi->bi_next; off = 0; } @@ -2186,7 +2186,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, if (type == OBJ_REQUEST_BIO) { bio_list = data_desc; - rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); + rbd_assert(img_offset == + bio_list->bi_iter.bi_sector << SECTOR_SHIFT); } else { rbd_assert(type == OBJ_REQUEST_PAGES); pages = data_desc; diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 2284f5d3a54..2839d37e5af 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -174,7 +174,7 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) if (!card) goto req_err; - if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(card->gendisk)) + if (bio_end_sector(bio) > get_capacity(card->gendisk)) goto req_err; if (unlikely(card->halt)) { @@ -187,7 +187,7 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) goto req_err; } - if (bio->bi_size == 0) { + if (bio->bi_iter.bi_size == 0) { dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); goto req_err; } @@ -208,7 +208,7 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", bio_data_dir(bio) ? 
'W' : 'R', bio_meta, - (u64)bio->bi_sector << 9, bio->bi_size); + (u64)bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size); st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas, bio_dma_done_cb, bio_meta); diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index fc88ba3e1bd..3716633be3c 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -696,7 +696,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, int st; int i; - addr8 = bio->bi_sector << 9; /* sectors are 512 bytes */ + addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */ atomic_set(n_dmas, 0); for (i = 0; i < card->n_targets; i++) { @@ -705,7 +705,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, } if (bio->bi_rw & REQ_DISCARD) { - bv_len = bio->bi_size; + bv_len = bio->bi_iter.bi_size; while (bv_len > 0) { tgt = rsxx_get_dma_tgt(card, addr8); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index ad70868f8a9..dab4f1afeae 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -352,8 +352,8 @@ static int add_bio(struct cardinfo *card) bio = card->currentbio; if (!bio && card->bio) { card->currentbio = card->bio; - card->current_idx = card->bio->bi_idx; - card->current_sector = card->bio->bi_sector; + card->current_idx = card->bio->bi_iter.bi_idx; + card->current_sector = card->bio->bi_iter.bi_sector; card->bio = card->bio->bi_next; if (card->bio == NULL) card->biotail = &card->bio; @@ -451,7 +451,7 @@ static void process_page(unsigned long data) if (page->idx >= bio->bi_vcnt) { page->bio = bio->bi_next; if (page->bio) - page->idx = page->bio->bi_idx; + page->idx = page->bio->bi_iter.bi_idx; } pci_unmap_page(card->dev, desc->data_dma_handle, @@ -532,7 +532,8 @@ static void mm_make_request(struct request_queue *q, struct bio *bio) { struct cardinfo *card = q->queuedata; pr_debug("mm_make_request %llu %u\n", - (unsigned long long)bio->bi_sector, bio->bi_size); + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); spin_lock_irq(&card->lock); *card->biotail = bio; diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 6620b73d049..4b97b86da92 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1257,7 +1257,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, bio->bi_bdev = preq.bdev; bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; - bio->bi_sector = preq.sector_number; + bio->bi_iter.bi_sector = preq.sector_number; } preq.sector_number += seg[i].nsec; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 432db1b59b0..80e86307dd4 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1547,7 +1547,7 @@ static int blkif_recover(struct blkfront_info *info) for (i = 0; i < pending; i++) { offset = (i * segs * PAGE_SIZE) >> 9; size = min((unsigned int)(segs * PAGE_SIZE) >> 9, - (unsigned int)(bio->bi_size >> 9) - offset); + (unsigned int)bio_sectors(bio) - offset); cloned_bio = bio_clone(bio, GFP_NOIO); BUG_ON(cloned_bio == NULL); bio_trim(cloned_bio, offset, size); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5e2765aadce..038a6d2aced 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -299,7 +299,7 @@ void bch_btree_node_read(struct btree *b) bio = bch_bbio_alloc(b->c); bio->bi_rw = REQ_META|READ_SYNC; - bio->bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; bio->bi_end_io = btree_node_read_endio; bio->bi_private = 
&cl; @@ -395,7 +395,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; - b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); /* diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 264fcfbd629..92b3fd468a0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -195,7 +195,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) dc->disk.c, "verify failed at dev %s sector %llu", bdevname(dc->bdev, name), - (uint64_t) bio->bi_sector); + (uint64_t) bio->bi_iter.bi_sector); kunmap_atomic(p1); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 9056632995b..cc4ba2da5fb 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -21,18 +21,18 @@ static void bch_bi_idx_hack_endio(struct bio *bio, int error) static void bch_generic_make_request_hack(struct bio *bio) { - if (bio->bi_idx) { + if (bio->bi_iter.bi_idx) { struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); memcpy(clone->bi_io_vec, bio_iovec(bio), bio_segments(bio) * sizeof(struct bio_vec)); - clone->bi_sector = bio->bi_sector; + clone->bi_iter.bi_sector = bio->bi_iter.bi_sector; clone->bi_bdev = bio->bi_bdev; clone->bi_rw = bio->bi_rw; clone->bi_vcnt = bio_segments(bio); - clone->bi_size = bio->bi_size; + clone->bi_iter.bi_size = bio->bi_iter.bi_size; clone->bi_private = bio; clone->bi_end_io = bch_bi_idx_hack_endio; @@ -72,7 +72,7 @@ static void bch_generic_make_request_hack(struct bio *bio) struct bio *bch_bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { - unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; + unsigned idx = bio->bi_iter.bi_idx, vcnt = 0, nbytes = sectors << 9; struct bio_vec *bv; struct bio *ret = NULL; @@ -90,7 +90,7 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, } bio_for_each_segment(bv, bio, idx) { - vcnt = idx - bio->bi_idx; + vcnt = idx - bio->bi_iter.bi_idx; if (!nbytes) { ret = bio_alloc_bioset(gfp, vcnt, bs); @@ -119,15 +119,15 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, } out: ret->bi_bdev = bio->bi_bdev; - ret->bi_sector = bio->bi_sector; - ret->bi_size = sectors << 9; + ret->bi_iter.bi_sector = bio->bi_iter.bi_sector; + ret->bi_iter.bi_size = sectors << 9; ret->bi_rw = bio->bi_rw; ret->bi_vcnt = vcnt; ret->bi_max_vecs = vcnt; - bio->bi_sector += sectors; - bio->bi_size -= sectors << 9; - bio->bi_idx = idx; + bio->bi_iter.bi_sector += sectors; + bio->bi_iter.bi_size -= sectors << 9; + bio->bi_iter.bi_idx = idx; if (bio_integrity(bio)) { if (bio_integrity_clone(ret, bio, gfp)) { @@ -162,7 +162,7 @@ static unsigned bch_bio_max_sectors(struct bio *bio) bio_for_each_segment(bv, bio, i) { struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, + .bi_sector = bio->bi_iter.bi_sector, .bi_size = ret << 9, .bi_rw = bio->bi_rw, }; @@ -272,8 +272,8 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) { struct bbio *b = container_of(bio, struct bbio, bio); - bio->bi_sector = PTR_OFFSET(&b->key, 0); - bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; b->submit_time_us = local_clock_us(); closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 
ecdaa671bd5..7eafdf09a0a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -51,10 +51,10 @@ reread: left = ca->sb.bucket_size - offset; len = min_t(unsigned, left, PAGE_SECTORS * 8); bio_reset(bio); - bio->bi_sector = bucket + offset; + bio->bi_iter.bi_sector = bucket + offset; bio->bi_bdev = ca->bdev; bio->bi_rw = READ; - bio->bi_size = len << 9; + bio->bi_iter.bi_size = len << 9; bio->bi_end_io = journal_read_endio; bio->bi_private = &cl; @@ -437,13 +437,13 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); bio_init(bio); - bio->bi_sector = bucket_to_sector(ca->set, + bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_bdev = ca->bdev; bio->bi_rw = REQ_WRITE|REQ_DISCARD; bio->bi_max_vecs = 1; bio->bi_io_vec = bio->bi_inline_vecs; - bio->bi_size = bucket_bytes(ca); + bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = journal_discard_endio; closure_get(&ca->set->cl); @@ -608,10 +608,10 @@ static void journal_write_unlocked(struct closure *cl) atomic_long_add(sectors, &ca->meta_sectors_written); bio_reset(bio); - bio->bi_sector = PTR_OFFSET(k, i); + bio->bi_iter.bi_sector = PTR_OFFSET(k, i); bio->bi_bdev = ca->bdev; bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; - bio->bi_size = sectors << 9; + bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = w; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7c1275e6602..581f95df826 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -82,7 +82,7 @@ static void moving_init(struct moving_io *io) bio_get(bio); bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - bio->bi_size = KEY_SIZE(&io->w->key) << 9; + bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS); bio->bi_private = &io->cl; @@ -98,7 +98,7 @@ static void write_moving(struct closure *cl) if (!op->error) { moving_init(io); - io->bio.bio.bi_sector = KEY_START(&io->w->key); + io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); op->write_prio = 1; op->bio = &io->bio.bio; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 78bab4154e9..47a9bbc7512 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -261,7 +261,7 @@ static void bch_data_invalidate(struct closure *cl) struct bio *bio = op->bio; pr_debug("invalidating %i sectors from %llu", - bio_sectors(bio), (uint64_t) bio->bi_sector); + bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); while (bio_sectors(bio)) { unsigned sectors = min(bio_sectors(bio), @@ -270,11 +270,11 @@ static void bch_data_invalidate(struct closure *cl) if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) goto out; - bio->bi_sector += sectors; - bio->bi_size -= sectors << 9; + bio->bi_iter.bi_sector += sectors; + bio->bi_iter.bi_size -= sectors << 9; bch_keylist_add(&op->insert_keys, - &KEY(op->inode, bio->bi_sector, sectors)); + &KEY(op->inode, bio->bi_iter.bi_sector, sectors)); } op->insert_data_done = true; @@ -364,7 +364,7 @@ static void bch_data_insert_start(struct closure *cl) k = op->insert_keys.top; bkey_init(k); SET_KEY_INODE(k, op->inode); - SET_KEY_OFFSET(k, bio->bi_sector); + SET_KEY_OFFSET(k, bio->bi_iter.bi_sector); if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), op->write_point, op->write_prio, @@ -522,7 +522,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) 
(bio->bi_rw & REQ_WRITE))) goto skip; - if (bio->bi_sector & (c->sb.block_size - 1) || + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { pr_debug("skipping unaligned io"); goto skip; @@ -546,8 +546,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) spin_lock(&dc->io_lock); - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && + hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) + if (i->last == bio->bi_iter.bi_sector && time_before(jiffies, i->jiffies)) goto found; @@ -556,8 +556,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) add_sequential(task); i->sequential = 0; found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; + if (i->sequential + bio->bi_iter.bi_size > i->sequential) + i->sequential += bio->bi_iter.bi_size; i->last = bio_end_sector(bio); i->jiffies = jiffies + msecs_to_jiffies(5000); @@ -650,15 +650,15 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) struct bkey *bio_key; unsigned ptr; - if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) return MAP_CONTINUE; if (KEY_INODE(k) != s->iop.inode || - KEY_START(k) > bio->bi_sector) { + KEY_START(k) > bio->bi_iter.bi_sector) { unsigned bio_sectors = bio_sectors(bio); unsigned sectors = KEY_INODE(k) == s->iop.inode ? min_t(uint64_t, INT_MAX, - KEY_START(k) - bio->bi_sector) + KEY_START(k) - bio->bi_iter.bi_sector) : INT_MAX; int ret = s->d->cache_miss(b, s, bio, sectors); @@ -681,13 +681,13 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) s->read_dirty_data = true; n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_sector), + KEY_OFFSET(k) - bio->bi_iter.bi_sector), GFP_NOIO, s->d->bio_split); bio_key = &container_of(n, struct bbio, bio)->key; bch_bkey_copy_single_ptr(bio_key, k, ptr); - bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); + bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); n->bi_end_io = bch_cache_read_endio; @@ -714,7 +714,7 @@ static void cache_lookup(struct closure *cl) struct bio *bio = &s->bio.bio; int ret = bch_btree_map_keys(&s->op, s->iop.c, - &KEY(s->iop.inode, bio->bi_sector, 0), + &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); @@ -872,9 +872,9 @@ static void cached_dev_read_done(struct closure *cl) if (s->iop.bio) { bio_reset(s->iop.bio); - s->iop.bio->bi_sector = s->cache_miss->bi_sector; + s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; - s->iop.bio->bi_size = s->insert_bio_sectors << 9; + s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL); bio_copy_data(s->cache_miss, s->iop.bio); @@ -937,7 +937,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); s->iop.replace_key = KEY(s->iop.inode, - bio->bi_sector + s->insert_bio_sectors, + bio->bi_iter.bi_sector + s->insert_bio_sectors, s->insert_bio_sectors); ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); @@ -957,9 +957,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (!cache_bio) goto 
out_submit; - cache_bio->bi_sector = miss->bi_sector; - cache_bio->bi_bdev = miss->bi_bdev; - cache_bio->bi_size = s->insert_bio_sectors << 9; + cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; + cache_bio->bi_bdev = miss->bi_bdev; + cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; cache_bio->bi_end_io = request_endio; cache_bio->bi_private = &s->cl; @@ -1009,7 +1009,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; - struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); + struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0); struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); @@ -1104,13 +1104,13 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) part_stat_unlock(); bio->bi_bdev = dc->bdev; - bio->bi_sector += dc->sb.data_offset; + bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { s = search_alloc(bio, d); trace_bcache_request_start(s->d, bio); - if (!bio->bi_size) { + if (!bio->bi_iter.bi_size) { /* * can't call bch_journal_meta from under * generic_make_request @@ -1197,9 +1197,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, sectors -= j; } - bio_advance(bio, min(sectors << 9, bio->bi_size)); + bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size)); - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) return MAP_DONE; return MAP_CONTINUE; @@ -1233,7 +1233,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) trace_bcache_request_start(s->d, bio); - if (!bio->bi_size) { + if (!bio->bi_iter.bi_size) { /* * can't call bch_journal_meta from under * generic_make_request @@ -1243,7 +1243,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) bcache_wq); } else if (rw) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, - &KEY(d->id, bio->bi_sector, 0), + &KEY(d->id, bio->bi_iter.bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1d9ee67d14e..60fb6044b95 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -233,9 +233,9 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); unsigned i; - bio->bi_sector = SB_SECTOR; - bio->bi_rw = REQ_SYNC|REQ_META; - bio->bi_size = SB_SIZE; + bio->bi_iter.bi_sector = SB_SECTOR; + bio->bi_rw = REQ_SYNC|REQ_META; + bio->bi_iter.bi_size = SB_SIZE; bch_bio_map(bio, NULL); out->offset = cpu_to_le64(sb->offset); @@ -347,7 +347,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, struct bio *bio = bch_bbio_alloc(c); bio->bi_rw = REQ_SYNC|REQ_META|rw; - bio->bi_size = KEY_SIZE(k) << 9; + bio->bi_iter.bi_size = KEY_SIZE(k) << 9; bio->bi_end_io = uuid_endio; bio->bi_private = cl; @@ -503,10 +503,10 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) closure_init_stack(cl); - bio->bi_sector = bucket * ca->sb.bucket_size; - bio->bi_bdev = ca->bdev; - bio->bi_rw = REQ_SYNC|REQ_META|rw; - bio->bi_size = bucket_bytes(ca); + bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_SYNC|REQ_META|rw; + bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = prio_endio; bio->bi_private = ca; diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c 
index 462214eeacb..c57621e49dc 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -218,10 +218,10 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) void bch_bio_map(struct bio *bio, void *base) { - size_t size = bio->bi_size; + size_t size = bio->bi_iter.bi_size; struct bio_vec *bv = bio->bi_io_vec; - BUG_ON(!bio->bi_size); + BUG_ON(!bio->bi_iter.bi_size); BUG_ON(bio->bi_vcnt); bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 99053b1251b..04657e93f4f 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -113,7 +113,7 @@ static void dirty_init(struct keybuf_key *w) if (!io->dc->writeback_percent) bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - bio->bi_size = KEY_SIZE(&w->key) << 9; + bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); bio->bi_private = w; bio->bi_io_vec = bio->bi_inline_vecs; @@ -186,7 +186,7 @@ static void write_dirty(struct closure *cl) dirty_init(w); io->bio.bi_rw = WRITE; - io->bio.bi_sector = KEY_START(&w->key); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); io->bio.bi_bdev = io->dc->bdev; io->bio.bi_end_io = dirty_endio; @@ -255,7 +255,7 @@ static void read_dirty(struct cached_dev *dc) io->dc = dc; dirty_init(w); - io->bio.bi_sector = PTR_OFFSET(&w->key, 0); + io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); io->bio.bi_bdev = PTR_CACHE(dc->disk.c, &w->key, 0)->bdev; io->bio.bi_rw = READ; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index c9ddcf4614b..e2f8598937a 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -50,7 +50,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, return false; if (dc->partial_stripes_expensive && - bcache_dev_stripe_dirty(dc, bio->bi_sector, + bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, bio_sectors(bio))) return true; diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index 3a8cfa2645c..5ace48ee9f5 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -40,10 +40,10 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { unsigned i; - bd->bi_sector = bio->bi_sector; + bd->bi_sector = bio->bi_iter.bi_sector; bd->bi_bdev = bio->bi_bdev; - bd->bi_size = bio->bi_size; - bd->bi_idx = bio->bi_idx; + bd->bi_size = bio->bi_iter.bi_size; + bd->bi_idx = bio->bi_iter.bi_idx; bd->bi_flags = bio->bi_flags; for (i = 0; i < bio->bi_vcnt; i++) { @@ -56,10 +56,10 @@ static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { unsigned i; - bio->bi_sector = bd->bi_sector; + bio->bi_iter.bi_sector = bd->bi_sector; bio->bi_bdev = bd->bi_bdev; - bio->bi_size = bd->bi_size; - bio->bi_idx = bd->bi_idx; + bio->bi_iter.bi_size = bd->bi_size; + bio->bi_iter.bi_idx = bd->bi_idx; bio->bi_flags = bd->bi_flags; for (i = 0; i < bio->bi_vcnt; i++) { diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 173cbb20d10..4113b6044b8 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -538,7 +538,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, bio_init(&b->bio); b->bio.bi_io_vec = b->bio_vec; b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; - b->bio.bi_sector = block << b->c->sectors_per_block_bits; + b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; b->bio.bi_bdev = b->c->bdev; b->bio.bi_end_io = 
end_io; diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 416b7b752a6..bfba97dcde2 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -72,7 +72,7 @@ static enum io_pattern iot_pattern(struct io_tracker *t) static void iot_update_stats(struct io_tracker *t, struct bio *bio) { - if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1) + if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) t->nr_seq_samples++; else { /* @@ -87,7 +87,7 @@ static void iot_update_stats(struct io_tracker *t, struct bio *bio) t->nr_rand_samples++; } - t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1); + t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); } static void iot_check_for_pattern_switch(struct io_tracker *t) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9efcf1059b9..86f9c83eb30 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -664,15 +664,17 @@ static void remap_to_origin(struct cache *cache, struct bio *bio) static void remap_to_cache(struct cache *cache, struct bio *bio, dm_cblock_t cblock) { - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; bio->bi_bdev = cache->cache_dev->bdev; if (!block_size_is_power_of_two(cache)) - bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + - sector_div(bi_sector, cache->sectors_per_block); + bio->bi_iter.bi_sector = + (from_cblock(cblock) * cache->sectors_per_block) + + sector_div(bi_sector, cache->sectors_per_block); else - bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | - (bi_sector & (cache->sectors_per_block - 1)); + bio->bi_iter.bi_sector = + (from_cblock(cblock) << cache->sectors_per_block_shift) | + (bi_sector & (cache->sectors_per_block - 1)); } static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) @@ -712,7 +714,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) { - sector_t block_nr = bio->bi_sector; + sector_t block_nr = bio->bi_iter.bi_sector; if (!block_size_is_power_of_two(cache)) (void) sector_div(block_nr, cache->sectors_per_block); @@ -1027,7 +1029,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) { return (bio_data_dir(bio) == WRITE) && - (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); } static void avoid_copy(struct dm_cache_migration *mg) @@ -1252,7 +1254,7 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - BUG_ON(bio->bi_size); + BUG_ON(bio->bi_iter.bi_size); if (!pb->req_nr) remap_to_origin(cache, bio); else @@ -1275,9 +1277,9 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) */ static void process_discard_bio(struct cache *cache, struct bio *bio) { - dm_block_t start_block = dm_sector_div_up(bio->bi_sector, + dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, cache->discard_block_size); - dm_block_t end_block = bio->bi_sector + bio_sectors(bio); + dm_block_t end_block = bio_end_sector(bio); dm_block_t b; end_block = block_div(end_block, cache->discard_block_size); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c 
index 81b0fa66045..1e2e5465d28 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -828,8 +828,8 @@ static void crypt_convert_init(struct crypt_config *cc, ctx->bio_out = bio_out; ctx->offset_in = 0; ctx->offset_out = 0; - ctx->idx_in = bio_in ? bio_in->bi_idx : 0; - ctx->idx_out = bio_out ? bio_out->bi_idx : 0; + ctx->idx_in = bio_in ? bio_in->bi_iter.bi_idx : 0; + ctx->idx_out = bio_out ? bio_out->bi_iter.bi_idx : 0; ctx->cc_sector = sector + cc->iv_offset; init_completion(&ctx->restart); } @@ -1021,7 +1021,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, size -= len; } - if (!clone->bi_size) { + if (!clone->bi_iter.bi_size) { bio_put(clone); return NULL; } @@ -1161,7 +1161,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) crypt_inc_pending(io); clone_init(io, clone); - clone->bi_sector = cc->start + io->sector; + clone->bi_iter.bi_sector = cc->start + io->sector; generic_make_request(clone); return 0; @@ -1209,7 +1209,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) /* crypt_convert should have filled the clone bio */ BUG_ON(io->ctx.idx_out < clone->bi_vcnt); - clone->bi_sector = cc->start + io->sector; + clone->bi_iter.bi_sector = cc->start + io->sector; if (async) kcryptd_queue_io(io); @@ -1224,7 +1224,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) struct dm_crypt_io *new_io; int crypt_finished; unsigned out_of_pages = 0; - unsigned remaining = io->base_bio->bi_size; + unsigned remaining = io->base_bio->bi_iter.bi_size; sector_t sector = io->sector; int r; @@ -1248,7 +1248,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) io->ctx.bio_out = clone; io->ctx.idx_out = 0; - remaining -= clone->bi_size; + remaining -= clone->bi_iter.bi_size; sector += bio_sectors(clone); crypt_inc_pending(io); @@ -1869,11 +1869,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { bio->bi_bdev = cc->dev->bdev; if (bio_sectors(bio)) - bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); + bio->bi_iter.bi_sector = cc->start + + dm_target_offset(ti, bio->bi_iter.bi_sector); return DM_MAPIO_REMAPPED; } - io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector)); + io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 496d5f3646a..84c860191a2 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -281,14 +281,15 @@ static int delay_map(struct dm_target *ti, struct bio *bio) if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { bio->bi_bdev = dc->dev_write->bdev; if (bio_sectors(bio)) - bio->bi_sector = dc->start_write + - dm_target_offset(ti, bio->bi_sector); + bio->bi_iter.bi_sector = dc->start_write + + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, dc->write_delay, bio); } bio->bi_bdev = dc->dev_read->bdev; - bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); + bio->bi_iter.bi_sector = dc->start_read + + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, dc->read_delay, bio); } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index c80a0ec5f12..b257e46876d 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -248,7 +248,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) bio->bi_bdev = fc->dev->bdev; if 
(bio_sectors(bio)) - bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); + bio->bi_iter.bi_sector = + flakey_map_sector(ti, bio->bi_iter.bi_sector); } static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) @@ -265,8 +266,8 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, - (bio_data_dir(bio) == WRITE) ? 'w' : 'r', - bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw, + (unsigned long long)bio->bi_iter.bi_sector, bio_bytes); } } diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2a20986a2fe..01558b09330 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -304,14 +304,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); - bio->bi_sector = where->sector + (where->count - remaining); + bio->bi_iter.bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; store_io_and_region_in_bio(bio, io, region); if (rw & REQ_DISCARD) { num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); - bio->bi_size = num_sectors << SECTOR_SHIFT; + bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; remaining -= num_sectors; } else if (rw & REQ_WRITE_SAME) { /* @@ -320,7 +320,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, dp->get_page(dp, &page, &len, &offset); bio_add_page(bio, page, logical_block_size, offset); num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining); - bio->bi_size = num_sectors << SECTOR_SHIFT; + bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; offset = 0; remaining -= num_sectors; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 4f99d267340..53e848c1093 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -85,7 +85,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) bio->bi_bdev = lc->dev->bdev; if (bio_sectors(bio)) - bio->bi_sector = linear_map_sector(ti, bio->bi_sector); + bio->bi_iter.bi_sector = + linear_map_sector(ti, bio->bi_iter.bi_sector); } static int linear_map(struct dm_target *ti, struct bio *bio) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9584443c561..9f6d8e6baa7 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -432,7 +432,7 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) region_t region = dm_rh_bio_to_region(ms->rh, bio); if (log->type->in_sync(log, region, 0)) - return choose_mirror(ms, bio->bi_sector) ? 1 : 0; + return choose_mirror(ms, bio->bi_iter.bi_sector) ? 
1 : 0; return 0; } @@ -442,15 +442,15 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) */ static sector_t map_sector(struct mirror *m, struct bio *bio) { - if (unlikely(!bio->bi_size)) + if (unlikely(!bio->bi_iter.bi_size)) return 0; - return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); + return m->offset + dm_target_offset(m->ms->ti, bio->bi_iter.bi_sector); } static void map_bio(struct mirror *m, struct bio *bio) { bio->bi_bdev = m->dev->bdev; - bio->bi_sector = map_sector(m, bio); + bio->bi_iter.bi_sector = map_sector(m, bio); } static void map_region(struct dm_io_region *io, struct mirror *m, @@ -527,7 +527,7 @@ static void read_async_bio(struct mirror *m, struct bio *bio) struct dm_io_request io_req = { .bi_rw = READ, .mem.type = DM_IO_BVEC, - .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_iter.bi_idx, .notify.fn = read_callback, .notify.context = bio, .client = m->ms->io_client, @@ -559,7 +559,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) * We can only read balance if the region is in sync. */ if (likely(region_in_sync(ms, region, 1))) - m = choose_mirror(ms, bio->bi_sector); + m = choose_mirror(ms, bio->bi_iter.bi_sector); else if (m && atomic_read(&m->error_count)) m = NULL; @@ -630,7 +630,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct dm_io_request io_req = { .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), .mem.type = DM_IO_BVEC, - .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_iter.bi_idx, .notify.fn = write_callback, .notify.context = bio, .client = ms->io_client, @@ -1181,7 +1181,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) * The region is in-sync and we can perform reads directly. * Store enough information so we can retry if it fails. */ - m = choose_mirror(ms, bio->bi_sector); + m = choose_mirror(ms, bio->bi_iter.bi_sector); if (unlikely(!m)) return -EIO; diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 69732e03eb3..b929fd5f498 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -126,7 +126,8 @@ EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) { - return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); + return dm_rh_sector_to_region(rh, bio->bi_iter.bi_sector - + rh->target_begin); } EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index aec57d76db5..3ded8c729df 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1562,11 +1562,10 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { bio->bi_bdev = s->cow->bdev; - bio->bi_sector = chunk_to_sector(s->store, - dm_chunk_number(e->new_chunk) + - (chunk - e->old_chunk)) + - (bio->bi_sector & - s->store->chunk_mask); + bio->bi_iter.bi_sector = + chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_iter.bi_sector & s->store->chunk_mask); } static int snapshot_map(struct dm_target *ti, struct bio *bio) @@ -1584,7 +1583,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } - chunk = sector_to_chunk(s->store, bio->bi_sector); + chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. 
*/ @@ -1645,7 +1644,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) r = DM_MAPIO_SUBMITTED; if (!pe->started && - bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { + bio->bi_iter.bi_size == + (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; up_write(&s->lock); start_full_bio(pe, bio); @@ -1701,7 +1701,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } - chunk = sector_to_chunk(s->store, bio->bi_sector); + chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); down_write(&s->lock); @@ -2038,7 +2038,7 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) down_read(&_origins_lock); o = __lookup_origin(origin->bdev); if (o) - r = __origin_write(&o->snapshots, bio->bi_sector, bio); + r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); up_read(&_origins_lock); return r; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 73c1712dad9..d1600d2aa2e 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -259,13 +259,15 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, { sector_t begin, end; - stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); + stripe_map_range_sector(sc, bio->bi_iter.bi_sector, + target_stripe, &begin); stripe_map_range_sector(sc, bio_end_sector(bio), target_stripe, &end); if (begin < end) { bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; - bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; - bio->bi_size = to_bytes(end - begin); + bio->bi_iter.bi_sector = begin + + sc->stripe[target_stripe].physical_start; + bio->bi_iter.bi_size = to_bytes(end - begin); return DM_MAPIO_REMAPPED; } else { /* The range doesn't map to the target stripe */ @@ -293,9 +295,10 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return stripe_map_range(sc, bio, target_bio_nr); } - stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); + stripe_map_sector(sc, bio->bi_iter.bi_sector, + &stripe, &bio->bi_iter.bi_sector); - bio->bi_sector += sc->stripe[stripe].physical_start; + bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start; bio->bi_bdev = sc->stripe[stripe].dev->bdev; return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index ff9ac4be472..09a688b3d48 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -311,11 +311,11 @@ error: static int switch_map(struct dm_target *ti, struct bio *bio) { struct switch_ctx *sctx = ti->private; - sector_t offset = dm_target_offset(ti, bio->bi_sector); + sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); unsigned path_nr = switch_get_path_nr(sctx, offset); bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; - bio->bi_sector = sctx->path_list[path_nr].start + offset; + bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; return DM_MAPIO_REMAPPED; } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2c0cf511ec2..a65402480c8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -413,7 +413,7 @@ static bool block_size_is_power_of_two(struct pool *pool) static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - sector_t block_nr = bio->bi_sector; + sector_t block_nr = bio->bi_iter.bi_sector; if (block_size_is_power_of_two(pool)) block_nr >>= pool->sectors_per_block_shift; @@ -426,14 +426,15 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) static void remap(struct thin_c *tc, struct bio *bio, 
dm_block_t block) { struct pool *pool = tc->pool; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; bio->bi_bdev = tc->pool_dev->bdev; if (block_size_is_power_of_two(pool)) - bio->bi_sector = (block << pool->sectors_per_block_shift) | - (bi_sector & (pool->sectors_per_block - 1)); + bio->bi_iter.bi_sector = + (block << pool->sectors_per_block_shift) | + (bi_sector & (pool->sectors_per_block - 1)); else - bio->bi_sector = (block * pool->sectors_per_block) + + bio->bi_iter.bi_sector = (block * pool->sectors_per_block) + sector_div(bi_sector, pool->sectors_per_block); } @@ -721,7 +722,8 @@ static void process_prepared(struct pool *pool, struct list_head *head, */ static int io_overlaps_block(struct pool *pool, struct bio *bio) { - return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); + return bio->bi_iter.bi_size == + (pool->sectors_per_block << SECTOR_SHIFT); } static int io_overwrites_block(struct pool *pool, struct bio *bio) @@ -1130,7 +1132,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, if (bio_detain(pool, &key, bio, &cell)) return; - if (bio_data_dir(bio) == WRITE && bio->bi_size) + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) break_sharing(tc, bio, block, &key, lookup_result, cell); else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -1153,7 +1155,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block /* * Remap empty bios (flushes) immediately, without provisioning. */ - if (!bio->bi_size) { + if (!bio->bi_iter.bi_size) { inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); @@ -1253,7 +1255,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared && (rw == WRITE) && bio->bi_size) + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) bio_io_error(bio); else { inc_all_io_entry(tc->pool, bio); @@ -2867,7 +2869,7 @@ out_unlock: static int thin_map(struct dm_target *ti, struct bio *bio) { - bio->bi_sector = dm_target_offset(ti, bio->bi_sector); + bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); return thin_bio_map(ti, bio); } diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 4b7941db3af..132b3154d46 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -493,9 +493,9 @@ static int verity_map(struct dm_target *ti, struct bio *bio) struct dm_verity_io *io; bio->bi_bdev = v->data_dev->bdev; - bio->bi_sector = verity_map_sector(v, bio->bi_sector); + bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector); - if (((unsigned)bio->bi_sector | bio_sectors(bio)) & + if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { DMERR_LIMIT("unaligned io"); return -EIO; @@ -514,8 +514,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io->v = v; io->orig_bi_end_io = bio->bi_end_io; io->orig_bi_private = bio->bi_private; - io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); - io->n_blocks = bio->bi_size >> v->data_dev_block_bits; + io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); + io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; bio->bi_end_io = verity_end_io; bio->bi_private = io; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0704c523a76..ccd064ea4fe 100644 --- a/drivers/md/dm.c +++ 
b/drivers/md/dm.c @@ -575,7 +575,7 @@ static void start_io_acct(struct dm_io *io) atomic_inc_return(&md->pending[rw])); if (unlikely(dm_stats_used(&md->stats))) - dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, bio_sectors(bio), false, 0, &io->stats_aux); } @@ -593,7 +593,7 @@ static void end_io_acct(struct dm_io *io) part_stat_unlock(); if (unlikely(dm_stats_used(&md->stats))) - dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, bio_sectors(bio), true, duration, &io->stats_aux); /* @@ -742,7 +742,7 @@ static void dec_pending(struct dm_io *io, int error) if (io_error == DM_ENDIO_REQUEUE) return; - if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { + if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { /* * Preflush done for flush with data, reissue * without REQ_FLUSH. @@ -797,7 +797,7 @@ static void end_clone_bio(struct bio *clone, int error) struct dm_rq_clone_bio_info *info = clone->bi_private; struct dm_rq_target_io *tio = info->tio; struct bio *bio = info->orig; - unsigned int nr_bytes = info->orig->bi_size; + unsigned int nr_bytes = info->orig->bi_iter.bi_size; bio_put(clone); @@ -1128,7 +1128,7 @@ static void __map_bio(struct dm_target_io *tio) * this io. */ atomic_inc(&tio->io->io_count); - sector = clone->bi_sector; + sector = clone->bi_iter.bi_sector; r = ti->type->map(ti, clone); if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ @@ -1160,13 +1160,13 @@ struct clone_info { static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) { - bio->bi_sector = sector; - bio->bi_size = to_bytes(len); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = to_bytes(len); } static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) { - bio->bi_idx = idx; + bio->bi_iter.bi_idx = idx; bio->bi_vcnt = idx + bv_count; bio->bi_flags &= ~(1 << BIO_SEG_VALID); } @@ -1202,7 +1202,7 @@ static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, clone->bi_rw = bio->bi_rw; clone->bi_vcnt = 1; clone->bi_io_vec->bv_offset = offset; - clone->bi_io_vec->bv_len = clone->bi_size; + clone->bi_io_vec->bv_len = clone->bi_iter.bi_size; clone->bi_flags |= 1 << BIO_CLONED; clone_bio_integrity(bio, clone, idx, len, offset, 1); @@ -1222,7 +1222,8 @@ static void clone_bio(struct dm_target_io *tio, struct bio *bio, bio_setup_sector(clone, sector, len); bio_setup_bv(clone, idx, bv_count); - if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) + if (idx != bio->bi_iter.bi_idx || + clone->bi_iter.bi_size < bio->bi_iter.bi_size) trim = 1; clone_bio_integrity(bio, clone, idx, len, 0, trim); } @@ -1510,8 +1511,8 @@ static void __split_and_process_bio(struct mapped_device *md, ci.io->bio = bio; ci.io->md = md; spin_lock_init(&ci.io->endio_lock); - ci.sector = bio->bi_sector; - ci.idx = bio->bi_idx; + ci.sector = bio->bi_iter.bi_sector; + ci.idx = bio->bi_iter.bi_idx; start_io_acct(ci.io); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 3193aefe982..e8b4574956c 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -74,8 +74,8 @@ static void faulty_fail(struct bio *bio, int error) { struct bio *b = bio->bi_private; - b->bi_size = bio->bi_size; - b->bi_sector = bio->bi_sector; + b->bi_iter.bi_size = bio->bi_iter.bi_size; + b->bi_iter.bi_sector = bio->bi_iter.bi_sector; bio_put(bio); @@ -185,26 +185,31 @@ static void make_request(struct mddev *mddev, struct bio 
*bio) return; } - if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE)) + if (check_sector(conf, bio->bi_iter.bi_sector, + bio_end_sector(bio), WRITE)) failit = 1; if (check_mode(conf, WritePersistent)) { - add_sector(conf, bio->bi_sector, WritePersistent); + add_sector(conf, bio->bi_iter.bi_sector, + WritePersistent); failit = 1; } if (check_mode(conf, WriteTransient)) failit = 1; } else { /* read request */ - if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ)) + if (check_sector(conf, bio->bi_iter.bi_sector, + bio_end_sector(bio), READ)) failit = 1; if (check_mode(conf, ReadTransient)) failit = 1; if (check_mode(conf, ReadPersistent)) { - add_sector(conf, bio->bi_sector, ReadPersistent); + add_sector(conf, bio->bi_iter.bi_sector, + ReadPersistent); failit = 1; } if (check_mode(conf, ReadFixable)) { - add_sector(conf, bio->bi_sector, ReadFixable); + add_sector(conf, bio->bi_iter.bi_sector, + ReadFixable); failit = 1; } } diff --git a/drivers/md/linear.c b/drivers/md/linear.c index f03fabd2b37..fb3b0d04edf 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -297,19 +297,19 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) } rcu_read_lock(); - tmp_dev = which_dev(mddev, bio->bi_sector); + tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; - if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) - || (bio->bi_sector < start_sector))) { + if (unlikely(bio->bi_iter.bi_sector >= (tmp_dev->end_sector) + || (bio->bi_iter.bi_sector < start_sector))) { char b[BDEVNAME_SIZE]; printk(KERN_ERR "md/linear:%s: make_request: Sector %llu out of bounds on " "dev %s: %llu sectors, offset %llu\n", mdname(mddev), - (unsigned long long)bio->bi_sector, + (unsigned long long)bio->bi_iter.bi_sector, bdevname(tmp_dev->rdev->bdev, b), (unsigned long long)tmp_dev->rdev->sectors, (unsigned long long)start_sector); @@ -326,7 +326,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) rcu_read_unlock(); - bp = bio_split(bio, end_sector - bio->bi_sector); + bp = bio_split(bio, end_sector - bio->bi_iter.bi_sector); linear_make_request(mddev, &bp->bio1); linear_make_request(mddev, &bp->bio2); @@ -335,7 +335,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) } bio->bi_bdev = tmp_dev->rdev->bdev; - bio->bi_sector = bio->bi_sector - start_sector + bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector + tmp_dev->rdev->data_offset; rcu_read_unlock(); diff --git a/drivers/md/md.c b/drivers/md/md.c index 739b1ec54e2..b07fed398fd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -393,7 +393,7 @@ static void md_submit_flush_data(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; - if (bio->bi_size == 0) + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio, 0); else { @@ -754,7 +754,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -785,13 +785,13 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 
rdev->meta_bdev : rdev->bdev; if (metadata_op) - bio->bi_sector = sector + rdev->sb_start; + bio->bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) - bio->bi_sector = sector + rdev->new_data_offset; + bio->bi_iter.bi_sector = sector + rdev->new_data_offset; else - bio->bi_sector = sector + rdev->data_offset; + bio->bi_iter.bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); submit_bio_wait(rw, bio); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 1642eae75a3..849ad39f547 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -100,7 +100,7 @@ static void multipath_end_request(struct bio *bio, int error) md_error (mp_bh->mddev, rdev); printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", bdevname(rdev->bdev,b), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else multipath_end_bh_io(mp_bh, error); @@ -132,7 +132,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) multipath = conf->multipaths + mp_bh->path; mp_bh->bio = *bio; - mp_bh->bio.bi_sector += multipath->rdev->data_offset; + mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; mp_bh->bio.bi_bdev = multipath->rdev->bdev; mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; @@ -355,21 +355,22 @@ static void multipathd(struct md_thread *thread) spin_unlock_irqrestore(&conf->device_lock, flags); bio = &mp_bh->bio; - bio->bi_sector = mp_bh->master_bio->bi_sector; + bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; if ((mp_bh->path = multipath_map (conf))<0) { printk(KERN_ALERT "multipath: %s: unrecoverable IO read" " error for block %llu\n", bdevname(bio->bi_bdev,b), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); multipath_end_bh_io(mp_bh, -EIO); } else { printk(KERN_ERR "multipath: %s: redirecting sector %llu" " to another IO path\n", bdevname(bio->bi_bdev,b), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); *bio = *(mp_bh->master_bio); - bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; + bio->bi_iter.bi_sector += + conf->multipaths[mp_bh->path].rdev->data_offset; bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; bio->bi_rw |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c4d420b7d2f..e38d1d3226f 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -501,10 +501,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, unsigned int chunk_sects, struct bio *bio) { if (likely(is_power_of_2(chunk_sects))) { - return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) + return chunk_sects >= + ((bio->bi_iter.bi_sector & (chunk_sects-1)) + bio_sectors(bio)); } else{ - sector_t sector = bio->bi_sector; + sector_t sector = bio->bi_iter.bi_sector; return chunk_sects >= (sector_div(sector, chunk_sects) + bio_sectors(bio)); } @@ -524,7 +525,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) chunk_sects = mddev->chunk_sectors; if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { - sector_t sector = bio->bi_sector; + sector_t sector = bio->bi_iter.bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ if (bio_segments(bio) > 1) @@ 
-544,12 +545,12 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) return; } - sector_offset = bio->bi_sector; + sector_offset = bio->bi_iter.bi_sector; zone = find_zone(mddev->private, &sector_offset); - tmp_dev = map_sector(mddev, zone, bio->bi_sector, + tmp_dev = map_sector(mddev, zone, bio->bi_iter.bi_sector, &sector_offset); bio->bi_bdev = tmp_dev->bdev; - bio->bi_sector = sector_offset + zone->dev_start + + bio->bi_iter.bi_sector = sector_offset + zone->dev_start + tmp_dev->data_offset; if (unlikely((bio->bi_rw & REQ_DISCARD) && @@ -566,7 +567,8 @@ bad_map: printk("md/raid0:%s: make_request bug: can't convert block across chunks" " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects / 2, - (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); + (unsigned long long)bio->bi_iter.bi_sector, + bio_sectors(bio) / 2); bio_io_error(bio); return; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1e5a540995e..db3b9d7314f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -229,7 +229,7 @@ static void call_bio_endio(struct r1bio *r1_bio) int done; struct r1conf *conf = r1_bio->mddev->private; sector_t start_next_window = r1_bio->start_next_window; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; if (bio->bi_phys_segments) { unsigned long flags; @@ -265,9 +265,8 @@ static void raid_end_bio_io(struct r1bio *r1_bio) if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { pr_debug("raid1: sync end %s on sectors %llu-%llu\n", (bio_data_dir(bio) == WRITE) ? "write" : "read", - (unsigned long long) bio->bi_sector, - (unsigned long long) bio->bi_sector + - bio_sectors(bio) - 1); + (unsigned long long) bio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(bio) - 1); call_bio_endio(r1_bio); } @@ -466,9 +465,8 @@ static void raid1_end_write_request(struct bio *bio, int error) struct bio *mbio = r1_bio->master_bio; pr_debug("raid1: behind end write sectors" " %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - bio_sectors(mbio) - 1); + (unsigned long long) mbio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(mbio) - 1); call_bio_endio(r1_bio); } } @@ -875,7 +873,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) else if ((conf->next_resync - RESYNC_WINDOW_SECTORS >= bio_end_sector(bio)) || (conf->next_resync + NEXT_NORMALIO_DISTANCE - <= bio->bi_sector) + <= bio->bi_iter.bi_sector) wait = false; else wait = true; @@ -913,19 +911,19 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) if (bio && bio_data_dir(bio) == WRITE) { if (conf->next_resync + NEXT_NORMALIO_DISTANCE - <= bio->bi_sector) { + <= bio->bi_iter.bi_sector) { if (conf->start_next_window == MaxSector) conf->start_next_window = conf->next_resync + NEXT_NORMALIO_DISTANCE; if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) - <= bio->bi_sector) + <= bio->bi_iter.bi_sector) conf->next_window_requests++; else conf->current_window_requests++; } - if (bio->bi_sector >= conf->start_next_window) + if (bio->bi_iter.bi_sector >= conf->start_next_window) sector = conf->start_next_window; } @@ -1028,7 +1026,8 @@ do_sync_io: if (bvecs[i].bv_page) put_page(bvecs[i].bv_page); kfree(bvecs); - pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); + pr_debug("%dB behind alloc failed, doing sync I/O\n", + bio->bi_iter.bi_size); } struct raid1_plug_cb { @@ -1108,7 +1107,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) if (bio_data_dir(bio) ==
WRITE && bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_sector < mddev->suspend_hi) { + bio->bi_iter.bi_sector < mddev->suspend_hi) { /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. @@ -1119,7 +1118,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_sector >= mddev->suspend_hi) + bio->bi_iter.bi_sector >= mddev->suspend_hi) break; schedule(); } @@ -1141,7 +1140,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) r1_bio->sectors = bio_sectors(bio); r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector; + r1_bio->sector = bio->bi_iter.bi_sector; /* We might need to issue multiple reads to different * devices if there are bad blocks around, so we keep @@ -1181,12 +1180,13 @@ read_again: r1_bio->read_disk = rdisk; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(read_bio, r1_bio->sector - bio->bi_sector, + bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); r1_bio->bios[rdisk] = read_bio; - read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; + read_bio->bi_iter.bi_sector = r1_bio->sector + + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; read_bio->bi_rw = READ | do_sync; @@ -1198,7 +1198,7 @@ read_again: */ sectors_handled = (r1_bio->sector + max_sectors - - bio->bi_sector); + - bio->bi_iter.bi_sector); r1_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (bio->bi_phys_segments == 0) @@ -1219,7 +1219,8 @@ read_again: r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; + r1_bio->sector = bio->bi_iter.bi_sector + + sectors_handled; goto read_again; } else generic_make_request(read_bio); @@ -1322,7 +1323,7 @@ read_again: if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf, start_next_window, bio->bi_sector); + allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); md_wait_for_blocked_rdev(blocked_rdev, mddev); start_next_window = wait_barrier(conf, bio); /* @@ -1349,7 +1350,7 @@ read_again: bio->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); } - sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; + sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector; atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); @@ -1361,7 +1362,7 @@ read_again: continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors); + bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); if (first_clone) { /* do behind I/O ? 
@@ -1395,7 +1396,7 @@ read_again: r1_bio->bios[i] = mbio; - mbio->bi_sector = (r1_bio->sector + + mbio->bi_iter.bi_sector = (r1_bio->sector + conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; @@ -1435,7 +1436,7 @@ read_again: r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; goto retry_write; } @@ -1959,14 +1960,14 @@ static int process_checks(struct r1bio *r1_bio) /* fixup the bio for reuse */ bio_reset(b); b->bi_vcnt = vcnt; - b->bi_size = r1_bio->sectors << 9; - b->bi_sector = r1_bio->sector + + b->bi_iter.bi_size = r1_bio->sectors << 9; + b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; b->bi_bdev = conf->mirrors[i].rdev->bdev; b->bi_end_io = end_sync_read; b->bi_private = r1_bio; - size = b->bi_size; + size = b->bi_iter.bi_size; for (j = 0; j < vcnt ; j++) { struct bio_vec *bi; bi = &b->bi_io_vec[j]; @@ -2221,11 +2222,11 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) } wbio->bi_rw = WRITE; - wbio->bi_sector = r1_bio->sector; - wbio->bi_size = r1_bio->sectors << 9; + wbio->bi_iter.bi_sector = r1_bio->sector; + wbio->bi_iter.bi_size = r1_bio->sectors << 9; bio_trim(wbio, sector - r1_bio->sector, sectors); - wbio->bi_sector += rdev->data_offset; + wbio->bi_iter.bi_sector += rdev->data_offset; wbio->bi_bdev = rdev->bdev; if (submit_bio_wait(WRITE, wbio) == 0) /* failure! */ @@ -2339,7 +2340,8 @@ read_more: } r1_bio->read_disk = disk; bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); - bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors); + bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; printk_ratelimited(KERN_ERR @@ -2348,7 +2350,7 @@ read_more: mdname(mddev), (unsigned long long)r1_bio->sector, bdevname(rdev->bdev, b)); - bio->bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; bio->bi_rw = READ | do_sync; @@ -2357,7 +2359,7 @@ read_more: /* Drat - have to split this up more */ struct bio *mbio = r1_bio->master_bio; int sectors_handled = (r1_bio->sector + max_sectors - - mbio->bi_sector); + - mbio->bi_iter.bi_sector); r1_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (mbio->bi_phys_segments == 0) @@ -2375,7 +2377,8 @@ read_more: r1_bio->state = 0; set_bit(R1BIO_ReadError, &r1_bio->state); r1_bio->mddev = mddev; - r1_bio->sector = mbio->bi_sector + sectors_handled; + r1_bio->sector = mbio->bi_iter.bi_sector + + sectors_handled; goto read_more; } else @@ -2599,7 +2602,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp } if (bio->bi_end_io) { atomic_inc(&rdev->nr_pending); - bio->bi_sector = sector_nr + rdev->data_offset; + bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_private = r1_bio; } @@ -2699,7 +2702,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp continue; /* remove last page from this bio */ bio->bi_vcnt--; - bio->bi_size -= len; + bio->bi_iter.bi_size -= len; bio->bi_flags &= ~(1<< BIO_SEG_VALID); } goto bio_full; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c504e8389e6..dbf3b63c275 100644 --- a/drivers/md/raid10.c +++ 
b/drivers/md/raid10.c @@ -1182,7 +1182,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* If this request crosses a chunk boundary, we need to * split it. This will only happen for 1 PAGE (or less) requests. */ - if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio) + if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + bio_sectors(bio) > chunk_sects && (conf->geo.near_copies < conf->geo.raid_disks || conf->prev.near_copies < conf->prev.raid_disks))) { @@ -1193,8 +1193,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. */ - bp = bio_split(bio, - chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); + bp = bio_split(bio, chunk_sects - + (bio->bi_iter.bi_sector & (chunk_sects - 1))); /* Each of these 'make_request' calls will call 'wait_barrier'. * If the first succeeds but the second blocks due to the resync @@ -1221,7 +1221,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) bad_map: printk("md/raid10:%s: make_request bug: can't convert block across chunks" " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, - (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); + (unsigned long long)bio->bi_iter.bi_sector, + bio_sectors(bio) / 2); bio_io_error(bio); return; @@ -1238,24 +1239,25 @@ static void make_request(struct mddev *mddev, struct bio * bio) sectors = bio_sectors(bio); while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - bio->bi_sector < conf->reshape_progress && - bio->bi_sector + sectors > conf->reshape_progress) { + bio->bi_iter.bi_sector < conf->reshape_progress && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { /* IO spans the reshape position. Need to wait for * reshape to pass */ allow_barrier(conf); wait_event(conf->wait_barrier, - conf->reshape_progress <= bio->bi_sector || - conf->reshape_progress >= bio->bi_sector + sectors); + conf->reshape_progress <= bio->bi_iter.bi_sector || + conf->reshape_progress >= bio->bi_iter.bi_sector + + sectors); wait_barrier(conf); } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio_data_dir(bio) == WRITE && (mddev->reshape_backwards - ? (bio->bi_sector < conf->reshape_safe && - bio->bi_sector + sectors > conf->reshape_progress) - : (bio->bi_sector + sectors > conf->reshape_safe && - bio->bi_sector < conf->reshape_progress))) { + ? 
(bio->bi_iter.bi_sector < conf->reshape_safe && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) + : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && + bio->bi_iter.bi_sector < conf->reshape_progress))) { /* Need to update reshape_position in metadata */ mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1273,7 +1275,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) r10_bio->sectors = sectors; r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector; + r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; /* We might need to issue multiple reads to different @@ -1302,13 +1304,13 @@ read_again: slot = r10_bio->read_slot; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(read_bio, r10_bio->sector - bio->bi_sector, + bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; - read_bio->bi_sector = r10_bio->devs[slot].addr + + read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); read_bio->bi_bdev = rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; @@ -1320,7 +1322,7 @@ read_again: * need another r10_bio. */ sectors_handled = (r10_bio->sectors + max_sectors - - bio->bi_sector); + - bio->bi_iter.bi_sector); r10_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (bio->bi_phys_segments == 0) @@ -1341,7 +1343,8 @@ read_again: r10_bio->sectors = bio_sectors(bio) - sectors_handled; r10_bio->state = 0; r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector + sectors_handled; + r10_bio->sector = bio->bi_iter.bi_sector + + sectors_handled; goto read_again; } else generic_make_request(read_bio); @@ -1499,7 +1502,8 @@ retry_write: bio->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); } - sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; + sectors_handled = r10_bio->sector + max_sectors - + bio->bi_iter.bi_sector; atomic_set(&r10_bio->remaining, 1); bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); @@ -1510,11 +1514,11 @@ retry_write: if (r10_bio->devs[i].bio) { struct md_rdev *rdev = conf->mirrors[d].rdev; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r10_bio->sector - bio->bi_sector, + bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].bio = mbio; - mbio->bi_sector = (r10_bio->devs[i].addr+ + mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ choose_data_offset(r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; @@ -1553,11 +1557,11 @@ retry_write: rdev = conf->mirrors[d].rdev; } mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r10_bio->sector - bio->bi_sector, + bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].repl_bio = mbio; - mbio->bi_sector = (r10_bio->devs[i].addr + + mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + choose_data_offset( r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; @@ -1591,7 +1595,7 @@ retry_write: r10_bio->sectors = bio_sectors(bio) - sectors_handled; r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector + sectors_handled; + r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; r10_bio->state = 0; goto retry_write; } @@ -2124,10 +2128,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) bio_reset(tbio); tbio->bi_vcnt = vcnt; - tbio->bi_size = r10_bio->sectors << 9; + tbio->bi_iter.bi_size = r10_bio->sectors << 9; tbio->bi_rw = WRITE; 
tbio->bi_private = r10_bio; - tbio->bi_sector = r10_bio->devs[i].addr; + tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; for (j=0; j < vcnt ; j++) { tbio->bi_io_vec[j].bv_offset = 0; @@ -2144,7 +2148,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); - tbio->bi_sector += conf->mirrors[d].rdev->data_offset; + tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; tbio->bi_bdev = conf->mirrors[d].rdev->bdev; generic_make_request(tbio); } @@ -2614,8 +2618,8 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(wbio, sector - bio->bi_sector, sectors); - wbio->bi_sector = (r10_bio->devs[i].addr+ + bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); + wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ choose_data_offset(r10_bio, rdev) + (sector - r10_bio->sector)); wbio->bi_bdev = rdev->bdev; @@ -2687,10 +2691,10 @@ read_more: (unsigned long long)r10_bio->sector); bio = bio_clone_mddev(r10_bio->master_bio, GFP_NOIO, mddev); - bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors); + bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].rdev = rdev; - bio->bi_sector = r10_bio->devs[slot].addr + bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); bio->bi_bdev = rdev->bdev; bio->bi_rw = READ | do_sync; @@ -2701,7 +2705,7 @@ read_more: struct bio *mbio = r10_bio->master_bio; int sectors_handled = r10_bio->sector + max_sectors - - mbio->bi_sector; + - mbio->bi_iter.bi_sector; r10_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (mbio->bi_phys_segments == 0) @@ -2719,7 +2723,7 @@ read_more: set_bit(R10BIO_ReadError, &r10_bio->state); r10_bio->mddev = mddev; - r10_bio->sector = mbio->bi_sector + r10_bio->sector = mbio->bi_iter.bi_sector + sectors_handled; goto read_more; @@ -3157,7 +3161,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_end_io = end_sync_read; bio->bi_rw = READ; from_addr = r10_bio->devs[j].addr; - bio->bi_sector = from_addr + rdev->data_offset; + bio->bi_iter.bi_sector = from_addr + + rdev->data_offset; bio->bi_bdev = rdev->bdev; atomic_inc(&rdev->nr_pending); /* and we write to 'i' (if not in_sync) */ @@ -3181,7 +3186,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - bio->bi_sector = to_addr + bio->bi_iter.bi_sector = to_addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; atomic_inc(&r10_bio->remaining); @@ -3210,7 +3215,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - bio->bi_sector = to_addr + rdev->data_offset; + bio->bi_iter.bi_sector = to_addr + + rdev->data_offset; bio->bi_bdev = rdev->bdev; atomic_inc(&r10_bio->remaining); break; @@ -3328,7 +3334,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - bio->bi_sector = sector + + bio->bi_iter.bi_sector = sector + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; count++; @@ -3350,7 +3356,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, 
bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - bio->bi_sector = sector + + bio->bi_iter.bi_sector = sector + conf->mirrors[d].replacement->data_offset; bio->bi_bdev = conf->mirrors[d].replacement->bdev; count++; @@ -3397,7 +3403,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, bio2 = bio2->bi_next) { /* remove last page from this bio */ bio2->bi_vcnt--; - bio2->bi_size -= len; + bio2->bi_iter.bi_size -= len; bio2->bi_flags &= ~(1<< BIO_SEG_VALID); } goto bio_full; @@ -4417,7 +4423,7 @@ read_more: read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); read_bio->bi_bdev = rdev->bdev; - read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; read_bio->bi_end_io = end_sync_read; @@ -4425,7 +4431,7 @@ read_more: read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); read_bio->bi_flags |= 1 << BIO_UPTODATE; read_bio->bi_vcnt = 0; - read_bio->bi_size = 0; + read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; @@ -4451,7 +4457,8 @@ read_more: bio_reset(b); b->bi_bdev = rdev2->bdev; - b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; + b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + + rdev2->new_data_offset; b->bi_private = r10_bio; b->bi_end_io = end_reshape_write; b->bi_rw = WRITE; @@ -4478,7 +4485,7 @@ read_more: bio2 = bio2->bi_next) { /* Remove last page from this bio */ bio2->bi_vcnt--; - bio2->bi_size -= len; + bio2->bi_iter.bi_size -= len; bio2->bi_flags &= ~(1<< BIO_SEG_VALID); } goto bio_full; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ ... @@ - if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) + if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) return bio->bi_next; else return NULL; @@ -225,7 +225,7 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; - bi->bi_size = 0; + bi->bi_iter.bi_size = 0; trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); bio_endio(bi, 0); @@ -854,10 +854,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_rw, i); atomic_inc(&sh->count); if (use_new_offset(conf, sh)) - bi->bi_sector = (sh->sector + bi->bi_iter.bi_sector = (sh->sector + rdev->new_data_offset); else - bi->bi_sector = (sh->sector + bi->bi_iter.bi_sector = (sh->sector + rdev->data_offset); if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_NOMERGE; @@ -865,7 +865,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; + bi->bi_iter.bi_size = STRIPE_SIZE; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -901,15 +901,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_rw, i); atomic_inc(&sh->count); if (use_new_offset(conf, sh)) - rbi->bi_sector = (sh->sector + rbi->bi_iter.bi_sector = (sh->sector + rrdev->new_data_offset); else - rbi->bi_sector = (sh->sector + rbi->bi_iter.bi_sector = (sh->sector + rrdev->data_offset); rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; - rbi->bi_size = STRIPE_SIZE; + rbi->bi_iter.bi_size = STRIPE_SIZE; /* * If this is discard request, set bi_vcnt 0. 
We don't * want to confuse SCSI because SCSI will replace payload @@ -944,10 +944,10 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, struct async_submit_ctl submit; enum async_tx_flags flags = 0; - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; + if (bio->bi_iter.bi_sector >= sector) + page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; else - page_offset = (signed)(sector - bio->bi_sector) * -512; + page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; if (frombio) flags |= ASYNC_TX_FENCE; @@ -1014,7 +1014,7 @@ static void ops_complete_biofill(void *stripe_head_ref) BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; - while (rbi && rbi->bi_sector < + while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); if (!raid5_dec_bi_active_stripes(rbi)) { @@ -1050,7 +1050,7 @@ static void ops_run_biofill(struct stripe_head *sh) dev->read = rbi = dev->toread; dev->toread = NULL; spin_unlock_irq(&sh->stripe_lock); - while (rbi && rbi->bi_sector < + while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { tx = async_copy_data(0, rbi, dev->page, dev->sector, tx); @@ -1392,7 +1392,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) wbi = dev->written = chosen; spin_unlock_irq(&sh->stripe_lock); - while (wbi && wbi->bi_sector < + while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { if (wbi->bi_rw & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); @@ -2616,7 +2616,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in int firstwrite=0; pr_debug("adding bi b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_sector, + (unsigned long long)bi->bi_iter.bi_sector, (unsigned long long)sh->sector); /* @@ -2634,12 +2634,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in firstwrite = 1; } else bip = &sh->dev[dd_idx].toread; - while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if (bio_end_sector(*bip) > bi->bi_sector) + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { + if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) goto overlap; bip = & (*bip)->bi_next; } - if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); @@ -2653,7 +2653,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in sector_t sector = sh->dev[dd_idx].sector; for (bi=sh->dev[dd_idx].towrite; sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && - bi && bi->bi_sector <= sector; + bi && bi->bi_iter.bi_sector <= sector; bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { if (bio_end_sector(bi) >= sector) sector = bio_end_sector(bi); @@ -2663,7 +2663,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in } pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_sector, + (unsigned long long)(*bip)->bi_iter.bi_sector, (unsigned long long)sh->sector, dd_idx); spin_unlock_irq(&sh->stripe_lock); @@ -2738,7 +2738,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, 
&bi->bi_flags); @@ -2757,7 +2757,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = sh->dev[i].written; sh->dev[i].written = NULL; if (bi) bitmap_end = 1; - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -2781,7 +2781,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); @@ -3005,7 +3005,7 @@ static void handle_stripe_clean_event(struct r5conf *conf, clear_bit(R5_UPTODATE, &dev->flags); wbi = dev->written; dev->written = NULL; - while (wbi && wbi->bi_sector < + while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); if (!raid5_dec_bi_active_stripes(wbi)) { @@ -4097,7 +4097,7 @@ static int raid5_mergeable_bvec(struct request_queue *q, static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); @@ -4234,9 +4234,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) /* * compute position */ - align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, - 0, - &dd_idx, NULL); + align_bi->bi_iter.bi_sector = + raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, + 0, &dd_idx, NULL); end_sector = bio_end_sector(align_bi); rcu_read_lock(); @@ -4261,7 +4261,8 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), + is_badblock(rdev, align_bi->bi_iter.bi_sector, + bio_sectors(align_bi), &first_bad, &bad_sectors)) { /* too big in some way, or has a known bad block */ bio_put(align_bi); @@ -4270,7 +4271,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) } /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_sector += rdev->data_offset; + align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, @@ -4282,7 +4283,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), align_bi, disk_devt(mddev->gendisk), - raid_bio->bi_sector); + raid_bio->bi_iter.bi_sector); generic_make_request(align_bi); return 1; } else { @@ -4465,8 +4466,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) /* Skip discard while reshape is happening */ return; - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ @@ -4570,7 +4571,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) return; } - logical_sector = bi->bi_sector & 
~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ @@ -5054,7 +5055,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) int remaining; int handled = 0; - logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = raid_bio->bi_iter.bi_sector & + ~((sector_t)STRIPE_SECTORS-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); last_sector = bio_end_sector(raid_bio); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 6eca019bcf3..16814a8457f 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -819,7 +819,8 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) dev_info = bio->bi_bdev->bd_disk->private_data; if (dev_info == NULL) goto fail; - if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0) + if ((bio->bi_iter.bi_sector & 7) != 0 || + (bio->bi_iter.bi_size & 4095) != 0) /* Request is not page-aligned. */ goto fail; if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) { @@ -842,7 +843,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) } } - index = (bio->bi_sector >> 3); + index = (bio->bi_iter.bi_sector >> 3); bio_for_each_segment(bvec, bio, i) { page_addr = (unsigned long) page_address(bvec->bv_page) + bvec->bv_offset; diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 464dd29d06c..dd4e73fdb32 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -190,15 +190,16 @@ static void xpram_make_request(struct request_queue *q, struct bio *bio) unsigned long bytes; int i; - if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0) + if ((bio->bi_iter.bi_sector & 7) != 0 || + (bio->bi_iter.bi_size & 4095) != 0) /* Request is not page-aligned. */ goto fail; - if ((bio->bi_size >> 12) > xdev->size) + if ((bio->bi_iter.bi_size >> 12) > xdev->size) /* Request size is no page-aligned. 
*/ goto fail; - if ((bio->bi_sector >> 3) > 0xffffffffU - xdev->offset) + if ((bio->bi_iter.bi_sector >> 3) > 0xffffffffU - xdev->offset) goto fail; - index = (bio->bi_sector >> 3) + xdev->offset; + index = (bio->bi_iter.bi_sector >> 3) + xdev->offset; bio_for_each_segment(bvec, bio, i) { page_addr = (unsigned long) kmap(bvec->bv_page) + bvec->bv_offset; diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index aa66361ed44..bac04c2335a 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -731,7 +731,7 @@ static int _osd_req_list_objects(struct osd_request *or, bio->bi_rw &= ~REQ_WRITE; or->in.bio = bio; - or->in.total_bytes = bio->bi_size; + or->in.total_bytes = bio->bi_iter.bi_size; return 0; } diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index e2421ea6135..53741be754b 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -220,7 +220,7 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) for (bio = head; bio != NULL; bio = bio->bi_next) { LASSERT(rw == bio->bi_rw); - offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset; + offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset; bio_for_each_segment(bvec, bio, i) { BUG_ON(bvec->bv_offset != 0); BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE); @@ -313,7 +313,8 @@ static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req) bio = &lo->lo_bio; while (*bio && (*bio)->bi_rw == rw) { CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n", - (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size, + (unsigned long long)(*bio)->bi_iter.bi_sector, + (*bio)->bi_iter.bi_size, page_count, (*bio)->bi_vcnt); if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS) break; @@ -347,7 +348,8 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio) goto err; CDEBUG(D_INFO, "submit bio sector %llu size %u\n", - (unsigned long long)old_bio->bi_sector, old_bio->bi_size); + (unsigned long long)old_bio->bi_iter.bi_sector, + old_bio->bi_iter.bi_size); spin_lock_irq(&lo->lo_lock); inactive = (lo->lo_state != LLOOP_BOUND); @@ -367,7 +369,7 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio) loop_add_bio(lo, old_bio); return; err: - cfs_bio_io_error(old_bio, old_bio->bi_size); + cfs_bio_io_error(old_bio, old_bio->bi_iter.bi_size); } @@ -378,7 +380,7 @@ static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio) while (bio) { struct bio *tmp = bio->bi_next; bio->bi_next = NULL; - cfs_bio_endio(bio, bio->bi_size, ret); + cfs_bio_endio(bio, bio->bi_iter.bi_size, ret); bio = tmp; } } diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 79ce363b2ea..e9e6f984092 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -171,13 +171,14 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio) u64 start, end, bound; /* unaligned request */ - if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + if (unlikely(bio->bi_iter.bi_sector & + (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) return 0; - if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) return 0; - start = bio->bi_sector; - end = start + (bio->bi_size >> SECTOR_SHIFT); + start = bio->bi_iter.bi_sector; + end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT); bound = 
zram->disksize >> SECTOR_SHIFT; /* out of range range */ if (unlikely(start >= bound || end > bound || start > end)) @@ -684,8 +685,9 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) break; } - index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; - offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; + offset = (bio->bi_iter.bi_sector & + (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; bio_for_each_segment(bvec, bio, i) { int max_transfer_size = PAGE_SIZE - offset; diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index c87959f1276..2d29356d0c8 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -319,7 +319,7 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num) bio->bi_bdev = ib_dev->ibd_bd; bio->bi_private = cmd; bio->bi_end_io = &iblock_bio_done; - bio->bi_sector = lba; + bio->bi_iter.bi_sector = lba; return bio; } diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index fc60b31453e..08e3d1388c6 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -215,9 +215,9 @@ unsigned int bio_integrity_tag_size(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - BUG_ON(bio->bi_size == 0); + BUG_ON(bio->bi_iter.bi_size == 0); - return bi->tag_size * (bio->bi_size / bi->sector_size); + return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size); } EXPORT_SYMBOL(bio_integrity_tag_size); @@ -300,7 +300,7 @@ static void bio_integrity_generate(struct bio *bio) struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; struct bio_vec *bv; - sector_t sector = bio->bi_sector; + sector_t sector = bio->bi_iter.bi_sector; unsigned int i, sectors, total; void *prot_buf = bio->bi_integrity->bip_buf; @@ -387,7 +387,7 @@ int bio_integrity_prep(struct bio *bio) bip->bip_owns_buf = 1; bip->bip_buf = buf; bip->bip_size = len; - bip->bip_sector = bio->bi_sector; + bip->bip_sector = bio->bi_iter.bi_sector; /* Map it */ offset = offset_in_page(buf); diff --git a/fs/bio.c b/fs/bio.c index 33d79a4eb92..a402ad6e753 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -532,13 +532,13 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_sector = bio_src->bi_sector; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_bdev = bio_src->bi_bdev; bio->bi_flags |= 1 << BIO_CLONED; bio->bi_rw = bio_src->bi_rw; bio->bi_vcnt = bio_src->bi_vcnt; - bio->bi_size = bio_src->bi_size; - bio->bi_idx = bio_src->bi_idx; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + bio->bi_iter.bi_idx = bio_src->bi_iter.bi_idx; } EXPORT_SYMBOL(__bio_clone); @@ -612,7 +612,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) return 0; /* @@ -635,8 +635,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page simulate merging updated prev_bvec as new bvec. 
*/ .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size - prev_bv_len, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = bio->bi_iter.bi_size - + prev_bv_len, .bi_rw = bio->bi_rw, }; @@ -684,8 +685,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (q->merge_bvec_fn) { struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = bio->bi_iter.bi_size, .bi_rw = bio->bi_rw, }; @@ -708,7 +709,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page bio->bi_vcnt++; bio->bi_phys_segments++; done: - bio->bi_size += len; + bio->bi_iter.bi_size += len; return len; } @@ -807,22 +808,22 @@ void bio_advance(struct bio *bio, unsigned bytes) if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); - bio->bi_sector += bytes >> 9; - bio->bi_size -= bytes; + bio->bi_iter.bi_sector += bytes >> 9; + bio->bi_iter.bi_size -= bytes; if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) return; while (bytes) { - if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { + if (unlikely(bio->bi_iter.bi_idx >= bio->bi_vcnt)) { WARN_ONCE(1, "bio idx %d >= vcnt %d\n", - bio->bi_idx, bio->bi_vcnt); + bio->bi_iter.bi_idx, bio->bi_vcnt); break; } if (bytes >= bio_iovec(bio)->bv_len) { bytes -= bio_iovec(bio)->bv_len; - bio->bi_idx++; + bio->bi_iter.bi_idx++; } else { bio_iovec(bio)->bv_len -= bytes; bio_iovec(bio)->bv_offset += bytes; @@ -1485,7 +1486,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (IS_ERR(bio)) return bio; - if (bio->bi_size == len) + if (bio->bi_iter.bi_size == len) return bio; /* @@ -1763,16 +1764,16 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) return bp; trace_block_split(bdev_get_queue(bi->bi_bdev), bi, - bi->bi_sector + first_sectors); + bi->bi_iter.bi_sector + first_sectors); BUG_ON(bio_segments(bi) > 1); atomic_set(&bp->cnt, 3); bp->error = 0; bp->bio1 = *bi; bp->bio2 = *bi; - bp->bio2.bi_sector += first_sectors; - bp->bio2.bi_size -= first_sectors << 9; - bp->bio1.bi_size = first_sectors << 9; + bp->bio2.bi_iter.bi_sector += first_sectors; + bp->bio2.bi_iter.bi_size -= first_sectors << 9; + bp->bio1.bi_iter.bi_size = first_sectors << 9; if (bi->bi_vcnt != 0) { bp->bv1 = *bio_iovec(bi); @@ -1821,21 +1822,22 @@ void bio_trim(struct bio *bio, int offset, int size) int sofar = 0; size <<= 9; - if (offset == 0 && size == bio->bi_size) + if (offset == 0 && size == bio->bi_iter.bi_size) return; clear_bit(BIO_SEG_VALID, &bio->bi_flags); bio_advance(bio, offset << 9); - bio->bi_size = size; + bio->bi_iter.bi_size = size; /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, - (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_idx; - bio->bi_idx = 0; + if (bio->bi_iter.bi_idx) { + memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_iter.bi_idx, + (bio->bi_vcnt - bio->bi_iter.bi_idx) * + sizeof(struct bio_vec)); + bio->bi_vcnt -= bio->bi_iter.bi_idx; + bio->bi_iter.bi_idx = 0; } /* Make sure vcnt and last bv are not too big */ bio_for_each_segment(bvec, bio, i) { @@ -1871,7 +1873,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index, sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); sectors = 0; - if (index >= bio->bi_idx) + if (index >= bio->bi_iter.bi_idx) index = bio->bi_vcnt - 1; bio_for_each_segment_all(bv, bio, i) { diff --git 
a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 131d82800b3..cb05e1c842c 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1695,7 +1695,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } bio->bi_bdev = block_ctx->dev->bdev; - bio->bi_sector = dev_bytenr >> 9; + bio->bi_iter.bi_sector = dev_bytenr >> 9; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -3013,7 +3013,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) int bio_is_patched; char **mapped_datav; - dev_bytenr = 512 * bio->bi_sector; + dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) @@ -3021,8 +3021,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) "submit_bio(rw=0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", rw, bio->bi_vcnt, - (unsigned long long)bio->bi_sector, dev_bytenr, - bio->bi_bdev); + (unsigned long long)bio->bi_iter.bi_sector, + dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, GFP_NOFS); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index eac6784e43d..f5cdeb4b553 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -172,7 +172,8 @@ static void end_compressed_bio_read(struct bio *bio, int err) goto out; inode = cb->inode; - ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + ret = check_compressed_csum(inode, cb, + (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -370,7 +371,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; - if (bio->bi_size) + if (bio->bi_iter.bi_size) ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, PAGE_CACHE_SIZE, bio, 0); @@ -504,7 +505,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (!em || last_offset < em->start || (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || - (em->block_start >> 9) != cb->orig_bio->bi_sector) { + (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, last_offset, end); unlock_page(page); @@ -550,7 +551,7 @@ next: * in it. We don't actually do IO on those pages but allocate new ones * to hold the compressed pages on disk. 
* - * bio->bi_sector points to the compressed extent on disk + * bio->bi_iter.bi_sector points to the compressed extent on disk * bio->bi_io_vec points to all of the inode pages * bio->bi_vcnt is a count of pages * @@ -571,7 +572,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct page *page; struct block_device *bdev; struct bio *comp_bio; - u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; struct extent_map *em; @@ -657,7 +658,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = inode->i_mapping; page->index = em_start >> PAGE_CACHE_SHIFT; - if (comp_bio->bi_size) + if (comp_bio->bi_iter.bi_size) ret = tree->ops->merge_bio_hook(READ, page, 0, PAGE_CACHE_SIZE, comp_bio, 0); @@ -685,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += (comp_bio->bi_size + root->sectorsize - 1) / - root->sectorsize; + sums += (comp_bio->bi_iter.bi_size + + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8b5f9e1d1f0..bcb6f1b780d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1984,7 +1984,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; map_length = length; ret = btrfs_map_block(fs_info, WRITE, logical, @@ -1995,7 +1995,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } BUG_ON(mirror_num != bbio->mirror_num); sector = bbio->stripes[mirror_num-1].physical >> 9; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; kfree(bbio); if (!dev || !dev->bdev || !dev->writeable) { @@ -2268,9 +2268,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, return -EIO; } bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; + bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { @@ -2412,7 +2412,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%lu\n", (u64)bio->bi_sector, err, + "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2543,7 +2543,7 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, if (bio) { bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; btrfs_bio = btrfs_io_bio(bio); btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; @@ -2637,7 +2637,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret && *bio_ret) { bio = *bio_ret; if (old_compressed) - contig = bio->bi_sector == sector; + contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6f384886028..84a46a42d26 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (!path) return -ENOMEM; - nblocks = 
bio->bi_size >> inode->i_sb->s_blocksize_bits; + nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, @@ -201,7 +201,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, csum = (u8 *)dst; } - if (bio->bi_size > PAGE_CACHE_SIZE * 8) + if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; WARN_ON(bio->bi_vcnt <= 0); @@ -217,7 +217,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, path->skip_locking = 1; } - disk_bytenr = (u64)bio->bi_sector << 9; + disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; if (dio) offset = logical_offset; while (bio_index < bio->bi_vcnt) { @@ -302,7 +302,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 offset) { - int len = (bio->bi_sector << 9) - dip->disk_bytenr; + int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; @@ -447,11 +447,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, u64 offset; WARN_ON(bio->bi_vcnt <= 0); - sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size), + GFP_NOFS); if (!sums) return -ENOMEM; - sums->len = bio->bi_size; + sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); if (contig) @@ -461,7 +462,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = (u64)bio->bi_sector << 9; + sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; while (bio_index < bio->bi_vcnt) { @@ -476,7 +477,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, btrfs_add_ordered_sum(inode, ordered, sums); btrfs_put_ordered_extent(ordered); - bytes_left = bio->bi_size - total_bytes; + bytes_left = bio->bi_iter.bi_size - total_bytes; sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); @@ -484,7 +485,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ((u64)bio->bi_sector << 9) + + sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + total_bytes; index = 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d6630dc130b..7ab0e94ad49 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1577,7 +1577,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -1585,7 +1585,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); @@ -6894,7 +6894,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " "sector %#Lx len %u err no %d\n", btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_sector, bio->bi_size, err); + (unsigned long long)bio->bi_iter.bi_sector, + 
bio->bi_iter.bi_size, err); dip->errors = 1; /* @@ -6985,7 +6986,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; - u64 start_sector = orig_bio->bi_sector; + u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; @@ -6993,7 +6994,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int ret = 0; int async_submit = 0; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -7001,7 +7002,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, return -EIO; } - if (map_length >= orig_bio->bi_size) { + if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; goto submit; } @@ -7053,7 +7054,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); @@ -7111,7 +7112,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (!skip_sum && !write) { csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; + sum_len = dio_bio->bi_iter.bi_size >> + inode->i_sb->s_blocksize_bits; sum_len *= csum_size; } else { sum_len = 0; @@ -7126,8 +7128,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; dip->errors = 0; dip->orig_bio = io_bio; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24ac21840a9..9af0b25d991 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1032,8 +1032,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - last_end = (u64)last->bi_sector << 9; - last_end += last->bi_size; + last_end = (u64)last->bi_iter.bi_sector << 9; + last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different @@ -1053,9 +1053,9 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, if (!bio) return -ENOMEM; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; - bio->bi_sector = disk_start >> 9; + bio->bi_iter.bi_sector = disk_start >> 9; set_bit(BIO_UPTODATE, &bio->bi_flags); bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); @@ -1111,7 +1111,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { - start = (u64)bio->bi_sector << 9; + start = (u64)bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->raid_map[0]; page_index = stripe_offset >> PAGE_CACHE_SHIFT; @@ -1272,7 +1272,7 @@ cleanup: static int find_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 physical = bio->bi_sector; + u64 physical = bio->bi_iter.bi_sector; u64 stripe_start; int i; struct btrfs_bio_stripe *stripe; @@ -1298,7 +1298,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, static int 
find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = bio->bi_sector; + u64 logical = bio->bi_iter.bi_sector; u64 stripe_start; int i; @@ -1602,8 +1602,8 @@ static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) plug_list); struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, plug_list); - u64 a_sector = ra->bio_list.head->bi_sector; - u64 b_sector = rb->bio_list.head->bi_sector; + u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; + u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; if (a_sector < b_sector) return -1; @@ -1691,7 +1691,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, if (IS_ERR(rbio)) return PTR_ERR(rbio); bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_size; + rbio->bio_list_bytes = bio->bi_iter.bi_size; /* * don't plug on full rbios, just get them out the door @@ -2044,7 +2044,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->read_rebuild = 1; bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_size; + rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1fd3f33c330..bb9a928fa3a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1308,7 +1308,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } bio->bi_bdev = page->dev->bdev; - bio->bi_sector = page->physical >> 9; + bio->bi_iter.bi_sector = page->physical >> 9; bio_add_page(bio, page->page, PAGE_SIZE, 0); if (btrfsic_submit_bio_wait(READ, bio)) @@ -1427,7 +1427,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; - bio->bi_sector = page_bad->physical >> 9; + bio->bi_iter.bi_sector = page_bad->physical >> 9; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1520,7 +1520,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1926,7 +1926,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -3371,8 +3371,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, spin_unlock(&sctx->stat_lock); return -ENOMEM; } - bio->bi_size = 0; - bio->bi_sector = physical_for_dev_replace >> 9; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); if (ret != PAGE_CACHE_SIZE) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 92303f42baa..f2130de0ddc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5411,7 +5411,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, if (!q->merge_bvec_fn) return 1; - bvm.bi_size = bio->bi_size - prev->bv_len; + bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) return 0; return 1; @@ -5426,7 +5426,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, bio->bi_private = bbio; 
btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; - bio->bi_sector = physical >> 9; + bio->bi_iter.bi_sector = physical >> 9; #ifdef DEBUG { struct rcu_string *name; @@ -5464,7 +5464,7 @@ again: while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset) < bvec->bv_len) { - u64 len = bio->bi_size; + u64 len = bio->bi_iter.bi_size; atomic_inc(&bbio->stripes_pending); submit_stripe_bio(root, bbio, bio, physical, dev_nr, @@ -5486,7 +5486,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; - bio->bi_sector = logical >> 9; + bio->bi_iter.bi_sector = logical >> 9; kfree(bbio); bio_endio(bio, -EIO); } @@ -5497,7 +5497,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, { struct btrfs_device *dev; struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; u64 *raid_map = NULL; @@ -5506,7 +5506,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int total_devs = 1; struct btrfs_bio *bbio = NULL; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, diff --git a/fs/buffer.c b/fs/buffer.c index 6024877335c..1c04ec66974 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2982,11 +2982,11 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) * let it through, and the IO layer will turn it into * an EIO. */ - if (unlikely(bio->bi_sector >= maxsector)) + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; - maxsector -= bio->bi_sector; - bytes = bio->bi_size; + maxsector -= bio->bi_iter.bi_sector; + bytes = bio->bi_iter.bi_size; if (likely((bytes >> 9) <= maxsector)) return; @@ -2994,7 +2994,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) bytes = maxsector << 9; /* Truncate the bio.. */ - bio->bi_size = bytes; + bio->bi_iter.bi_size = bytes; bio->bi_io_vec[0].bv_len = bytes; /* ..and clear the end of the buffer for reads */ @@ -3029,14 +3029,14 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) */ bio = bio_alloc(GFP_NOIO, 1); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; bio->bi_io_vec[0].bv_len = bh->b_size; bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; - bio->bi_size = bh->b_size; + bio->bi_iter.bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; diff --git a/fs/direct-io.c b/fs/direct-io.c index 0e04142d596..160a5489a93 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -375,7 +375,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio = bio_alloc(GFP_KERNEL, nr_vecs); bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else @@ -719,7 +719,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, if (sdio->bio) { loff_t cur_offset = sdio->cur_page_fs_offset; loff_t bio_next_offset = sdio->logical_offset_in_bio + - sdio->bio->bi_size; + sdio->bio->bi_iter.bi_size; /* * See whether this new request is contiguous with the old. 
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a31e4da1450..ab95508e3d4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -298,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; BUG_ON(!io_end); bio->bi_end_io = NULL; @@ -366,7 +366,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); if (!bio) return -ENOMEM; - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a4949096cf4..a2c8de8ba6c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -386,7 +386,7 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, bio = f2fs_bio_alloc(bdev, 1); /* Initialize the bio */ - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); bio->bi_end_io = read_end_io; if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a90c6bc0d12..36e8afd8e1e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -682,7 +682,7 @@ retry: bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks); - sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + sbi->bio[type]->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); sbi->bio[type]->bi_private = priv; /* * The end_io will be assigned at the sumbission phase. 
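The filesystem hunks before and after this point all apply the same mechanical substitution: the position state that used to sit directly in struct bio is now reached through bio->bi_iter, while the surrounding arithmetic (sector-to-byte shifts, residual byte counts, block counts) is unchanged. A minimal sketch of that idiom under the new layout; the two helper names below are invented for illustration and do not appear in the patch:

#include <linux/bio.h>
#include <linux/types.h>

/* Illustrative helpers only: byte-level view of a bio after the rename. */
static inline u64 bio_start_byte(const struct bio *bio)
{
        /* before this series: (u64)bio->bi_sector << 9 */
        return (u64)bio->bi_iter.bi_sector << 9;
}

static inline unsigned int bio_bytes_left(const struct bio *bio)
{
        /* before this series: bio->bi_size */
        return bio->bi_iter.bi_size;
}

Expressions in the btrfs hunks such as disk_bytenr = (u64)bio->bi_iter.bi_sector << 9 and nblocks = bio->bi_iter.bi_size >> s_blocksize_bits are exactly this pattern; only the field path changes, never the arithmetic.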
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 010b9fb9fec..985da945f0b 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -272,7 +272,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) nrvecs = max(nrvecs/2, 1U); } - bio->bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 82303b47495..16194da9165 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) lock_page(page); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio_add_page(bio, page, PAGE_SIZE, 0); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e9a97a0d431..3f999649587 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -63,7 +63,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); bio = bio_alloc(GFP_NOIO, 1); - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = sb->s_bdev; if (!(rw & WRITE) && data) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 360d27c4888..8d811e02b4b 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1998,20 +1998,20 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; bio->bi_io_vec[0].bv_page = bp->l_page; bio->bi_io_vec[0].bv_len = LOGPSIZE; bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_size = LOGPSIZE; + bio->bi_iter.bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; lbmIODone(bio, 0); } else { submit_bio(READ_SYNC, bio); @@ -2144,21 +2144,21 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO\n"); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; bio->bi_io_vec[0].bv_page = bp->l_page; bio->bi_io_vec[0].bv_len = LOGPSIZE; bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_size = LOGPSIZE; + bio->bi_iter.bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; /* check if journaling to disk has been disabled */ if (log->no_integrity) { - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; lbmIODone(bio, 0); } else { submit_bio(WRITE_SYNC, bio); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index d165cde0c68..49ba7ff1bbb 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) * count from hitting zero before we're through */ inc_io(page); - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(WRITE, bio); nr_underway++; @@ -438,7 +438,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_iter.bi_sector = pblock 
<< (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; @@ -452,7 +452,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) if (bio) { if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) goto add_failed; - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(WRITE, bio); @@ -517,7 +517,8 @@ static int metapage_readpage(struct file *fp, struct page *page) bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_iter.bi_sector = + pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; len = xlen << inode->i_blkbits; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index e6df3be3b31..76279e11982 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -26,9 +26,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio_vec.bv_len = PAGE_SIZE; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; - bio.bi_sector = page->index * (PAGE_SIZE >> 9); + bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); + bio.bi_iter.bi_size = PAGE_SIZE; return submit_bio_wait(rw, &bio); } @@ -92,9 +92,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_size = i * PAGE_SIZE; + bio->bi_iter.bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; atomic_inc(&super->s_pending_writes); @@ -119,9 +119,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unlock_page(page); } bio->bi_vcnt = nr_pages; - bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; atomic_inc(&super->s_pending_writes); @@ -184,9 +184,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_size = i * PAGE_SIZE; + bio->bi_iter.bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; atomic_inc(&super->s_pending_writes); @@ -205,9 +205,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_io_vec[i].bv_offset = 0; } bio->bi_vcnt = nr_pages; - bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; atomic_inc(&super->s_pending_writes); diff --git a/fs/mpage.c b/fs/mpage.c index dd6d5878f4d..4979ffa60aa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -93,7 +93,7 @@ mpage_alloc(struct block_device *bdev, if (bio) { bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; } return bio; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index da768923bf7..56ff823ca82 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -134,8 +134,8 @@ bl_submit_bio(int rw, struct bio *bio) if (bio) { get_parallel(bio->bi_private); 
dprintk("%s submitting %s bio %u@%llu\n", __func__, - rw == READ ? "read" : "write", - bio->bi_size, (unsigned long long)bio->bi_sector); + rw == READ ? "read" : "write", bio->bi_iter.bi_size, + (unsigned long long)bio->bi_iter.bi_sector); submit_bio(rw, bio); } return NULL; @@ -156,7 +156,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, } if (bio) { - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_iter.bi_sector = isect - be->be_f_offset + + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = end_io; bio->bi_private = par; @@ -511,7 +512,7 @@ bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + (offset / SECTOR_SIZE); - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = bl_read_single_end_io; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 2d8be51f90d..dc3a9efdaab 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -416,7 +416,8 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, } if (likely(bio)) { bio->bi_bdev = nilfs->ns_bdev; - bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); + bio->bi_iter.bi_sector = + start << (nilfs->ns_blocksize_bits - 9); } return bio; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 73920ffda05..bf482dfed14 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -413,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, } /* Must put everything in 512 byte sectors for the bio... */ - bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); + bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71c8c9d2b88..1b19b9cd692 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -407,7 +407,7 @@ xfs_alloc_ioend_bio( struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; return bio; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c7f0b77dcb0..5f3ea443ebb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1255,7 +1255,7 @@ next_chunk: bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; @@ -1277,7 +1277,7 @@ next_chunk: total_nr_pages--; } - if (likely(bio->bi_size)) { + if (likely(bio->bi_iter.bi_size)) { if (xfs_buf_is_vmapped(bp)) { flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); diff --git a/include/linux/bio.h b/include/linux/bio.h index 060ff695085..e2e0bc642ed 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -62,19 +62,19 @@ * on highmem page vectors */ #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) -#define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx) +#define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_iter.bi_idx) #define bio_page(bio) bio_iovec((bio))->bv_page #define bio_offset(bio) bio_iovec((bio))->bv_offset -#define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) -#define bio_sectors(bio) ((bio)->bi_size >> 9) -#define bio_end_sector(bio) ((bio)->bi_sector + 
bio_sectors((bio))) +#define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_iter.bi_idx) +#define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9) +#define bio_end_sector(bio) ((bio)->bi_iter.bi_sector + bio_sectors((bio))) static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio->bi_vcnt) return bio_iovec(bio)->bv_len; else /* dataless requests such as discard */ - return bio->bi_size; + return bio->bi_iter.bi_size; } static inline void *bio_data(struct bio *bio) @@ -108,7 +108,7 @@ static inline void *bio_data(struct bio *bio) */ #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1) -#define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx) +#define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_iter.bi_idx) /* Default implementation of BIOVEC_PHYS_MERGEABLE */ #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ @@ -150,7 +150,7 @@ static inline void *bio_data(struct bio *bio) i++) #define bio_for_each_segment(bvl, bio, i) \ - for (i = (bio)->bi_idx; \ + for (i = (bio)->bi_iter.bi_idx; \ bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ i++) @@ -365,7 +365,7 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) #define bio_kmap_irq(bio, flags) \ - __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) + __bio_kmap_irq((bio), (bio)->bi_iter.bi_idx, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) /* diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 238ef0ed62f..29b5b84d8a2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -28,13 +28,19 @@ struct bio_vec { unsigned int bv_offset; }; +struct bvec_iter { + sector_t bi_sector; /* device address in 512 byte + sectors */ + unsigned int bi_size; /* residual I/O count */ + + unsigned int bi_idx; /* current index into bvl_vec */ +}; + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) */ struct bio { - sector_t bi_sector; /* device address in 512 byte - sectors */ struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; unsigned long bi_flags; /* status, command, etc */ @@ -42,16 +48,13 @@ struct bio { * top bits priority */ - unsigned short bi_vcnt; /* how many bio_vec's */ - unsigned short bi_idx; /* current index into bvl_vec */ + struct bvec_iter bi_iter; /* Number of segments in this BIO after * physical address coalescing is performed. */ unsigned int bi_phys_segments; - unsigned int bi_size; /* residual I/O count */ - /* * To keep track of the max segment size, we account for the * sizes of the first and last mergeable segments in this bio. 
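The blk_types.h hunk above is the core of the change: it introduces struct bvec_iter and embeds it in struct bio as bi_iter, replacing the old bi_sector, bi_size and bi_idx fields, while the bio.h helpers earlier in this diff (bio_sectors(), bio_end_sector(), bio_cur_bytes()) are rewritten on top of it, so most callers see only a field rename. A rough sketch of completion-side code under the new layout, assuming a hypothetical my_end_io() callback that is not part of the patch:

#include <linux/bio.h>
#include <linux/printk.h>

/* Hypothetical bi_end_io-style callback: all per-bio position state is
 * now read through bio->bi_iter rather than struct bio itself. */
static void my_end_io(struct bio *bio, int err)
{
        pr_debug("bio %p: sector %llu..%llu, %u bytes, err %d\n",
                 bio,
                 (unsigned long long)bio->bi_iter.bi_sector, /* was bio->bi_sector */
                 (unsigned long long)bio_end_sector(bio),    /* bi_sector + (bi_size >> 9) */
                 bio->bi_iter.bi_size,                        /* was bio->bi_size */
                 err);
}

Keeping the iterator in its own struct is what the later patches in this series rely on: a caller can take a private copy of bi_iter and advance it without modifying the bio or its biovec array.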
@@ -74,11 +77,13 @@ struct bio { struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif + unsigned short bi_vcnt; /* how many bio_vec's */ + /* * Everything starting with bi_max_vecs will be preserved by bio_reset() */ - unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ + unsigned short bi_max_vecs; /* max bvl_vecs we can hold */ atomic_t bi_cnt; /* pin count */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index e2b9576d00e..095c6e4fe1e 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -24,10 +24,10 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->dev = bio->bi_bdev->bd_dev; __entry->orig_major = d->disk->major; __entry->orig_minor = d->disk->first_minor; - __entry->sector = bio->bi_sector; - __entry->orig_sector = bio->bi_sector - 16; - __entry->nr_sector = bio->bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->sector = bio->bi_iter.bi_sector; + __entry->orig_sector = bio->bi_iter.bi_sector - 16; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", @@ -99,9 +99,9 @@ DECLARE_EVENT_CLASS(bcache_bio, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u", @@ -134,9 +134,9 @@ TRACE_EVENT(bcache_read, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); __entry->cache_hit = hit; __entry->bypass = bypass; ), @@ -162,9 +162,9 @@ TRACE_EVENT(bcache_write, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); __entry->writeback = writeback; __entry->bypass = bypass; ), diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 4c2301d2ef1..e76ae19a8d6 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -243,9 +243,9 @@ TRACE_EVENT(block_bio_bounce, TP_fast_assign( __entry->dev = bio->bi_bdev ? 
bio->bi_bdev->bd_dev : 0; - __entry->sector = bio->bi_sector; + __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -280,10 +280,10 @@ TRACE_EVENT(block_bio_complete, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_sector; + __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u [%d]", @@ -308,9 +308,9 @@ DECLARE_EVENT_CLASS(block_bio_merge, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_sector; + __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -375,9 +375,9 @@ TRACE_EVENT(block_bio_queue, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_sector; + __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -403,7 +403,7 @@ DECLARE_EVENT_CLASS(block_get_rq, TP_fast_assign( __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; - __entry->sector = bio ? bio->bi_sector : 0; + __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? bio_sectors(bio) : 0; blk_fill_rwbs(__entry->rwbs, bio ? 
bio->bi_rw : 0, __entry->nr_sector); @@ -538,9 +538,9 @@ TRACE_EVENT(block_split, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_sector; + __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -579,11 +579,11 @@ TRACE_EVENT(block_bio_remap, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->sector = bio->bi_sector; + __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index e0dc355fa31..bd3ee4fbe7a 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -616,8 +616,8 @@ TRACE_EVENT(f2fs_do_submit_bio, __entry->dev = sb->s_dev; __entry->btype = btype; __entry->sync = sync; - __entry->sector = bio->bi_sector; - __entry->size = bio->bi_size; + __entry->sector = bio->bi_iter.bi_sector; + __entry->size = bio->bi_iter.bi_size; ), TP_printk("dev = (%d,%d), type = %s, io = %s, sector = %lld, size = %u", diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index d09dd10c5a5..9a58bc25881 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -32,7 +32,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio->bi_end_io = end_swap_bio_read; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f785aef6579..b418cb0d724 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -781,8 +781,8 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (!error && !bio_flagged(bio, BIO_UPTODATE)) error = EIO; - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - error, 0, NULL); + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -885,8 +885,9 @@ static void blk_add_trace_split(void *ignore, if (bt) { __be64 rpdu = cpu_to_be64(pdu); - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + __blk_add_trace(bt, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, + !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); } } @@ -918,9 +919,9 @@ static void blk_add_trace_bio_remap(void *ignore, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), - sizeof(r), &r); + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); } /** diff --git a/mm/page_io.c b/mm/page_io.c index 8c79a4764be..f14eded987f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio = bio_alloc(gfp_flags, 1); if (bio) { - bio->bi_sector = map_swap_page(page, &bio->bi_bdev); - bio->bi_sector <<= PAGE_SHIFT - 9; + 
bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); + bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_io_vec[0].bv_page = page; bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; bio->bi_vcnt = 1; - bio->bi_size = PAGE_SIZE; + bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = end_io; } return bio; @@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err) printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } end_page_writeback(page); @@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err) printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); goto out; } -- cgit v1.2.3-70-g09d2 From a4ad39b1d10584dfcfcfb0d510faab2c7f034399 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2013 14:24:32 -0700 Subject: block: Convert bio_iovec() to bvec_iter For immutable biovecs, we'll be introducing a new bio_iovec() that uses our new bvec iterator to construct a biovec, taking into account bvec_iter->bi_bvec_done - this patch updates existing users for the new usage. Some of the existing users really do need a pointer into the bvec array - those uses are all going to be removed, but we'll need the functionality from immutable to remove them - so for now rename the existing bio_iovec() -> __bio_iovec(), and it'll be removed in a couple patches. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: "Ed L. Cashin" Cc: Alasdair Kergon Cc: dm-devel@redhat.com Cc: "James E.J. 
Bottomley" --- drivers/block/aoe/aoecmd.c | 2 +- drivers/md/bcache/io.c | 13 +++++++------ drivers/md/dm-verity.c | 2 +- drivers/scsi/sd.c | 2 +- fs/bio.c | 20 ++++++++++---------- include/linux/bio.h | 10 ++++++---- 6 files changed, 26 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 877ba119b3f..77c24ab1898 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -932,7 +932,7 @@ bufinit(struct buf *buf, struct request *rq, struct bio *bio) buf->resid = bio->bi_iter.bi_size; buf->sector = bio->bi_iter.bi_sector; bio_pageinc(bio); - buf->bv = bio_iovec(bio); + buf->bv = __bio_iovec(bio); buf->bv_resid = buf->bv->bv_len; WARN_ON(buf->bv_resid == 0); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index cc4ba2da5fb..dc44f0689eb 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -22,11 +22,12 @@ static void bch_bi_idx_hack_endio(struct bio *bio, int error) static void bch_generic_make_request_hack(struct bio *bio) { if (bio->bi_iter.bi_idx) { + int i; + struct bio_vec *bv; struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); - memcpy(clone->bi_io_vec, - bio_iovec(bio), - bio_segments(bio) * sizeof(struct bio_vec)); + bio_for_each_segment(bv, bio, i) + clone->bi_io_vec[clone->bi_vcnt++] = *bv; clone->bi_iter.bi_sector = bio->bi_iter.bi_sector; clone->bi_bdev = bio->bi_bdev; @@ -97,7 +98,7 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, if (!ret) return NULL; - memcpy(ret->bi_io_vec, bio_iovec(bio), + memcpy(ret->bi_io_vec, __bio_iovec(bio), sizeof(struct bio_vec) * vcnt); break; @@ -106,7 +107,7 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, if (!ret) return NULL; - memcpy(ret->bi_io_vec, bio_iovec(bio), + memcpy(ret->bi_io_vec, __bio_iovec(bio), sizeof(struct bio_vec) * vcnt); ret->bi_io_vec[vcnt - 1].bv_len = nbytes; @@ -182,7 +183,7 @@ static unsigned bch_bio_max_sectors(struct bio *bio) ret = min(ret, queue_max_sectors(q)); WARN_ON(!ret); - ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); + ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9); return ret; } diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 132b3154d46..5392135924c 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -524,7 +524,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io->io_vec = io->io_vec_inline; else io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); - memcpy(io->io_vec, bio_iovec(bio), + memcpy(io->io_vec, __bio_iovec(bio), io->io_vec_size * sizeof(struct bio_vec)); verity_submit_prefetch(v, io); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e6c4bff0433..200d6bc8124 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -801,7 +801,7 @@ static int sd_setup_write_same_cmnd(struct scsi_device *sdp, struct request *rq) if (sdkp->device->no_write_same) return BLKPREP_KILL; - BUG_ON(bio_offset(bio) || bio_iovec(bio)->bv_len != sdp->sector_size); + BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size); sector >>= ilog2(sdp->sector_size) - 9; nr_sectors >>= ilog2(sdp->sector_size) - 9; diff --git a/fs/bio.c b/fs/bio.c index a402ad6e753..7bb281fc3d5 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -821,12 +821,12 @@ void bio_advance(struct bio *bio, unsigned bytes) break; } - if (bytes >= bio_iovec(bio)->bv_len) { - bytes -= bio_iovec(bio)->bv_len; + if (bytes >= bio_iovec(bio).bv_len) { + bytes -= bio_iovec(bio).bv_len; bio->bi_iter.bi_idx++; } else { - bio_iovec(bio)->bv_len -= bytes; - 
bio_iovec(bio)->bv_offset += bytes; + bio_iovec(bio).bv_len -= bytes; + bio_iovec(bio).bv_offset += bytes; bytes = 0; } } @@ -879,8 +879,8 @@ void bio_copy_data(struct bio *dst, struct bio *src) unsigned src_offset, dst_offset, bytes; void *src_p, *dst_p; - src_bv = bio_iovec(src); - dst_bv = bio_iovec(dst); + src_bv = __bio_iovec(src); + dst_bv = __bio_iovec(dst); src_offset = src_bv->bv_offset; dst_offset = dst_bv->bv_offset; @@ -893,7 +893,7 @@ void bio_copy_data(struct bio *dst, struct bio *src) if (!src) break; - src_bv = bio_iovec(src); + src_bv = __bio_iovec(src); } src_offset = src_bv->bv_offset; @@ -906,7 +906,7 @@ void bio_copy_data(struct bio *dst, struct bio *src) if (!dst) break; - dst_bv = bio_iovec(dst); + dst_bv = __bio_iovec(dst); } dst_offset = dst_bv->bv_offset; @@ -1776,8 +1776,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) bp->bio1.bi_iter.bi_size = first_sectors << 9; if (bi->bi_vcnt != 0) { - bp->bv1 = *bio_iovec(bi); - bp->bv2 = *bio_iovec(bi); + bp->bv1 = bio_iovec(bi); + bp->bv2 = bio_iovec(bi); if (bio_is_rw(bi)) { bp->bv2.bv_offset += first_sectors << 9; diff --git a/include/linux/bio.h b/include/linux/bio.h index e2e0bc642ed..9f182fcbe71 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -62,9 +62,11 @@ * on highmem page vectors */ #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) -#define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_iter.bi_idx) -#define bio_page(bio) bio_iovec((bio))->bv_page -#define bio_offset(bio) bio_iovec((bio))->bv_offset +#define __bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_iter.bi_idx) +#define bio_iovec(bio) (*__bio_iovec(bio)) + +#define bio_page(bio) (bio_iovec((bio)).bv_page) +#define bio_offset(bio) (bio_iovec((bio)).bv_offset) #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_iter.bi_idx) #define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9) #define bio_end_sector(bio) ((bio)->bi_iter.bi_sector + bio_sectors((bio))) @@ -72,7 +74,7 @@ static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio->bi_vcnt) - return bio_iovec(bio)->bv_len; + return bio_iovec(bio).bv_len; else /* dataless requests such as discard */ return bio->bi_iter.bi_size; } -- cgit v1.2.3-70-g09d2 From 7988613b0e5b2638caf6cd493cc78e9595eba19c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2013 17:19:00 -0800 Subject: block: Convert bio_for_each_segment() to bvec_iter More prep work for immutable biovecs - with immutable bvecs drivers won't be able to use the biovec directly, they'll need to use helpers that take into account bio->bi_iter.bi_bvec_done. This updates callers for the new usage without changing the implementation yet. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Geert Uytterhoeven Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Ed L. Cashin" Cc: Nick Piggin Cc: Lars Ellenberg Cc: Jiri Kosina Cc: Paul Clements Cc: Jim Paris Cc: Geoff Levand Cc: Yehuda Sadeh Cc: Sage Weil Cc: Alex Elder Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris Cc: Philip Kelleher Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Neil Brown Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux390@de.ibm.com Cc: Nagalakshmi Nandigama Cc: Sreekanth Reddy Cc: support@lsi.com Cc: "James E.J. 
Bottomley" Cc: Greg Kroah-Hartman Cc: Alexander Viro Cc: Steven Whitehouse Cc: Herton Ronaldo Krzesinski Cc: Tejun Heo Cc: Andrew Morton Cc: Guo Chao Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Matthew Wilcox Cc: Keith Busch Cc: Stephen Hemminger Cc: Quoc-Son Anh Cc: Sebastian Ott Cc: Nitin Gupta Cc: Minchan Kim Cc: Jerome Marchand Cc: Seth Jennings Cc: "Martin K. Petersen" Cc: Mike Snitzer Cc: Vivek Goyal Cc: "Darrick J. Wong" Cc: Chris Metcalf Cc: Jan Kara Cc: linux-m68k@lists.linux-m68k.org Cc: linuxppc-dev@lists.ozlabs.org Cc: drbd-user@lists.linbit.com Cc: nbd-general@lists.sourceforge.net Cc: cbe-oss-dev@lists.ozlabs.org Cc: xen-devel@lists.xensource.com Cc: virtualization@lists.linux-foundation.org Cc: linux-raid@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: DL-MPTFusionLinux@lsi.com Cc: linux-scsi@vger.kernel.org Cc: devel@driverdev.osuosl.org Cc: linux-fsdevel@vger.kernel.org Cc: cluster-devel@redhat.com Cc: linux-mm@kvack.org Acked-by: Geoff Levand --- arch/m68k/emu/nfblock.c | 11 ++--- arch/powerpc/sysdev/axonram.c | 18 ++++---- block/blk-core.c | 4 +- block/blk-merge.c | 49 ++++++++++---------- drivers/block/aoe/aoecmd.c | 16 +++---- drivers/block/brd.c | 12 ++--- drivers/block/drbd/drbd_main.c | 27 ++++++----- drivers/block/drbd/drbd_receiver.c | 13 +++--- drivers/block/drbd/drbd_worker.c | 8 ++-- drivers/block/floppy.c | 12 ++--- drivers/block/loop.c | 23 +++++----- drivers/block/mtip32xx/mtip32xx.c | 13 +++--- drivers/block/nbd.c | 12 ++--- drivers/block/nvme-core.c | 33 ++++++++------ drivers/block/ps3disk.c | 10 ++--- drivers/block/ps3vram.c | 10 ++--- drivers/block/rbd.c | 38 ++++++++-------- drivers/block/rsxx/dma.c | 11 ++--- drivers/md/bcache/btree.c | 4 +- drivers/md/bcache/debug.c | 19 ++++---- drivers/md/bcache/io.c | 69 ++++++++++++----------------- drivers/md/bcache/request.c | 26 +++++------ drivers/md/raid5.c | 12 ++--- drivers/s390/block/dasd_diag.c | 10 ++--- drivers/s390/block/dasd_eckd.c | 48 ++++++++++---------- drivers/s390/block/dasd_fba.c | 26 +++++------ drivers/s390/block/dcssblk.c | 16 +++---- drivers/s390/block/scm_blk.c | 8 ++-- drivers/s390/block/scm_blk_cluster.c | 4 +- drivers/s390/block/xpram.c | 10 ++--- drivers/scsi/mpt2sas/mpt2sas_transport.c | 31 ++++++------- drivers/scsi/mpt3sas/mpt3sas_transport.c | 31 ++++++------- drivers/staging/lustre/lustre/llite/lloop.c | 14 +++--- drivers/staging/zram/zram_drv.c | 19 ++++---- fs/bio-integrity.c | 30 +++++++------ fs/bio.c | 22 ++++----- include/linux/bio.h | 28 ++++++------ include/linux/blkdev.h | 7 +-- mm/bounce.c | 44 +++++++++--------- 39 files changed, 401 insertions(+), 397 deletions(-) (limited to 'fs') diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 0a9d0b3c794..2d75ae24616 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -62,17 +62,18 @@ struct nfhd_device { static void nfhd_make_request(struct request_queue *queue, struct bio *bio) { struct nfhd_device *dev = queue->queuedata; - struct bio_vec *bvec; - int i, dir, len, shift; + struct bio_vec bvec; + struct bvec_iter iter; + int dir, len, shift; sector_t sec = bio->bi_iter.bi_sector; dir = bio_data_dir(bio); shift = dev->bshift; - bio_for_each_segment(bvec, bio, i) { - len = bvec->bv_len; + bio_for_each_segment(bvec, bio, iter) { + len = bvec.bv_len; len >>= 9; nfhd_read_write(dev->id, 0, dir, sec >> shift, len >> shift, - bvec_to_phys(bvec)); + bvec_to_phys(&bvec)); sec += len; } bio_endio(bio, 0); diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 
f33bcbaa6a0..47b6b9f81d4 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -109,28 +109,28 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data; unsigned long phys_mem, phys_end; void *user_mem; - struct bio_vec *vec; + struct bio_vec vec; unsigned int transfered; - unsigned short idx; + struct bvec_iter iter; phys_mem = bank->io_addr + (bio->bi_iter.bi_sector << AXON_RAM_SECTOR_SHIFT); phys_end = bank->io_addr + bank->size; transfered = 0; - bio_for_each_segment(vec, bio, idx) { - if (unlikely(phys_mem + vec->bv_len > phys_end)) { + bio_for_each_segment(vec, bio, iter) { + if (unlikely(phys_mem + vec.bv_len > phys_end)) { bio_io_error(bio); return; } - user_mem = page_address(vec->bv_page) + vec->bv_offset; + user_mem = page_address(vec.bv_page) + vec.bv_offset; if (bio_data_dir(bio) == READ) - memcpy(user_mem, (void *) phys_mem, vec->bv_len); + memcpy(user_mem, (void *) phys_mem, vec.bv_len); else - memcpy((void *) phys_mem, user_mem, vec->bv_len); + memcpy((void *) phys_mem, user_mem, vec.bv_len); - phys_mem += vec->bv_len; - transfered += vec->bv_len; + phys_mem += vec.bv_len; + transfered += vec.bv_len; } bio_endio(bio, 0); } diff --git a/block/blk-core.c b/block/blk-core.c index 5c2ab2c7406..5da8e900d3b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2746,10 +2746,10 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, void rq_flush_dcache_pages(struct request *rq) { struct req_iterator iter; - struct bio_vec *bvec; + struct bio_vec bvec; rq_for_each_segment(bvec, rq, iter) - flush_dcache_page(bvec->bv_page); + flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); #endif diff --git a/block/blk-merge.c b/block/blk-merge.c index 03bc083c28c..a1ead9049ed 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,10 +12,11 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { - struct bio_vec *bv, *bvprv = NULL; - int cluster, i, high, highprv = 1; + struct bio_vec bv, bvprv = { NULL }; + int cluster, high, highprv = 1; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; + struct bvec_iter iter; if (!bio) return 0; @@ -25,25 +26,23 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, seg_size = 0; nr_phys_segs = 0; for_each_bio(bio) { - bio_for_each_segment(bv, bio, i) { + bio_for_each_segment(bv, bio, iter) { /* * the trick here is making sure that a high page is * never considered part of another segment, since that * might change with the bounce page. 
*/ - high = page_to_pfn(bv->bv_page) > queue_bounce_pfn(q); - if (high || highprv) - goto new_segment; - if (cluster) { - if (seg_size + bv->bv_len + high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); + if (!high && !highprv && cluster) { + if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) + if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) + if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) goto new_segment; - seg_size += bv->bv_len; + seg_size += bv.bv_len; bvprv = bv; continue; } @@ -54,7 +53,7 @@ new_segment: nr_phys_segs++; bvprv = bv; - seg_size = bv->bv_len; + seg_size = bv.bv_len; highprv = high; } bbio = bio; @@ -110,21 +109,21 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, return 0; } -static void +static inline void __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, - struct scatterlist *sglist, struct bio_vec **bvprv, + struct scatterlist *sglist, struct bio_vec *bvprv, struct scatterlist **sg, int *nsegs, int *cluster) { int nbytes = bvec->bv_len; - if (*bvprv && *cluster) { + if (*sg && *cluster) { if ((*sg)->length + nbytes > queue_max_segment_size(q)) goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec)) + if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec)) + if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) goto new_segment; (*sg)->length += nbytes; @@ -150,7 +149,7 @@ new_segment: sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); (*nsegs)++; } - *bvprv = bvec; + *bvprv = *bvec; } /* @@ -160,7 +159,7 @@ new_segment: int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist) { - struct bio_vec *bvec, *bvprv; + struct bio_vec bvec, bvprv; struct req_iterator iter; struct scatterlist *sg; int nsegs, cluster; @@ -171,10 +170,9 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, /* * for each bio in rq */ - bvprv = NULL; sg = NULL; rq_for_each_segment(bvec, rq, iter) { - __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, + __blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg, &nsegs, &cluster); } /* segments in rq */ @@ -223,18 +221,17 @@ EXPORT_SYMBOL(blk_rq_map_sg); int blk_bio_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist) { - struct bio_vec *bvec, *bvprv; + struct bio_vec bvec, bvprv; struct scatterlist *sg; int nsegs, cluster; - unsigned long i; + struct bvec_iter iter; nsegs = 0; cluster = blk_queue_cluster(q); - bvprv = NULL; sg = NULL; - bio_for_each_segment(bvec, bio, i) { - __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, + bio_for_each_segment(bvec, bio, iter) { + __blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg, &nsegs, &cluster); } /* segments in bio */ diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 77c24ab1898..7a06aec1ded 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -897,15 +897,15 @@ rqbiocnt(struct request *r) static void bio_pageinc(struct bio *bio) { - struct bio_vec *bv; + struct bio_vec bv; struct page *page; - int i; + struct bvec_iter iter; - bio_for_each_segment(bv, bio, i) { + bio_for_each_segment(bv, bio, iter) { /* Non-zero page count for non-head members of * compound pages is no longer allowed by the kernel. 
*/ - page = compound_trans_head(bv->bv_page); + page = compound_trans_head(bv.bv_page); atomic_inc(&page->_count); } } @@ -913,12 +913,12 @@ bio_pageinc(struct bio *bio) static void bio_pagedec(struct bio *bio) { - struct bio_vec *bv; struct page *page; - int i; + struct bio_vec bv; + struct bvec_iter iter; - bio_for_each_segment(bv, bio, i) { - page = compound_trans_head(bv->bv_page); + bio_for_each_segment(bv, bio, iter) { + page = compound_trans_head(bv.bv_page); atomic_dec(&page->_count); } } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 66f5aaae15a..e73b85cf075 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -328,9 +328,9 @@ static void brd_make_request(struct request_queue *q, struct bio *bio) struct block_device *bdev = bio->bi_bdev; struct brd_device *brd = bdev->bd_disk->private_data; int rw; - struct bio_vec *bvec; + struct bio_vec bvec; sector_t sector; - int i; + struct bvec_iter iter; int err = -EIO; sector = bio->bi_iter.bi_sector; @@ -347,10 +347,10 @@ static void brd_make_request(struct request_queue *q, struct bio *bio) if (rw == READA) rw = READ; - bio_for_each_segment(bvec, bio, i) { - unsigned int len = bvec->bv_len; - err = brd_do_bvec(brd, bvec->bv_page, len, - bvec->bv_offset, rw, sector); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + err = brd_do_bvec(brd, bvec.bv_page, len, + bvec.bv_offset, rw, sector); if (err) break; sector += len >> SECTOR_SHIFT; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 9e3818b1bc8..f4e5440aba0 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1537,15 +1537,17 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) { - struct bio_vec *bvec; - int i; + struct bio_vec bvec; + struct bvec_iter iter; + /* hint all but last page with MSG_MORE */ - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment(bvec, bio, iter) { int err; - err = _drbd_no_send_page(mdev, bvec->bv_page, - bvec->bv_offset, bvec->bv_len, - i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); + err = _drbd_no_send_page(mdev, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, + bio_iter_last(bio, iter) + ? 0 : MSG_MORE); if (err) return err; } @@ -1554,15 +1556,16 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) { - struct bio_vec *bvec; - int i; + struct bio_vec bvec; + struct bvec_iter iter; + /* hint all but last page with MSG_MORE */ - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment(bvec, bio, iter) { int err; - err = _drbd_send_page(mdev, bvec->bv_page, - bvec->bv_offset, bvec->bv_len, - i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); + err = _drbd_send_page(mdev, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, + bio_iter_last(bio, iter) ? 
0 : MSG_MORE); if (err) return err; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 5326c22cdb9..d073305ffd5 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1595,9 +1595,10 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size) static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, sector_t sector, int data_size) { - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct bio *bio; - int dgs, err, i, expect; + int dgs, err, expect; void *dig_in = mdev->tconn->int_dig_in; void *dig_vv = mdev->tconn->int_dig_vv; @@ -1617,11 +1618,11 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, bio = req->master_bio; D_ASSERT(sector == bio->bi_iter.bi_sector); - bio_for_each_segment(bvec, bio, i) { - void *mapped = kmap(bvec->bv_page) + bvec->bv_offset; - expect = min_t(int, data_size, bvec->bv_len); + bio_for_each_segment(bvec, bio, iter) { + void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; + expect = min_t(int, data_size, bvec.bv_len); err = drbd_recv_all_warn(mdev->tconn, mapped, expect); - kunmap(bvec->bv_page); + kunmap(bvec.bv_page); if (err) return err; data_size -= expect; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 891c0ecaa29..84d3175d493 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -313,8 +313,8 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * { struct hash_desc desc; struct scatterlist sg; - struct bio_vec *bvec; - int i; + struct bio_vec bvec; + struct bvec_iter iter; desc.tfm = tfm; desc.flags = 0; @@ -322,8 +322,8 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * sg_init_table(&sg, 1); crypto_hash_init(&desc); - bio_for_each_segment(bvec, bio, i) { - sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); + bio_for_each_segment(bvec, bio, iter) { + sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); crypto_hash_update(&desc, &sg, sg.length); } crypto_hash_final(&desc, digest); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 6a86fe7b730..6b29c442282 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2351,7 +2351,7 @@ static void rw_interrupt(void) /* Compute maximal contiguous buffer size. 
*/ static int buffer_chain_size(void) { - struct bio_vec *bv; + struct bio_vec bv; int size; struct req_iterator iter; char *base; @@ -2360,10 +2360,10 @@ static int buffer_chain_size(void) size = 0; rq_for_each_segment(bv, current_req, iter) { - if (page_address(bv->bv_page) + bv->bv_offset != base + size) + if (page_address(bv.bv_page) + bv.bv_offset != base + size) break; - size += bv->bv_len; + size += bv.bv_len; } return size >> 9; @@ -2389,7 +2389,7 @@ static int transfer_size(int ssize, int max_sector, int max_size) static void copy_buffer(int ssize, int max_sector, int max_sector_2) { int remaining; /* number of transferred 512-byte sectors */ - struct bio_vec *bv; + struct bio_vec bv; char *buffer; char *dma_buffer; int size; @@ -2427,10 +2427,10 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) if (!remaining) break; - size = bv->bv_len; + size = bv.bv_len; SUPBOUND(size, remaining); - buffer = page_address(bv->bv_page) + bv->bv_offset; + buffer = page_address(bv.bv_page) + bv.bv_offset; if (dma_buffer + size > floppy_track_buffer + (max_buffer_sectors << 10) || dma_buffer < floppy_track_buffer) { diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f5e39989add..33fde3a3975 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -288,9 +288,10 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) { int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, struct page *page); - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct page *page = NULL; - int i, ret = 0; + int ret = 0; if (lo->transfer != transfer_none) { page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); @@ -302,11 +303,11 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) do_lo_send = do_lo_send_direct_write; } - bio_for_each_segment(bvec, bio, i) { - ret = do_lo_send(lo, bvec, pos, page); + bio_for_each_segment(bvec, bio, iter) { + ret = do_lo_send(lo, &bvec, pos, page); if (ret < 0) break; - pos += bvec->bv_len; + pos += bvec.bv_len; } if (page) { kunmap(page); @@ -392,20 +393,20 @@ do_lo_receive(struct loop_device *lo, static int lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) { - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; ssize_t s; - int i; - bio_for_each_segment(bvec, bio, i) { - s = do_lo_receive(lo, bvec, bsize, pos); + bio_for_each_segment(bvec, bio, iter) { + s = do_lo_receive(lo, &bvec, bsize, pos); if (s < 0) return s; - if (s != bvec->bv_len) { + if (s != bvec.bv_len) { zero_fill_bio(bio); break; } - pos += bvec->bv_len; + pos += bvec.bv_len; } return 0; } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 69e9eb5a6b3..52b2f2a7147 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3962,8 +3962,9 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) { struct driver_data *dd = queue->queuedata; struct scatterlist *sg; - struct bio_vec *bvec; - int i, nents = 0; + struct bio_vec bvec; + struct bvec_iter iter; + int nents = 0; int tag = 0, unaligned = 0; if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { @@ -4026,11 +4027,11 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) } /* Create the scatter list for this bio. 
*/ - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment(bvec, bio, iter) { sg_set_page(&sg[nents], - bvec->bv_page, - bvec->bv_len, - bvec->bv_offset); + bvec.bv_page, + bvec.bv_len, + bvec.bv_offset); nents++; } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2dc3b5153f0..aa362f49321 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -271,7 +271,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) if (nbd_cmd(req) == NBD_CMD_WRITE) { struct req_iterator iter; - struct bio_vec *bvec; + struct bio_vec bvec; /* * we are really probing at internals to determine * whether to set MSG_MORE or not... @@ -281,8 +281,8 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) if (!rq_iter_last(req, iter)) flags = MSG_MORE; dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n", - nbd->disk->disk_name, req, bvec->bv_len); - result = sock_send_bvec(nbd, bvec, flags); + nbd->disk->disk_name, req, bvec.bv_len); + result = sock_send_bvec(nbd, &bvec, flags); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", @@ -378,10 +378,10 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) nbd->disk->disk_name, req); if (nbd_cmd(req) == NBD_CMD_READ) { struct req_iterator iter; - struct bio_vec *bvec; + struct bio_vec bvec; rq_for_each_segment(bvec, req, iter) { - result = sock_recv_bvec(nbd, bvec); + result = sock_recv_bvec(nbd, &bvec); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); @@ -389,7 +389,7 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) return req; } dprintk(DBG_RX, "%s: request %p: got %d bytes data\n", - nbd->disk->disk_name, req, bvec->bv_len); + nbd->disk->disk_name, req, bvec.bv_len); } } return req; diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 53d21738187..5539d292087 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -550,9 +550,11 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { - struct bio_vec *bvec, *bvprv = NULL; + struct bio_vec bvec, bvprv; + struct bvec_iter iter; struct scatterlist *sg = NULL; - int i, length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; + int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; + int first = 1; if (nvmeq->dev->stripe_size) split_len = nvmeq->dev->stripe_size - @@ -560,25 +562,28 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, (nvmeq->dev->stripe_size - 1)); sg_init_table(iod->sg, psegs); - bio_for_each_segment(bvec, bio, i) { - if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { - sg->length += bvec->bv_len; + bio_for_each_segment(bvec, bio, iter) { + if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) { + sg->length += bvec.bv_len; } else { - if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) - return nvme_split_and_submit(bio, nvmeq, i, - length, 0); + if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec)) + return nvme_split_and_submit(bio, nvmeq, + iter.bi_idx, + length, 0); sg = sg ? 
sg + 1 : iod->sg; - sg_set_page(sg, bvec->bv_page, bvec->bv_len, - bvec->bv_offset); + sg_set_page(sg, bvec.bv_page, + bvec.bv_len, bvec.bv_offset); nsegs++; } - if (split_len - length < bvec->bv_len) - return nvme_split_and_submit(bio, nvmeq, i, split_len, - split_len - length); - length += bvec->bv_len; + if (split_len - length < bvec.bv_len) + return nvme_split_and_submit(bio, nvmeq, iter.bi_idx, + split_len, + split_len - length); + length += bvec.bv_len; bvprv = bvec; + first = 0; } iod->nents = nsegs; sg_mark_end(sg); diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 464be78a083..1c6edb9a996 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -94,7 +94,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, { unsigned int offset = 0; struct req_iterator iter; - struct bio_vec *bvec; + struct bio_vec bvec; unsigned int i = 0; size_t size; void *buf; @@ -106,14 +106,14 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, __func__, __LINE__, i, bio_segments(iter.bio), bio_sectors(iter.bio), iter.bio->bi_iter.bi_sector); - size = bvec->bv_len; - buf = bvec_kmap_irq(bvec, &flags); + size = bvec.bv_len; + buf = bvec_kmap_irq(&bvec, &flags); if (gather) memcpy(dev->bounce_buf+offset, buf, size); else memcpy(buf, dev->bounce_buf+offset, size); offset += size; - flush_kernel_dcache_page(bvec->bv_page); + flush_kernel_dcache_page(bvec.bv_page); bvec_kunmap_irq(buf, &flags); i++; } @@ -130,7 +130,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, #ifdef DEBUG unsigned int n = 0; - struct bio_vec *bv; + struct bio_vec bv; struct req_iterator iter; rq_for_each_segment(bv, req, iter) diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 320bbfc9b90..ef45cfb98fd 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -555,14 +555,14 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, const char *op = write ? 
"write" : "read"; loff_t offset = bio->bi_iter.bi_sector << 9; int error = 0; - struct bio_vec *bvec; - unsigned int i; + struct bio_vec bvec; + struct bvec_iter iter; struct bio *next; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment(bvec, bio, iter) { /* PS3 is ppc64, so we don't handle highmem */ - char *ptr = page_address(bvec->bv_page) + bvec->bv_offset; - size_t len = bvec->bv_len, retlen; + char *ptr = page_address(bvec.bv_page) + bvec.bv_offset; + size_t len = bvec.bv_len, retlen; dev_dbg(&dev->core, " %s %zu bytes at offset %llu\n", op, len, offset); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a8f4fe2d4d1..20e8ab35736 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1109,23 +1109,23 @@ static void bio_chain_put(struct bio *chain) */ static void zero_bio_chain(struct bio *chain, int start_ofs) { - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; unsigned long flags; void *buf; - int i; int pos = 0; while (chain) { - bio_for_each_segment(bv, chain, i) { - if (pos + bv->bv_len > start_ofs) { + bio_for_each_segment(bv, chain, iter) { + if (pos + bv.bv_len > start_ofs) { int remainder = max(start_ofs - pos, 0); - buf = bvec_kmap_irq(bv, &flags); + buf = bvec_kmap_irq(&bv, &flags); memset(buf + remainder, 0, - bv->bv_len - remainder); - flush_dcache_page(bv->bv_page); + bv.bv_len - remainder); + flush_dcache_page(bv.bv_page); bvec_kunmap_irq(buf, &flags); } - pos += bv->bv_len; + pos += bv.bv_len; } chain = chain->bi_next; @@ -1173,11 +1173,11 @@ static struct bio *bio_clone_range(struct bio *bio_src, unsigned int len, gfp_t gfpmask) { - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; + struct bvec_iter end_iter; unsigned int resid; - unsigned short idx; unsigned int voff; - unsigned short end_idx; unsigned short vcnt; struct bio *bio; @@ -1196,22 +1196,22 @@ static struct bio *bio_clone_range(struct bio *bio_src, /* Find first affected segment... */ resid = offset; - bio_for_each_segment(bv, bio_src, idx) { - if (resid < bv->bv_len) + bio_for_each_segment(bv, bio_src, iter) { + if (resid < bv.bv_len) break; - resid -= bv->bv_len; + resid -= bv.bv_len; } voff = resid; /* ...and the last affected segment */ resid += len; - __bio_for_each_segment(bv, bio_src, end_idx, idx) { - if (resid <= bv->bv_len) + __bio_for_each_segment(bv, bio_src, end_iter, iter) { + if (resid <= bv.bv_len) break; - resid -= bv->bv_len; + resid -= bv.bv_len; } - vcnt = end_idx - idx + 1; + vcnt = end_iter.bi_idx = iter.bi_idx + 1; /* Build the clone */ @@ -1229,7 +1229,7 @@ static struct bio *bio_clone_range(struct bio *bio_src, * Copy over our part of the bio_vec, then update the first * and last (or only) entries. 
*/ - memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], + memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[iter.bi_idx], vcnt * sizeof (struct bio_vec)); bio->bi_io_vec[0].bv_offset += voff; if (vcnt > 1) { diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 3716633be3c..cf8cd293abb 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -684,7 +684,8 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, void *cb_data) { struct list_head dma_list[RSXX_MAX_TARGETS]; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; unsigned long long addr8; unsigned int laddr; unsigned int bv_len; @@ -722,9 +723,9 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, bv_len -= RSXX_HW_BLK_SIZE; } } else { - bio_for_each_segment(bvec, bio, i) { - bv_len = bvec->bv_len; - bv_off = bvec->bv_offset; + bio_for_each_segment(bvec, bio, iter) { + bv_len = bvec.bv_len; + bv_off = bvec.bv_offset; while (bv_len > 0) { tgt = rsxx_get_dma_tgt(card, addr8); @@ -736,7 +737,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, st = rsxx_queue_dma(card, &dma_list[tgt], bio_data_dir(bio), dma_off, dma_len, - laddr, bvec->bv_page, + laddr, bvec.bv_page, bv_off, cb, cb_data); if (st) goto bvec_err; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 038a6d2aced..b62f3792537 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -362,7 +362,7 @@ static void btree_node_write_done(struct closure *cl) struct bio_vec *bv; int n; - __bio_for_each_segment(bv, b->bio, n, 0) + bio_for_each_segment_all(bv, b->bio, n) __free_page(bv->bv_page); __btree_node_write_done(cl); @@ -421,7 +421,7 @@ static void do_btree_node_write(struct btree *b) struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); - bio_for_each_segment(bv, b->bio, j) + bio_for_each_segment_all(bv, b->bio, j) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 92b3fd468a0..03cb4d114e1 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -173,7 +173,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; struct bio *check; - struct bio_vec *bv; + struct bio_vec bv, *bv2; + struct bvec_iter iter; int i; check = bio_clone(bio, GFP_NOIO); @@ -185,13 +186,13 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) submit_bio_wait(READ_SYNC, check); - bio_for_each_segment(bv, bio, i) { - void *p1 = kmap_atomic(bv->bv_page); - void *p2 = page_address(check->bi_io_vec[i].bv_page); + bio_for_each_segment(bv, bio, iter) { + void *p1 = kmap_atomic(bv.bv_page); + void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page); - cache_set_err_on(memcmp(p1 + bv->bv_offset, - p2 + bv->bv_offset, - bv->bv_len), + cache_set_err_on(memcmp(p1 + bv.bv_offset, + p2 + bv.bv_offset, + bv.bv_len), dc->disk.c, "verify failed at dev %s sector %llu", bdevname(dc->bdev, name), @@ -200,8 +201,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) kunmap_atomic(p1); } - bio_for_each_segment_all(bv, check, i) - __free_page(bv->bv_page); + bio_for_each_segment_all(bv2, check, i) + __free_page(bv2->bv_page); out_put: bio_put(check); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index dc44f0689eb..9b5b6a41a9b 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -22,12 +22,12 @@ static void bch_bi_idx_hack_endio(struct bio *bio, int error) static void bch_generic_make_request_hack(struct bio 
*bio) { if (bio->bi_iter.bi_idx) { - int i; - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); - bio_for_each_segment(bv, bio, i) - clone->bi_io_vec[clone->bi_vcnt++] = *bv; + bio_for_each_segment(bv, bio, iter) + clone->bi_io_vec[clone->bi_vcnt++] = bv; clone->bi_iter.bi_sector = bio->bi_iter.bi_sector; clone->bi_bdev = bio->bi_bdev; @@ -73,8 +73,9 @@ static void bch_generic_make_request_hack(struct bio *bio) struct bio *bch_bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { - unsigned idx = bio->bi_iter.bi_idx, vcnt = 0, nbytes = sectors << 9; - struct bio_vec *bv; + unsigned vcnt = 0, nbytes = sectors << 9; + struct bio_vec bv; + struct bvec_iter iter; struct bio *ret = NULL; BUG_ON(sectors <= 0); @@ -86,49 +87,35 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, ret = bio_alloc_bioset(gfp, 1, bs); if (!ret) return NULL; - idx = 0; goto out; } - bio_for_each_segment(bv, bio, idx) { - vcnt = idx - bio->bi_iter.bi_idx; + bio_for_each_segment(bv, bio, iter) { + vcnt++; - if (!nbytes) { - ret = bio_alloc_bioset(gfp, vcnt, bs); - if (!ret) - return NULL; + if (nbytes <= bv.bv_len) + break; - memcpy(ret->bi_io_vec, __bio_iovec(bio), - sizeof(struct bio_vec) * vcnt); + nbytes -= bv.bv_len; + } - break; - } else if (nbytes < bv->bv_len) { - ret = bio_alloc_bioset(gfp, ++vcnt, bs); - if (!ret) - return NULL; + ret = bio_alloc_bioset(gfp, vcnt, bs); + if (!ret) + return NULL; - memcpy(ret->bi_io_vec, __bio_iovec(bio), - sizeof(struct bio_vec) * vcnt); + bio_for_each_segment(bv, bio, iter) { + ret->bi_io_vec[ret->bi_vcnt++] = bv; - ret->bi_io_vec[vcnt - 1].bv_len = nbytes; - bv->bv_offset += nbytes; - bv->bv_len -= nbytes; + if (ret->bi_vcnt == vcnt) break; - } - - nbytes -= bv->bv_len; } + + ret->bi_io_vec[ret->bi_vcnt - 1].bv_len = nbytes; out: ret->bi_bdev = bio->bi_bdev; ret->bi_iter.bi_sector = bio->bi_iter.bi_sector; ret->bi_iter.bi_size = sectors << 9; ret->bi_rw = bio->bi_rw; - ret->bi_vcnt = vcnt; - ret->bi_max_vecs = vcnt; - - bio->bi_iter.bi_sector += sectors; - bio->bi_iter.bi_size -= sectors << 9; - bio->bi_iter.bi_idx = idx; if (bio_integrity(bio)) { if (bio_integrity_clone(ret, bio, gfp)) { @@ -137,9 +124,10 @@ out: } bio_integrity_trim(ret, 0, bio_sectors(ret)); - bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); } + bio_advance(bio, ret->bi_iter.bi_size); + return ret; } @@ -155,12 +143,13 @@ static unsigned bch_bio_max_sectors(struct bio *bio) if (bio_segments(bio) > max_segments || q->merge_bvec_fn) { - struct bio_vec *bv; - int i, seg = 0; + struct bio_vec bv; + struct bvec_iter iter; + unsigned seg = 0; ret = 0; - bio_for_each_segment(bv, bio, i) { + bio_for_each_segment(bv, bio, iter) { struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_iter.bi_sector, @@ -172,11 +161,11 @@ static unsigned bch_bio_max_sectors(struct bio *bio) break; if (q->merge_bvec_fn && - q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) + q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len) break; seg++; - ret += bv->bv_len >> 9; + ret += bv.bv_len >> 9; } } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 47a9bbc7512..4c0a422fd49 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -198,14 +198,14 @@ static bool verify(struct cached_dev *dc, struct bio *bio) static void bio_csum(struct bio *bio, struct bkey *k) { - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; uint64_t csum = 0; - int i; - 
bio_for_each_segment(bv, bio, i) { - void *d = kmap(bv->bv_page) + bv->bv_offset; - csum = bch_crc64_update(csum, d, bv->bv_len); - kunmap(bv->bv_page); + bio_for_each_segment(bv, bio, iter) { + void *d = kmap(bv.bv_page) + bv.bv_offset; + csum = bch_crc64_update(csum, d, bv.bv_len); + kunmap(bv.bv_page); } k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); @@ -1182,17 +1182,17 @@ void bch_cached_dev_request_init(struct cached_dev *dc) static int flash_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - struct bio_vec *bv; - int i; + struct bio_vec bv; + struct bvec_iter iter; /* Zero fill bio */ - bio_for_each_segment(bv, bio, i) { - unsigned j = min(bv->bv_len >> 9, sectors); + bio_for_each_segment(bv, bio, iter) { + unsigned j = min(bv.bv_len >> 9, sectors); - void *p = kmap(bv->bv_page); - memset(p + bv->bv_offset, 0, j << 9); - kunmap(bv->bv_page); + void *p = kmap(bv.bv_page); + memset(p + bv.bv_offset, 0, j << 9); + kunmap(bv.bv_page); sectors -= j; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5d9c0ee4d6..bef353c51c0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -937,9 +937,9 @@ static struct dma_async_tx_descriptor * async_copy_data(int frombio, struct bio *bio, struct page *page, sector_t sector, struct dma_async_tx_descriptor *tx) { - struct bio_vec *bvl; + struct bio_vec bvl; + struct bvec_iter iter; struct page *bio_page; - int i; int page_offset; struct async_submit_ctl submit; enum async_tx_flags flags = 0; @@ -953,8 +953,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, flags |= ASYNC_TX_FENCE; init_async_submit(&submit, flags, tx, NULL, NULL, NULL); - bio_for_each_segment(bvl, bio, i) { - int len = bvl->bv_len; + bio_for_each_segment(bvl, bio, iter) { + int len = bvl.bv_len; int clen; int b_offset = 0; @@ -970,8 +970,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, clen = len; if (clen > 0) { - b_offset += bvl->bv_offset; - bio_page = bvl->bv_page; + b_offset += bvl.bv_offset; + bio_page = bvl.bv_page; if (frombio) tx = async_memcpy(page, bio_page, page_offset, b_offset, clen, &submit); diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 92bd22ce676..9cbc567698c 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -504,7 +504,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, struct dasd_diag_req *dreq; struct dasd_diag_bio *dbio; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst; unsigned int count, datasize; sector_t recid, first_rec, last_rec; @@ -525,10 +525,10 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, /* Check struct bio and count the number of blocks for the request. */ count = 0; rq_for_each_segment(bv, req, iter) { - if (bv->bv_len & (blksize - 1)) + if (bv.bv_len & (blksize - 1)) /* Fba can only do full blocks. */ return ERR_PTR(-EINVAL); - count += bv->bv_len >> (block->s2b_shift + 9); + count += bv.bv_len >> (block->s2b_shift + 9); } /* Paranoia. 
*/ if (count != last_rec - first_rec + 1) @@ -545,8 +545,8 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, dbio = dreq->bio; recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; - for (off = 0; off < bv->bv_len; off += blksize) { + dst = page_address(bv.bv_page) + bv.bv_offset; + for (off = 0; off < bv.bv_len; off += blksize) { memset(dbio, 0, sizeof (struct dasd_diag_bio)); dbio->type = rw_cmd; dbio->block_number = recid + 1; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index cee7e2708a1..70d17701732 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -2551,7 +2551,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( struct dasd_ccw_req *cqr; struct ccw1 *ccw; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst; unsigned int off; int count, cidaw, cplength, datasize; @@ -2573,13 +2573,13 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( count = 0; cidaw = 0; rq_for_each_segment(bv, req, iter) { - if (bv->bv_len & (blksize - 1)) + if (bv.bv_len & (blksize - 1)) /* Eckd can only do full blocks. */ return ERR_PTR(-EINVAL); - count += bv->bv_len >> (block->s2b_shift + 9); + count += bv.bv_len >> (block->s2b_shift + 9); #if defined(CONFIG_64BIT) - if (idal_is_needed (page_address(bv->bv_page), bv->bv_len)) - cidaw += bv->bv_len >> (block->s2b_shift + 9); + if (idal_is_needed (page_address(bv.bv_page), bv.bv_len)) + cidaw += bv.bv_len >> (block->s2b_shift + 9); #endif } /* Paranoia. */ @@ -2650,16 +2650,16 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( last_rec - recid + 1, cmd, basedev, blksize); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; + dst = page_address(bv.bv_page) + bv.bv_offset; if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); if (copy && rq_data_dir(req) == WRITE) - memcpy(copy + bv->bv_offset, dst, bv->bv_len); + memcpy(copy + bv.bv_offset, dst, bv.bv_len); if (copy) - dst = copy + bv->bv_offset; + dst = copy + bv.bv_offset; } - for (off = 0; off < bv->bv_len; off += blksize) { + for (off = 0; off < bv.bv_len; off += blksize) { sector_t trkid = recid; unsigned int recoffs = sector_div(trkid, blk_per_trk); rcmd = cmd; @@ -2735,7 +2735,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( struct dasd_ccw_req *cqr; struct ccw1 *ccw; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst, *idaw_dst; unsigned int cidaw, cplength, datasize; unsigned int tlf; @@ -2813,8 +2813,8 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( idaw_dst = NULL; idaw_len = 0; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; - seg_len = bv->bv_len; + dst = page_address(bv.bv_page) + bv.bv_offset; + seg_len = bv.bv_len; while (seg_len) { if (new_track) { trkid = recid; @@ -3039,7 +3039,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( { struct dasd_ccw_req *cqr; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst; unsigned int trkcount, ctidaw; unsigned char cmd; @@ -3125,8 +3125,8 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( new_track = 1; recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; - seg_len = bv->bv_len; + dst = page_address(bv.bv_page) + bv.bv_offset; + seg_len = bv.bv_len; while (seg_len) { if (new_track) { 
trkid = recid; @@ -3158,9 +3158,9 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( } } else { rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; + dst = page_address(bv.bv_page) + bv.bv_offset; last_tidaw = itcw_add_tidaw(itcw, 0x00, - dst, bv->bv_len); + dst, bv.bv_len); if (IS_ERR(last_tidaw)) { ret = -EINVAL; goto out_error; @@ -3276,7 +3276,7 @@ static struct dasd_ccw_req *dasd_raw_build_cp(struct dasd_device *startdev, struct dasd_ccw_req *cqr; struct ccw1 *ccw; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst; unsigned char cmd; unsigned int trkcount; @@ -3376,8 +3376,8 @@ static struct dasd_ccw_req *dasd_raw_build_cp(struct dasd_device *startdev, idaws = idal_create_words(idaws, rawpadpage, PAGE_SIZE); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; - seg_len = bv->bv_len; + dst = page_address(bv.bv_page) + bv.bv_offset; + seg_len = bv.bv_len; if (cmd == DASD_ECKD_CCW_READ_TRACK) memset(dst, 0, seg_len); if (!len_to_track_end) { @@ -3422,7 +3422,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) struct dasd_eckd_private *private; struct ccw1 *ccw; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst, *cda; unsigned int blksize, blk_per_trk, off; sector_t recid; @@ -3440,8 +3440,8 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) if (private->uses_cdl == 0 || recid > 2*blk_per_trk) ccw++; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; - for (off = 0; off < bv->bv_len; off += blksize) { + dst = page_address(bv.bv_page) + bv.bv_offset; + for (off = 0; off < bv.bv_len; off += blksize) { /* Skip locate record. */ if (private->uses_cdl && recid <= 2*blk_per_trk) ccw++; @@ -3452,7 +3452,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) cda = (char *)((addr_t) ccw->cda); if (dst != cda) { if (rq_data_dir(req) == READ) - memcpy(dst, cda, bv->bv_len); + memcpy(dst, cda, bv.bv_len); kmem_cache_free(dasd_page_cache, (void *)((addr_t)cda & PAGE_MASK)); } diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 9cbc8c32ba5..2c8e68bf9a1 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -260,7 +260,7 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev, struct dasd_ccw_req *cqr; struct ccw1 *ccw; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst; int count, cidaw, cplength, datasize; sector_t recid, first_rec, last_rec; @@ -283,13 +283,13 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev, count = 0; cidaw = 0; rq_for_each_segment(bv, req, iter) { - if (bv->bv_len & (blksize - 1)) + if (bv.bv_len & (blksize - 1)) /* Fba can only do full blocks. */ return ERR_PTR(-EINVAL); - count += bv->bv_len >> (block->s2b_shift + 9); + count += bv.bv_len >> (block->s2b_shift + 9); #if defined(CONFIG_64BIT) - if (idal_is_needed (page_address(bv->bv_page), bv->bv_len)) - cidaw += bv->bv_len / blksize; + if (idal_is_needed (page_address(bv.bv_page), bv.bv_len)) + cidaw += bv.bv_len / blksize; #endif } /* Paranoia. 
*/ @@ -326,16 +326,16 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev, } recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; + dst = page_address(bv.bv_page) + bv.bv_offset; if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); if (copy && rq_data_dir(req) == WRITE) - memcpy(copy + bv->bv_offset, dst, bv->bv_len); + memcpy(copy + bv.bv_offset, dst, bv.bv_len); if (copy) - dst = copy + bv->bv_offset; + dst = copy + bv.bv_offset; } - for (off = 0; off < bv->bv_len; off += blksize) { + for (off = 0; off < bv.bv_len; off += blksize) { /* Locate record for stupid devices. */ if (private->rdc_data.mode.bits.data_chain == 0) { ccw[-1].flags |= CCW_FLAG_CC; @@ -384,7 +384,7 @@ dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) struct dasd_fba_private *private; struct ccw1 *ccw; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst, *cda; unsigned int blksize, off; int status; @@ -399,8 +399,8 @@ dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) if (private->rdc_data.mode.bits.data_chain != 0) ccw++; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv->bv_page) + bv->bv_offset; - for (off = 0; off < bv->bv_len; off += blksize) { + dst = page_address(bv.bv_page) + bv.bv_offset; + for (off = 0; off < bv.bv_len; off += blksize) { /* Skip locate record. */ if (private->rdc_data.mode.bits.data_chain == 0) ccw++; @@ -411,7 +411,7 @@ dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) cda = (char *)((addr_t) ccw->cda); if (dst != cda) { if (rq_data_dir(req) == READ) - memcpy(dst, cda, bv->bv_len); + memcpy(dst, cda, bv.bv_len); kmem_cache_free(dasd_page_cache, (void *)((addr_t)cda & PAGE_MASK)); } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 16814a8457f..ebf41e228e5 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -808,12 +808,12 @@ static void dcssblk_make_request(struct request_queue *q, struct bio *bio) { struct dcssblk_dev_info *dev_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; unsigned long index; unsigned long page_addr; unsigned long source_addr; unsigned long bytes_done; - int i; bytes_done = 0; dev_info = bio->bi_bdev->bd_disk->private_data; @@ -844,21 +844,21 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) } index = (bio->bi_iter.bi_sector >> 3); - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment(bvec, bio, iter) { page_addr = (unsigned long) - page_address(bvec->bv_page) + bvec->bv_offset; + page_address(bvec.bv_page) + bvec.bv_offset; source_addr = dev_info->start + (index<<12) + bytes_done; - if (unlikely((page_addr & 4095) != 0) || (bvec->bv_len & 4095) != 0) + if (unlikely((page_addr & 4095) != 0) || (bvec.bv_len & 4095) != 0) // More paranoia. 
goto fail; if (bio_data_dir(bio) == READ) { memcpy((void*)page_addr, (void*)source_addr, - bvec->bv_len); + bvec.bv_len); } else { memcpy((void*)source_addr, (void*)page_addr, - bvec->bv_len); + bvec.bv_len); } - bytes_done += bvec->bv_len; + bytes_done += bvec.bv_len; } bio_endio(bio, 0); return; diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index d0ab5019d88..76bed1743db 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -130,7 +130,7 @@ static void scm_request_prepare(struct scm_request *scmrq) struct aidaw *aidaw = scmrq->aidaw; struct msb *msb = &scmrq->aob->msb[0]; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; msb->bs = MSB_BS_4K; scmrq->aob->request.msb_count = 1; @@ -142,9 +142,9 @@ static void scm_request_prepare(struct scm_request *scmrq) msb->data_addr = (u64) aidaw; rq_for_each_segment(bv, scmrq->request, iter) { - WARN_ON(bv->bv_offset); - msb->blk_count += bv->bv_len >> 12; - aidaw->data_addr = (u64) page_address(bv->bv_page); + WARN_ON(bv.bv_offset); + msb->blk_count += bv.bv_len >> 12; + aidaw->data_addr = (u64) page_address(bv.bv_page); aidaw++; } } diff --git a/drivers/s390/block/scm_blk_cluster.c b/drivers/s390/block/scm_blk_cluster.c index 27f930cd657..9aae909d47a 100644 --- a/drivers/s390/block/scm_blk_cluster.c +++ b/drivers/s390/block/scm_blk_cluster.c @@ -122,7 +122,7 @@ static void scm_prepare_cluster_request(struct scm_request *scmrq) struct aidaw *aidaw = scmrq->aidaw; struct msb *msb = &scmrq->aob->msb[0]; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; int i = 0; u64 addr; @@ -163,7 +163,7 @@ static void scm_prepare_cluster_request(struct scm_request *scmrq) i++; } rq_for_each_segment(bv, req, iter) { - aidaw->data_addr = (u64) page_address(bv->bv_page); + aidaw->data_addr = (u64) page_address(bv.bv_page); aidaw++; i++; } diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index dd4e73fdb32..3e530f9da8c 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -184,11 +184,11 @@ static unsigned long xpram_highest_page_index(void) static void xpram_make_request(struct request_queue *q, struct bio *bio) { xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; unsigned int index; unsigned long page_addr; unsigned long bytes; - int i; if ((bio->bi_iter.bi_sector & 7) != 0 || (bio->bi_iter.bi_size & 4095) != 0) @@ -200,10 +200,10 @@ static void xpram_make_request(struct request_queue *q, struct bio *bio) if ((bio->bi_iter.bi_sector >> 3) > 0xffffffffU - xdev->offset) goto fail; index = (bio->bi_iter.bi_sector >> 3) + xdev->offset; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment(bvec, bio, iter) { page_addr = (unsigned long) - kmap(bvec->bv_page) + bvec->bv_offset; - bytes = bvec->bv_len; + kmap(bvec.bv_page) + bvec.bv_offset; + bytes = bvec.bv_len; if ((page_addr & 4095) != 0 || (bytes & 4095) != 0) /* More paranoia. 
*/ goto fail; diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c index 9d26637308b..7143e86af32 100644 --- a/drivers/scsi/mpt2sas/mpt2sas_transport.c +++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c @@ -1901,7 +1901,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, struct MPT2SAS_ADAPTER *ioc = shost_priv(shost); Mpi2SmpPassthroughRequest_t *mpi_request; Mpi2SmpPassthroughReply_t *mpi_reply; - int rc, i; + int rc; u16 smid; u32 ioc_state; unsigned long timeleft; @@ -1916,7 +1916,8 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, void *pci_addr_out = NULL; u16 wait_state_count; struct request *rsp = req->next_rq; - struct bio_vec *bvec = NULL; + struct bio_vec bvec; + struct bvec_iter iter; if (!rsp) { printk(MPT2SAS_ERR_FMT "%s: the smp response space is " @@ -1955,11 +1956,11 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, goto out; } - bio_for_each_segment(bvec, req->bio, i) { + bio_for_each_segment(bvec, req->bio, iter) { memcpy(pci_addr_out + offset, - page_address(bvec->bv_page) + bvec->bv_offset, - bvec->bv_len); - offset += bvec->bv_len; + page_address(bvec.bv_page) + bvec.bv_offset, + bvec.bv_len); + offset += bvec.bv_len; } } else { dma_addr_out = pci_map_single(ioc->pdev, bio_data(req->bio), @@ -2106,19 +2107,19 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, u32 offset = 0; u32 bytes_to_copy = le16_to_cpu(mpi_reply->ResponseDataLength); - bio_for_each_segment(bvec, rsp->bio, i) { - if (bytes_to_copy <= bvec->bv_len) { - memcpy(page_address(bvec->bv_page) + - bvec->bv_offset, pci_addr_in + + bio_for_each_segment(bvec, rsp->bio, iter) { + if (bytes_to_copy <= bvec.bv_len) { + memcpy(page_address(bvec.bv_page) + + bvec.bv_offset, pci_addr_in + offset, bytes_to_copy); break; } else { - memcpy(page_address(bvec->bv_page) + - bvec->bv_offset, pci_addr_in + - offset, bvec->bv_len); - bytes_to_copy -= bvec->bv_len; + memcpy(page_address(bvec.bv_page) + + bvec.bv_offset, pci_addr_in + + offset, bvec.bv_len); + bytes_to_copy -= bvec.bv_len; } - offset += bvec->bv_len; + offset += bvec.bv_len; } } } else { diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index e771a88c6a7..196a67f2e95 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -1884,7 +1884,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); Mpi2SmpPassthroughRequest_t *mpi_request; Mpi2SmpPassthroughReply_t *mpi_reply; - int rc, i; + int rc; u16 smid; u32 ioc_state; unsigned long timeleft; @@ -1898,7 +1898,8 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, void *pci_addr_out = NULL; u16 wait_state_count; struct request *rsp = req->next_rq; - struct bio_vec *bvec = NULL; + struct bio_vec bvec; + struct bvec_iter iter; if (!rsp) { pr_err(MPT3SAS_FMT "%s: the smp response space is missing\n", @@ -1938,11 +1939,11 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, goto out; } - bio_for_each_segment(bvec, req->bio, i) { + bio_for_each_segment(bvec, req->bio, iter) { memcpy(pci_addr_out + offset, - page_address(bvec->bv_page) + bvec->bv_offset, - bvec->bv_len); - offset += bvec->bv_len; + page_address(bvec.bv_page) + bvec.bv_offset, + bvec.bv_len); + offset += bvec.bv_len; } } else { dma_addr_out = pci_map_single(ioc->pdev, bio_data(req->bio), @@ -2067,19 +2068,19 @@ 
_transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, u32 offset = 0; u32 bytes_to_copy = le16_to_cpu(mpi_reply->ResponseDataLength); - bio_for_each_segment(bvec, rsp->bio, i) { - if (bytes_to_copy <= bvec->bv_len) { - memcpy(page_address(bvec->bv_page) + - bvec->bv_offset, pci_addr_in + + bio_for_each_segment(bvec, rsp->bio, iter) { + if (bytes_to_copy <= bvec.bv_len) { + memcpy(page_address(bvec.bv_page) + + bvec.bv_offset, pci_addr_in + offset, bytes_to_copy); break; } else { - memcpy(page_address(bvec->bv_page) + - bvec->bv_offset, pci_addr_in + - offset, bvec->bv_len); - bytes_to_copy -= bvec->bv_len; + memcpy(page_address(bvec.bv_page) + + bvec.bv_offset, pci_addr_in + + offset, bvec.bv_len); + bytes_to_copy -= bvec.bv_len; } - offset += bvec->bv_len; + offset += bvec.bv_len; } } } else { diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index 53741be754b..581ff78be1a 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -194,10 +194,10 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) struct cl_object *obj = ll_i2info(inode)->lli_clob; pgoff_t offset; int ret; - int i; int rw; obd_count page_count = 0; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct bio *bio; ssize_t bytes; @@ -221,14 +221,14 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) LASSERT(rw == bio->bi_rw); offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset; - bio_for_each_segment(bvec, bio, i) { - BUG_ON(bvec->bv_offset != 0); - BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE); + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_offset != 0); + BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE); - pages[page_count] = bvec->bv_page; + pages[page_count] = bvec.bv_page; offsets[page_count] = offset; page_count++; - offset += bvec->bv_len; + offset += bvec.bv_len; } LASSERT(page_count <= LLOOP_MAX_SEGMENTS); } diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index e9e6f984092..6f988382b17 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -672,9 +672,10 @@ static ssize_t reset_store(struct device *dev, static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) { - int i, offset; + int offset; u32 index; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; switch (rw) { case READ: @@ -689,33 +690,33 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; - if (bvec->bv_len > max_transfer_size) { + if (bvec.bv_len > max_transfer_size) { /* * zram_bvec_rw() can only make operation on a single * zram page. Split the bio vector. 
*/ struct bio_vec bv; - bv.bv_page = bvec->bv_page; + bv.bv_page = bvec.bv_page; bv.bv_len = max_transfer_size; - bv.bv_offset = bvec->bv_offset; + bv.bv_offset = bvec.bv_offset; if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0) goto out; - bv.bv_len = bvec->bv_len - max_transfer_size; + bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0) goto out; } else - if (zram_bvec_rw(zram, bvec, index, offset, bio, rw) + if (zram_bvec_rw(zram, &bvec, index, offset, bio, rw) < 0) goto out; - update_position(&index, &offset, bvec); + update_position(&index, &offset, &bvec); } set_bit(BIO_UPTODATE, &bio->bi_flags); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 08e3d1388c6..9127db86f31 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -299,25 +299,26 @@ static void bio_integrity_generate(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; sector_t sector = bio->bi_iter.bi_sector; - unsigned int i, sectors, total; + unsigned int sectors, total; void *prot_buf = bio->bi_integrity->bip_buf; total = 0; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; + bio_for_each_segment(bv, bio, iter) { + void *kaddr = kmap_atomic(bv.bv_page); + bix.data_buf = kaddr + bv.bv_offset; + bix.data_size = bv.bv_len; bix.prot_buf = prot_buf; bix.sector = sector; bi->generate_fn(&bix); - sectors = bv->bv_len / bi->sector_size; + sectors = bv.bv_len / bi->sector_size; sector += sectors; prot_buf += sectors * bi->tuple_size; total += sectors * bi->tuple_size; @@ -441,19 +442,20 @@ static int bio_integrity_verify(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; sector_t sector = bio->bi_integrity->bip_sector; - unsigned int i, sectors, total, ret; + unsigned int sectors, total, ret; void *prot_buf = bio->bi_integrity->bip_buf; ret = total = 0; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; + bio_for_each_segment(bv, bio, iter) { + void *kaddr = kmap_atomic(bv.bv_page); + bix.data_buf = kaddr + bv.bv_offset; + bix.data_size = bv.bv_len; bix.prot_buf = prot_buf; bix.sector = sector; @@ -464,7 +466,7 @@ static int bio_integrity_verify(struct bio *bio) return ret; } - sectors = bv->bv_len / bi->sector_size; + sectors = bv.bv_len / bi->sector_size; sector += sectors; prot_buf += sectors * bi->tuple_size; total += sectors * bi->tuple_size; diff --git a/fs/bio.c b/fs/bio.c index 7bb281fc3d5..8b7f14a9550 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -473,13 +473,13 @@ EXPORT_SYMBOL(bio_alloc_bioset); void zero_fill_bio(struct bio *bio) { unsigned long flags; - struct bio_vec *bv; - int i; + struct bio_vec bv; + struct bvec_iter iter; - bio_for_each_segment(bv, bio, i) { - char *data = bvec_kmap_irq(bv, &flags); - memset(data, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); + bio_for_each_segment(bv, bio, iter) { + char *data = bvec_kmap_irq(&bv, &flags); + memset(data, 0, bv.bv_len); + 
flush_dcache_page(bv.bv_page); bvec_kunmap_irq(data, &flags); } } @@ -1687,11 +1687,11 @@ void bio_check_pages_dirty(struct bio *bio) #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE void bio_flush_dcache_pages(struct bio *bi) { - int i; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; - bio_for_each_segment(bvec, bi, i) - flush_dcache_page(bvec->bv_page); + bio_for_each_segment(bvec, bi, iter) + flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL(bio_flush_dcache_pages); #endif @@ -1840,7 +1840,7 @@ void bio_trim(struct bio *bio, int offset, int size) bio->bi_iter.bi_idx = 0; } /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { if (sofar + bvec->bv_len > size) bvec->bv_len = size - sofar; if (bvec->bv_len == 0) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 9f182fcbe71..c16adb5f69f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -63,10 +63,13 @@ */ #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) #define __bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_iter.bi_idx) -#define bio_iovec(bio) (*__bio_iovec(bio)) + +#define bio_iter_iovec(bio, iter) ((bio)->bi_io_vec[(iter).bi_idx]) #define bio_page(bio) (bio_iovec((bio)).bv_page) #define bio_offset(bio) (bio_iovec((bio)).bv_offset) +#define bio_iovec(bio) (*__bio_iovec(bio)) + #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_iter.bi_idx) #define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9) #define bio_end_sector(bio) ((bio)->bi_iter.bi_sector + bio_sectors((bio))) @@ -133,15 +136,6 @@ static inline void *bio_data(struct bio *bio) #define bio_io_error(bio) bio_endio((bio), -EIO) -/* - * drivers should not use the __ version unless they _really_ know what - * they're doing - */ -#define __bio_for_each_segment(bvl, bio, i, start_idx) \ - for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ - i < (bio)->bi_vcnt; \ - bvl++, i++) - /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it @@ -151,10 +145,16 @@ static inline void *bio_data(struct bio *bio) bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ i++) -#define bio_for_each_segment(bvl, bio, i) \ - for (i = (bio)->bi_iter.bi_idx; \ - bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ - i++) +#define __bio_for_each_segment(bvl, bio, iter, start) \ + for (iter = (start); \ + bvl = bio_iter_iovec((bio), (iter)), \ + (iter).bi_idx < (bio)->bi_vcnt; \ + (iter).bi_idx++) + +#define bio_for_each_segment(bvl, bio, iter) \ + __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) + +#define bio_iter_last(bio, iter) ((iter).bi_idx == (bio)->bi_vcnt - 1) /* * get a reference to a bio, so it won't disappear. 
the intended use is diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1b135d49b27..337b92a5465 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -735,7 +735,7 @@ struct rq_map_data { }; struct req_iterator { - int i; + struct bvec_iter iter; struct bio *bio; }; @@ -748,10 +748,11 @@ struct req_iterator { #define rq_for_each_segment(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_segment(bvl, _iter.bio, _iter.i) + bio_for_each_segment(bvl, _iter.bio, _iter.iter) #define rq_iter_last(rq, _iter) \ - (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) + (_iter.bio->bi_next == NULL && \ + bio_iter_last(_iter.bio, _iter.iter)) #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" diff --git a/mm/bounce.c b/mm/bounce.c index 5a7d58fb883..523918b8c6d 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -98,27 +98,24 @@ int init_emergency_isa_pool(void) static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { unsigned char *vfrom; - struct bio_vec *tovec, *fromvec; - int i; - - bio_for_each_segment(tovec, to, i) { - fromvec = from->bi_io_vec + i; - - /* - * not bounced - */ - if (tovec->bv_page == fromvec->bv_page) - continue; - - /* - * fromvec->bv_offset and fromvec->bv_len might have been - * modified by the block layer, so use the original copy, - * bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; + struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bvec_iter iter; + + bio_for_each_segment(tovec, to, iter) { + if (tovec.bv_page != fromvec->bv_page) { + /* + * fromvec->bv_offset and fromvec->bv_len might have + * been modified by the block layer, so use the original + * copy, bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + + tovec.bv_offset; + + bounce_copy_vec(&tovec, vfrom); + flush_dcache_page(tovec.bv_page); + } - bounce_copy_vec(tovec, vfrom); - flush_dcache_page(tovec->bv_page); + fromvec++; } } @@ -201,13 +198,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, { struct bio *bio; int rw = bio_data_dir(*bio_orig); - struct bio_vec *to, *from; + struct bio_vec *to, from; + struct bvec_iter iter; unsigned i; if (force) goto bounce; - bio_for_each_segment(from, *bio_orig, i) - if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) + bio_for_each_segment(from, *bio_orig, iter) + if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) goto bounce; return; -- cgit v1.2.3-70-g09d2 From 4550dd6c6b062fc5e5b647296d55da22616123c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2013 14:26:21 -0700 Subject: block: Immutable bio vecs This adds a mechanism by which we can advance a bio by an arbitrary number of bytes without modifying the biovec: bio->bi_iter.bi_bvec_done indicates the number of bytes completed in the current bvec. Various driver code still needs to be updated to not refer to the bvec directly before we can use this for interesting things, like efficient bio splitting. 
Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Lars Ellenberg Cc: Paul Clements Cc: drbd-user@lists.linbit.com Cc: nbd-general@lists.sourceforge.net --- Documentation/block/biovecs.txt | 111 ++++++++++++++++++++++++++++++++++++++++ drivers/block/drbd/drbd_main.c | 4 +- drivers/block/nbd.c | 2 +- fs/bio.c | 27 +--------- include/linux/bio.h | 81 ++++++++++++++++++++++++++--- include/linux/blk_types.h | 3 ++ include/linux/blkdev.h | 4 +- 7 files changed, 194 insertions(+), 38 deletions(-) create mode 100644 Documentation/block/biovecs.txt (limited to 'fs') diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt new file mode 100644 index 00000000000..74a32ad52f5 --- /dev/null +++ b/Documentation/block/biovecs.txt @@ -0,0 +1,111 @@ + +Immutable biovecs and biovec iterators: +======================================= + +Kent Overstreet + +As of 3.13, biovecs should never be modified after a bio has been submitted. +Instead, we have a new struct bvec_iter which represents a range of a biovec - +the iterator will be modified as the bio is completed, not the biovec. + +More specifically, old code that needed to partially complete a bio would +update bi_sector and bi_size, and advance bi_idx to the next biovec. If it +ended up partway through a biovec, it would increment bv_offset and decrement +bv_len by the number of bytes completed in that biovec. + +In the new scheme of things, everything that must be mutated in order to +partially complete a bio is segregated into struct bvec_iter: bi_sector, +bi_size and bi_idx have been moved there; and instead of modifying bv_offset +and bv_len, struct bvec_iter has bi_bvec_done, which represents the number of +bytes completed in the current bvec. + +There are a bunch of new helper macros for hiding the gory details - in +particular, presenting the illusion of partially completed biovecs so that +normal code doesn't have to deal with bi_bvec_done. + + * Driver code should no longer refer to biovecs directly; we now have + bio_iovec() and bio_iovec_iter() macros that return literal struct biovecs, + constructed from the raw biovecs but taking into account bi_bvec_done and + bi_size. + + bio_for_each_segment() has been updated to take a bvec_iter argument + instead of an integer (that corresponded to bi_idx); for a lot of code the + conversion just required changing the types of the arguments to + bio_for_each_segment(). + + * Advancing a bvec_iter is done with bio_advance_iter(); bio_advance() is a + wrapper around bio_advance_iter() that operates on bio->bi_iter, and also + advances the bio integrity's iter if present. + + There is a lower level advance function - bvec_iter_advance() - which takes + a pointer to a biovec, not a bio; this is used by the bio integrity code. + +What's all this get us? +======================= + +Having a real iterator, and making biovecs immutable, has a number of +advantages: + + * Before, iterating over bios was very awkward when you weren't processing + exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c, + which copies the contents of one bio into another. Because the biovecs + wouldn't necessarily be the same size, the old code was tricky convoluted - + it had to walk two different bios at the same time, keeping both bi_idx and + and offset into the current biovec for each. + + The new code is much more straightforward - have a look. 
This sort of + pattern comes up in a lot of places; a lot of drivers were essentially open + coding bvec iterators before, and having common implementation considerably + simplifies a lot of code. + + * Before, any code that might need to use the biovec after the bio had been + completed (perhaps to copy the data somewhere else, or perhaps to resubmit + it somewhere else if there was an error) had to save the entire bvec array + - again, this was being done in a fair number of places. + + * Biovecs can be shared between multiple bios - a bvec iter can represent an + arbitrary range of an existing biovec, both starting and ending midway + through biovecs. This is what enables efficient splitting of arbitrary + bios. Note that this means we _only_ use bi_size to determine when we've + reached the end of a bio, not bi_vcnt - and the bio_iovec() macro takes + bi_size into account when constructing biovecs. + + * Splitting bios is now much simpler. The old bio_split() didn't even work on + bios with more than a single bvec! Now, we can efficiently split arbitrary + size bios - because the new bio can share the old bio's biovec. + + Care must be taken to ensure the biovec isn't freed while the split bio is + still using it, in case the original bio completes first, though. Using + bio_chain() when splitting bios helps with this. + + * Submitting partially completed bios is now perfectly fine - this comes up + occasionally in stacking block drivers and various code (e.g. md and + bcache) had some ugly workarounds for this. + + It used to be the case that submitting a partially completed bio would work + fine to _most_ devices, but since accessing the raw bvec array was the + norm, not all drivers would respect bi_idx and those would break. Now, + since all drivers _must_ go through the bvec iterator - and have been + audited to make sure they are - submitting partially completed bios is + perfectly fine. + +Other implications: +=================== + + * Almost all usage of bi_idx is now incorrect and has been removed; instead, + where previously you would have used bi_idx you'd now use a bvec_iter, + probably passing it to one of the helper macros. + + I.e. instead of using bio_iovec_idx() (or bio->bi_iovec[bio->bi_idx]), you + now use bio_iter_iovec(), which takes a bvec_iter and returns a + literal struct bio_vec - constructed on the fly from the raw biovec but + taking into account bi_bvec_done (and bi_size). + + * bi_vcnt can't be trusted or relied upon by driver code - i.e. anything that + doesn't actually own the bio. The reason is twofold: firstly, it's not + actually needed for iterating over the bio anymore - we only use bi_size. + Secondly, when cloning a bio and reusing (a portion of) the original bio's + biovec, in order to calculate bi_vcnt for the new bio we'd have to iterate + over all the biovecs in the new bio - which is silly as it's not needed. + + So, don't use bi_vcnt anymore. diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f4e5440aba0..929468e1512 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1546,7 +1546,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) err = _drbd_no_send_page(mdev, bvec.bv_page, bvec.bv_offset, bvec.bv_len, - bio_iter_last(bio, iter) + bio_iter_last(bvec, iter) ? 
0 : MSG_MORE); if (err) return err; @@ -1565,7 +1565,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) err = _drbd_send_page(mdev, bvec.bv_page, bvec.bv_offset, bvec.bv_len, - bio_iter_last(bio, iter) ? 0 : MSG_MORE); + bio_iter_last(bvec, iter) ? 0 : MSG_MORE); if (err) return err; } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index aa362f49321..55298db36b2 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -278,7 +278,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) */ rq_for_each_segment(bvec, req, iter) { flags = 0; - if (!rq_iter_last(req, iter)) + if (!rq_iter_last(bvec, iter)) flags = MSG_MORE; dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n", nbd->disk->disk_name, req, bvec.bv_len); diff --git a/fs/bio.c b/fs/bio.c index 8b7f14a9550..07b4b7afa69 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -532,13 +532,11 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_bdev = bio_src->bi_bdev; bio->bi_flags |= 1 << BIO_CLONED; bio->bi_rw = bio_src->bi_rw; bio->bi_vcnt = bio_src->bi_vcnt; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - bio->bi_iter.bi_idx = bio_src->bi_iter.bi_idx; + bio->bi_iter = bio_src->bi_iter; } EXPORT_SYMBOL(__bio_clone); @@ -808,28 +806,7 @@ void bio_advance(struct bio *bio, unsigned bytes) if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); - bio->bi_iter.bi_sector += bytes >> 9; - bio->bi_iter.bi_size -= bytes; - - if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) - return; - - while (bytes) { - if (unlikely(bio->bi_iter.bi_idx >= bio->bi_vcnt)) { - WARN_ONCE(1, "bio idx %d >= vcnt %d\n", - bio->bi_iter.bi_idx, bio->bi_vcnt); - break; - } - - if (bytes >= bio_iovec(bio).bv_len) { - bytes -= bio_iovec(bio).bv_len; - bio->bi_iter.bi_idx++; - } else { - bio_iovec(bio).bv_len -= bytes; - bio_iovec(bio).bv_offset += bytes; - bytes = 0; - } - } + bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(bio_advance); diff --git a/include/linux/bio.h b/include/linux/bio.h index c16adb5f69f..04e592e74c9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -64,11 +64,38 @@ #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) #define __bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_iter.bi_idx) -#define bio_iter_iovec(bio, iter) ((bio)->bi_io_vec[(iter).bi_idx]) +#define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx]) -#define bio_page(bio) (bio_iovec((bio)).bv_page) -#define bio_offset(bio) (bio_iovec((bio)).bv_offset) -#define bio_iovec(bio) (*__bio_iovec(bio)) +#define bvec_iter_page(bvec, iter) \ + (__bvec_iter_bvec((bvec), (iter))->bv_page) + +#define bvec_iter_len(bvec, iter) \ + min((iter).bi_size, \ + __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done) + +#define bvec_iter_offset(bvec, iter) \ + (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done) + +#define bvec_iter_bvec(bvec, iter) \ +((struct bio_vec) { \ + .bv_page = bvec_iter_page((bvec), (iter)), \ + .bv_len = bvec_iter_len((bvec), (iter)), \ + .bv_offset = bvec_iter_offset((bvec), (iter)), \ +}) + +#define bio_iter_iovec(bio, iter) \ + bvec_iter_bvec((bio)->bi_io_vec, (iter)) + +#define bio_iter_page(bio, iter) \ + bvec_iter_page((bio)->bi_io_vec, (iter)) +#define bio_iter_len(bio, iter) \ + bvec_iter_len((bio)->bi_io_vec, (iter)) +#define bio_iter_offset(bio, iter) \ + 
bvec_iter_offset((bio)->bi_io_vec, (iter)) + +#define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter) +#define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter) +#define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter) #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_iter.bi_idx) #define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9) @@ -145,16 +172,54 @@ static inline void *bio_data(struct bio *bio) bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ i++) +static inline void bvec_iter_advance(struct bio_vec *bv, struct bvec_iter *iter, + unsigned bytes) +{ + WARN_ONCE(bytes > iter->bi_size, + "Attempted to advance past end of bvec iter\n"); + + while (bytes) { + unsigned len = min(bytes, bvec_iter_len(bv, *iter)); + + bytes -= len; + iter->bi_size -= len; + iter->bi_bvec_done += len; + + if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) { + iter->bi_bvec_done = 0; + iter->bi_idx++; + } + } +} + +#define for_each_bvec(bvl, bio_vec, iter, start) \ + for ((iter) = start; \ + (bvl) = bvec_iter_bvec((bio_vec), (iter)), \ + (iter).bi_size; \ + bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) + + +static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, + unsigned bytes) +{ + iter->bi_sector += bytes >> 9; + + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) + iter->bi_size -= bytes; + else + bvec_iter_advance(bio->bi_io_vec, iter, bytes); +} + #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ - bvl = bio_iter_iovec((bio), (iter)), \ - (iter).bi_idx < (bio)->bi_vcnt; \ - (iter).bi_idx++) + (iter).bi_size && \ + ((bvl = bio_iter_iovec((bio), (iter))), 1); \ + bio_advance_iter((bio), &(iter), (bvl).bv_len)) #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) -#define bio_iter_last(bio, iter) ((iter).bi_idx == (bio)->bi_vcnt - 1) +#define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) /* * get a reference to a bio, so it won't disappear. the intended use is diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 29b5b84d8a2..d369f8f6af7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -34,6 +34,9 @@ struct bvec_iter { unsigned int bi_size; /* residual I/O count */ unsigned int bi_idx; /* current index into bvl_vec */ + + unsigned int bi_bvec_done; /* number of bytes completed in + current bvec */ }; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 337b92a5465..02cb6f0ea71 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -750,9 +750,9 @@ struct req_iterator { __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_segment(bvl, _iter.bio, _iter.iter) -#define rq_iter_last(rq, _iter) \ +#define rq_iter_last(bvec, _iter) \ (_iter.bio->bi_next == NULL && \ - bio_iter_last(_iter.bio, _iter.iter)) + bio_iter_last(bvec, _iter.iter)) #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -- cgit v1.2.3-70-g09d2 From 1cb9dda4f4332aa560a2db39f92a96e1a8273cf8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2013 14:26:39 -0700 Subject: block: Convert bio_copy_data() to bvec_iter Our fancy new bvec iterator makes code like this much easier to write. 
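For illustration only (this sketch is not part of the patch, and the function name is made up): copying between two bios reduces to advancing a pair of bvec_iters in lockstep, using the bio_iter_iovec() and bio_advance_iter() helpers added earlier in this series. Simplified here to the case of two single, unchained bios:

static void copy_bio_data_sketch(struct bio *dst, struct bio *src)
{
	struct bvec_iter src_iter = src->bi_iter;
	struct bvec_iter dst_iter = dst->bi_iter;

	while (src_iter.bi_size && dst_iter.bi_size) {
		/* Materialize the current (possibly partial) bvec of each bio. */
		struct bio_vec src_bv = bio_iter_iovec(src, src_iter);
		struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter);
		unsigned bytes = min(src_bv.bv_len, dst_bv.bv_len);
		void *src_p = kmap_atomic(src_bv.bv_page);
		void *dst_p = kmap_atomic(dst_bv.bv_page);

		memcpy(dst_p + dst_bv.bv_offset, src_p + src_bv.bv_offset, bytes);

		kunmap_atomic(dst_p);
		kunmap_atomic(src_p);

		/* Advance only the local iterators; the bios are left untouched. */
		bio_advance_iter(src, &src_iter, bytes);
		bio_advance_iter(dst, &dst_iter, bytes);
	}
}

The real bio_copy_data() below does the same thing, and additionally follows ->bi_next when either bio is part of a chain.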
Signed-off-by: Kent Overstreet Cc: Jens Axboe --- fs/bio.c | 60 +++++++++++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 07b4b7afa69..f61e59b3881 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -852,58 +852,48 @@ EXPORT_SYMBOL(bio_alloc_pages); */ void bio_copy_data(struct bio *dst, struct bio *src) { - struct bio_vec *src_bv, *dst_bv; - unsigned src_offset, dst_offset, bytes; + struct bvec_iter src_iter, dst_iter; + struct bio_vec src_bv, dst_bv; void *src_p, *dst_p; + unsigned bytes; - src_bv = __bio_iovec(src); - dst_bv = __bio_iovec(dst); - - src_offset = src_bv->bv_offset; - dst_offset = dst_bv->bv_offset; + src_iter = src->bi_iter; + dst_iter = dst->bi_iter; while (1) { - if (src_offset == src_bv->bv_offset + src_bv->bv_len) { - src_bv++; - if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { - src = src->bi_next; - if (!src) - break; - - src_bv = __bio_iovec(src); - } + if (!src_iter.bi_size) { + src = src->bi_next; + if (!src) + break; - src_offset = src_bv->bv_offset; + src_iter = src->bi_iter; } - if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { - dst_bv++; - if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { - dst = dst->bi_next; - if (!dst) - break; - - dst_bv = __bio_iovec(dst); - } + if (!dst_iter.bi_size) { + dst = dst->bi_next; + if (!dst) + break; - dst_offset = dst_bv->bv_offset; + dst_iter = dst->bi_iter; } - bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, - src_bv->bv_offset + src_bv->bv_len - src_offset); + src_bv = bio_iter_iovec(src, src_iter); + dst_bv = bio_iter_iovec(dst, dst_iter); + + bytes = min(src_bv.bv_len, dst_bv.bv_len); - src_p = kmap_atomic(src_bv->bv_page); - dst_p = kmap_atomic(dst_bv->bv_page); + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); - memcpy(dst_p + dst_offset, - src_p + src_offset, + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, bytes); kunmap_atomic(dst_p); kunmap_atomic(src_p); - src_offset += bytes; - dst_offset += bytes; + bio_advance_iter(src, &src_iter, bytes); + bio_advance_iter(dst, &dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data); -- cgit v1.2.3-70-g09d2 From d57a5f7c6605f15f3b5134837e68b448a7cea88e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2013 17:20:16 -0800 Subject: bio-integrity: Convert to bvec_iter The bio integrity is also stored in a bvec array, so if we use the bvec iter code we just added, the integrity code won't need to implement its own iteration stuff (bio_integrity_mark_head(), bio_integrity_mark_tail()) Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: "Martin K. Petersen" Cc: "James E.J. 
Bottomley" --- block/blk-integrity.c | 40 ++++++++++--------- drivers/scsi/sd_dif.c | 30 +++++++------- fs/bio-integrity.c | 108 ++++++++++++-------------------------------------- include/linux/bio.h | 19 ++++----- 4 files changed, 71 insertions(+), 126 deletions(-) (limited to 'fs') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 03cf7179e8e..7fbab84399e 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -43,30 +43,32 @@ static const char *bi_unsupported_name = "unsupported"; */ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { - struct bio_vec *iv, *ivprv = NULL; + struct bio_vec iv, ivprv = { NULL }; unsigned int segments = 0; unsigned int seg_size = 0; - unsigned int i = 0; + struct bvec_iter iter; + int prev = 0; - bio_for_each_integrity_vec(iv, bio, i) { + bio_for_each_integrity_vec(iv, bio, iter) { - if (ivprv) { - if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + if (prev) { + if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) goto new_segment; - if (seg_size + iv->bv_len > queue_max_segment_size(q)) + if (seg_size + iv.bv_len > queue_max_segment_size(q)) goto new_segment; - seg_size += iv->bv_len; + seg_size += iv.bv_len; } else { new_segment: segments++; - seg_size = iv->bv_len; + seg_size = iv.bv_len; } + prev = 1; ivprv = iv; } @@ -87,24 +89,25 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist) { - struct bio_vec *iv, *ivprv = NULL; + struct bio_vec iv, ivprv = { NULL }; struct scatterlist *sg = NULL; unsigned int segments = 0; - unsigned int i = 0; + struct bvec_iter iter; + int prev = 0; - bio_for_each_integrity_vec(iv, bio, i) { + bio_for_each_integrity_vec(iv, bio, iter) { - if (ivprv) { - if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + if (prev) { + if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) goto new_segment; - if (sg->length + iv->bv_len > queue_max_segment_size(q)) + if (sg->length + iv.bv_len > queue_max_segment_size(q)) goto new_segment; - sg->length += iv->bv_len; + sg->length += iv.bv_len; } else { new_segment: if (!sg) @@ -114,10 +117,11 @@ new_segment: sg = sg_next(sg); } - sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset); + sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset); segments++; } + prev = 1; ivprv = iv; } diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 6174ca4ea27..a7a691d0af7 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -365,7 +365,6 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, struct bio *bio; struct scsi_disk *sdkp; struct sd_dif_tuple *sdt; - unsigned int i, j; u32 phys, virt; sdkp = rq->bio->bi_bdev->bd_disk->private_data; @@ -376,19 +375,21 @@ void sd_dif_prepare(struct request *rq, sector_t hw_sector, phys = hw_sector & 0xffffffff; __rq_for_each_bio(bio, rq) { - struct bio_vec *iv; + struct bio_vec iv; + struct bvec_iter iter; + unsigned int j; /* Already remapped? 
*/ if (bio_flagged(bio, BIO_MAPPED_INTEGRITY)) break; - virt = bio->bi_integrity->bip_sector & 0xffffffff; + virt = bio->bi_integrity->bip_iter.bi_sector & 0xffffffff; - bip_for_each_vec(iv, bio->bi_integrity, i) { - sdt = kmap_atomic(iv->bv_page) - + iv->bv_offset; + bip_for_each_vec(iv, bio->bi_integrity, iter) { + sdt = kmap_atomic(iv.bv_page) + + iv.bv_offset; - for (j = 0 ; j < iv->bv_len ; j += tuple_sz, sdt++) { + for (j = 0; j < iv.bv_len; j += tuple_sz, sdt++) { if (be32_to_cpu(sdt->ref_tag) == virt) sdt->ref_tag = cpu_to_be32(phys); @@ -414,7 +415,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) struct scsi_disk *sdkp; struct bio *bio; struct sd_dif_tuple *sdt; - unsigned int i, j, sectors, sector_sz; + unsigned int j, sectors, sector_sz; u32 phys, virt; sdkp = scsi_disk(scmd->request->rq_disk); @@ -430,15 +431,16 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) phys >>= 3; __rq_for_each_bio(bio, scmd->request) { - struct bio_vec *iv; + struct bio_vec iv; + struct bvec_iter iter; - virt = bio->bi_integrity->bip_sector & 0xffffffff; + virt = bio->bi_integrity->bip_iter.bi_sector & 0xffffffff; - bip_for_each_vec(iv, bio->bi_integrity, i) { - sdt = kmap_atomic(iv->bv_page) - + iv->bv_offset; + bip_for_each_vec(iv, bio->bi_integrity, iter) { + sdt = kmap_atomic(iv.bv_page) + + iv.bv_offset; - for (j = 0 ; j < iv->bv_len ; j += tuple_sz, sdt++) { + for (j = 0; j < iv.bv_len; j += tuple_sz, sdt++) { if (sectors == 0) { kunmap_atomic(sdt); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 9127db86f31..fed744b8c9e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -134,8 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } - iv = bip_vec_idx(bip, bip->bip_vcnt); - BUG_ON(iv == NULL); + iv = bip->bip_vec + bip->bip_vcnt; iv->bv_page = page; iv->bv_len = len; @@ -203,6 +202,12 @@ static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, return sectors; } +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size; +} + /** * bio_integrity_tag_size - Retrieve integrity tag space * @bio: bio to inspect @@ -235,9 +240,9 @@ int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); - if (nr_sectors * bi->tuple_size > bip->bip_size) { - printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", - __func__, nr_sectors * bi->tuple_size, bip->bip_size); + if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, + nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); return -1; } @@ -322,7 +327,7 @@ static void bio_integrity_generate(struct bio *bio) sector += sectors; prot_buf += sectors * bi->tuple_size; total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_size); + BUG_ON(total > bio->bi_integrity->bip_iter.bi_size); kunmap_atomic(kaddr); } @@ -387,8 +392,8 @@ int bio_integrity_prep(struct bio *bio) bip->bip_owns_buf = 1; bip->bip_buf = buf; - bip->bip_size = len; - bip->bip_sector = bio->bi_iter.bi_sector; + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; /* Map it */ offset = offset_in_page(buf); @@ -444,7 +449,7 @@ static int bio_integrity_verify(struct bio *bio) struct blk_integrity_exchg bix; struct bio_vec bv; struct bvec_iter iter; - sector_t sector = 
bio->bi_integrity->bip_sector; + sector_t sector = bio->bi_integrity->bip_iter.bi_sector; unsigned int sectors, total, ret; void *prot_buf = bio->bi_integrity->bip_buf; @@ -470,7 +475,7 @@ static int bio_integrity_verify(struct bio *bio) sector += sectors; prot_buf += sectors * bi->tuple_size; total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_size); + BUG_ON(total > bio->bi_integrity->bip_iter.bi_size); kunmap_atomic(kaddr); } @@ -534,56 +539,6 @@ void bio_integrity_endio(struct bio *bio, int error) } EXPORT_SYMBOL(bio_integrity_endio); -/** - * bio_integrity_mark_head - Advance bip_vec skip bytes - * @bip: Integrity vector to advance - * @skip: Number of bytes to advance it - */ -void bio_integrity_mark_head(struct bio_integrity_payload *bip, - unsigned int skip) -{ - struct bio_vec *iv; - unsigned int i; - - bip_for_each_vec(iv, bip, i) { - if (skip == 0) { - bip->bip_idx = i; - return; - } else if (skip >= iv->bv_len) { - skip -= iv->bv_len; - } else { /* skip < iv->bv_len) */ - iv->bv_offset += skip; - iv->bv_len -= skip; - bip->bip_idx = i; - return; - } - } -} - -/** - * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long - * @bip: Integrity vector to truncate - * @len: New length of integrity vector - */ -void bio_integrity_mark_tail(struct bio_integrity_payload *bip, - unsigned int len) -{ - struct bio_vec *iv; - unsigned int i; - - bip_for_each_vec(iv, bip, i) { - if (len == 0) { - bip->bip_vcnt = i; - return; - } else if (len >= iv->bv_len) { - len -= iv->bv_len; - } else { /* len < iv->bv_len) */ - iv->bv_len = len; - len = 0; - } - } -} - /** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update @@ -597,13 +552,9 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; + unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - BUG_ON(bip == NULL); - BUG_ON(bi == NULL); - - nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); - bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } EXPORT_SYMBOL(bio_integrity_advance); @@ -623,16 +574,9 @@ void bio_integrity_trim(struct bio *bio, unsigned int offset, { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - BUG_ON(bip == NULL); - BUG_ON(bi == NULL); - BUG_ON(!bio_flagged(bio, BIO_CLONED)); - - nr_sectors = bio_integrity_hw_sectors(bi, sectors); - bip->bip_sector = bip->bip_sector + offset; - bio_integrity_mark_head(bip, offset * bi->tuple_size); - bio_integrity_mark_tail(bip, sectors * bi->tuple_size); + bio_integrity_advance(bio, offset << 9); + bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); } EXPORT_SYMBOL(bio_integrity_trim); @@ -662,8 +606,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->bio1.bi_integrity = &bp->bip1; bp->bio2.bi_integrity = &bp->bip2; - bp->iv1 = bip->bip_vec[bip->bip_idx]; - bp->iv2 = bip->bip_vec[bip->bip_idx]; + bp->iv1 = bip->bip_vec[bip->bip_iter.bi_idx]; + bp->iv2 = bip->bip_vec[bip->bip_iter.bi_idx]; bp->bip1.bip_vec = &bp->iv1; bp->bip2.bip_vec = &bp->iv2; @@ -672,11 +616,12 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->iv2.bv_offset += sectors * bi->tuple_size; bp->iv2.bv_len -= sectors * bi->tuple_size; - 
bp->bip1.bip_sector = bio->bi_integrity->bip_sector; - bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + bp->bip1.bip_iter.bi_sector = bio->bi_integrity->bip_iter.bi_sector; + bp->bip2.bip_iter.bi_sector = + bio->bi_integrity->bip_iter.bi_sector + nr_sectors; bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; - bp->bip1.bip_idx = bp->bip2.bip_idx = 0; + bp->bip1.bip_iter.bi_idx = bp->bip2.bip_iter.bi_idx = 0; } EXPORT_SYMBOL(bio_integrity_split); @@ -704,9 +649,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); - bip->bip_sector = bip_src->bip_sector; bip->bip_vcnt = bip_src->bip_vcnt; - bip->bip_idx = bip_src->bip_idx; + bip->bip_iter = bip_src->bip_iter; return 0; } diff --git a/include/linux/bio.h b/include/linux/bio.h index 04e592e74c9..930cb73c894 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -244,16 +244,15 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, struct bio_integrity_payload { struct bio *bip_bio; /* parent bio */ - sector_t bip_sector; /* virtual start sector */ + struct bvec_iter bip_iter; + /* kill - should just use bip_vec */ void *bip_buf; /* generated integrity data */ - bio_end_io_t *bip_end_io; /* saved I/O completion fn */ - unsigned int bip_size; + bio_end_io_t *bip_end_io; /* saved I/O completion fn */ unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ - unsigned short bip_idx; /* current bip_vec index */ unsigned bip_owns_buf:1; /* should free bip_buf */ struct work_struct bip_work; /* I/O completion */ @@ -626,16 +625,12 @@ struct biovec_slab { #if defined(CONFIG_BLK_DEV_INTEGRITY) -#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) -#define bip_vec(bip) bip_vec_idx(bip, 0) -#define __bip_for_each_vec(bvl, bip, i, start_idx) \ - for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ - i < (bip)->bip_vcnt; \ - bvl++, i++) -#define bip_for_each_vec(bvl, bip, i) \ - __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) +#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) + +#define bip_for_each_vec(bvl, bip, iter) \ + for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) #define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ for_each_bio(_bio) \ -- cgit v1.2.3-70-g09d2 From 458b76ed2f9517becb74dcc8eedd70d3068ea6e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Sep 2013 16:26:05 -0700 Subject: block: Kill bio_segments()/bi_vcnt usage When we start sharing biovecs, keeping bi_vcnt accurate for splits is going to be error prone - and unnecessary, if we refactor some code. So bio_segments() has to go - but most of the existing users just needed to know if the bio had multiple segments, which is easier - add a bio_multiple_segments() for them. (Two of the current uses of bio_segments() are going to go away in a couple patches, but the current implementation of bio_segments() is unsafe as soon as we start doing driver conversions for immutable biovecs - so implement a dumb version for bisectability, it'll go away in a couple patches) Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Neil Brown Cc: Nagalakshmi Nandigama Cc: Sreekanth Reddy Cc: "James E.J. 
Bottomley" --- drivers/block/ps3disk.c | 7 ++- drivers/md/bcache/io.c | 53 +++++++++------------ drivers/md/raid0.c | 2 +- drivers/md/raid10.c | 2 +- drivers/message/fusion/mptsas.c | 8 ++-- drivers/scsi/libsas/sas_expander.c | 8 ++-- drivers/scsi/mpt2sas/mpt2sas_transport.c | 10 ++-- drivers/scsi/mpt3sas/mpt3sas_transport.c | 8 ++-- fs/bio.c | 2 +- include/linux/bio.h | 81 +++++++++++++++++++------------- 10 files changed, 94 insertions(+), 87 deletions(-) (limited to 'fs') diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 1c6edb9a996..c120d70d3fb 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -101,10 +101,9 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, rq_for_each_segment(bvec, req, iter) { unsigned long flags; - dev_dbg(&dev->sbd.core, - "%s:%u: bio %u: %u segs %u sectors from %lu\n", - __func__, __LINE__, i, bio_segments(iter.bio), - bio_sectors(iter.bio), iter.bio->bi_iter.bi_sector); + dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %lu\n", + __func__, __LINE__, i, bio_sectors(iter.bio), + iter.bio->bi_iter.bi_sector); size = bvec.bv_len; buf = bvec_kmap_irq(&bvec, &flags); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 9b5b6a41a9b..6e04f3bb028 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -24,7 +24,8 @@ static void bch_generic_make_request_hack(struct bio *bio) if (bio->bi_iter.bi_idx) { struct bio_vec bv; struct bvec_iter iter; - struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); + unsigned segs = bio_segments(bio); + struct bio *clone = bio_alloc(GFP_NOIO, segs); bio_for_each_segment(bv, bio, iter) clone->bi_io_vec[clone->bi_vcnt++] = bv; @@ -32,7 +33,7 @@ static void bch_generic_make_request_hack(struct bio *bio) clone->bi_iter.bi_sector = bio->bi_iter.bi_sector; clone->bi_bdev = bio->bi_bdev; clone->bi_rw = bio->bi_rw; - clone->bi_vcnt = bio_segments(bio); + clone->bi_vcnt = segs; clone->bi_iter.bi_size = bio->bi_iter.bi_size; clone->bi_private = bio; @@ -133,40 +134,32 @@ out: static unsigned bch_bio_max_sectors(struct bio *bio) { - unsigned ret = bio_sectors(bio); struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, - queue_max_segments(q)); + struct bio_vec bv; + struct bvec_iter iter; + unsigned ret = 0, seg = 0; if (bio->bi_rw & REQ_DISCARD) - return min(ret, q->limits.max_discard_sectors); - - if (bio_segments(bio) > max_segments || - q->merge_bvec_fn) { - struct bio_vec bv; - struct bvec_iter iter; - unsigned seg = 0; - - ret = 0; + return min(bio_sectors(bio), q->limits.max_discard_sectors); - bio_for_each_segment(bv, bio, iter) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = ret << 9, - .bi_rw = bio->bi_rw, - }; - - if (seg == max_segments) - break; + bio_for_each_segment(bv, bio, iter) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = ret << 9, + .bi_rw = bio->bi_rw, + }; + + if (seg == min_t(unsigned, BIO_MAX_PAGES, + queue_max_segments(q))) + break; - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len) - break; + if (q->merge_bvec_fn && + q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len) + break; - seg++; - ret += bv.bv_len >> 9; - } + seg++; + ret += bv.bv_len >> 9; } ret = min(ret, queue_max_sectors(q)); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e38d1d3226f..8ee1a6c658b 100644 --- a/drivers/md/raid0.c +++ 
b/drivers/md/raid0.c @@ -528,7 +528,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio_segments(bio) > 1) + if (bio_multiple_segments(bio)) goto bad_map; /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index dbf3b63c275..ac4bfa438c5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1188,7 +1188,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio_segments(bio) > 1) + if (bio_multiple_segments(bio)) goto bad_map; /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index dd239bdbfcb..00d339c361f 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -2235,10 +2235,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, } /* do we need to support multiple segments? */ - if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) { - printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n", - ioc->name, __func__, bio_segments(req->bio), blk_rq_bytes(req), - bio_segments(rsp->bio), blk_rq_bytes(rsp)); + if (bio_multiple_segments(req->bio) || + bio_multiple_segments(rsp->bio)) { + printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u, rsp %u\n", + ioc->name, __func__, blk_rq_bytes(req), blk_rq_bytes(rsp)); return -EINVAL; } diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 446b85110a1..0cac7d8fd0f 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -2163,10 +2163,10 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, } /* do we need to support multiple segments? 
*/ - if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) { - printk("%s: multiple segments req %u %u, rsp %u %u\n", - __func__, bio_segments(req->bio), blk_rq_bytes(req), - bio_segments(rsp->bio), blk_rq_bytes(rsp)); + if (bio_multiple_segments(req->bio) || + bio_multiple_segments(rsp->bio)) { + printk("%s: multiple segments req %u, rsp %u\n", + __func__, blk_rq_bytes(req), blk_rq_bytes(rsp)); return -EINVAL; } diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c index 7143e86af32..410f4a3e888 100644 --- a/drivers/scsi/mpt2sas/mpt2sas_transport.c +++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c @@ -1943,7 +1943,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, ioc->transport_cmds.status = MPT2_CMD_PENDING; /* Check if the request is split across multiple segments */ - if (bio_segments(req->bio) > 1) { + if (bio_multiple_segments(req->bio)) { u32 offset = 0; /* Allocate memory and copy the request */ @@ -1975,7 +1975,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, /* Check if the response needs to be populated across * multiple segments */ - if (bio_segments(rsp->bio) > 1) { + if (bio_multiple_segments(rsp->bio)) { pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp), &pci_dma_in); if (!pci_addr_in) { @@ -2042,7 +2042,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, sgl_flags = (MPI2_SGE_FLAGS_SIMPLE_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC); sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; - if (bio_segments(req->bio) > 1) { + if (bio_multiple_segments(req->bio)) { ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(req) - 4), pci_dma_out); } else { @@ -2058,7 +2058,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_END_OF_LIST); sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; - if (bio_segments(rsp->bio) > 1) { + if (bio_multiple_segments(rsp->bio)) { ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(rsp) + 4), pci_dma_in); } else { @@ -2103,7 +2103,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, le16_to_cpu(mpi_reply->ResponseDataLength); /* check if the resp needs to be copied from the allocated * pci mem */ - if (bio_segments(rsp->bio) > 1) { + if (bio_multiple_segments(rsp->bio)) { u32 offset = 0; u32 bytes_to_copy = le16_to_cpu(mpi_reply->ResponseDataLength); diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index 196a67f2e95..65170cb1a00 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -1926,7 +1926,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, ioc->transport_cmds.status = MPT3_CMD_PENDING; /* Check if the request is split across multiple segments */ - if (req->bio->bi_vcnt > 1) { + if (bio_multiple_segments(req->bio)) { u32 offset = 0; /* Allocate memory and copy the request */ @@ -1958,7 +1958,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, /* Check if the response needs to be populated across * multiple segments */ - if (rsp->bio->bi_vcnt > 1) { + if (bio_multiple_segments(rsp->bio)) { pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp), &pci_dma_in); if (!pci_addr_in) { @@ -2019,7 +2019,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, mpi_request->RequestDataLength = 
cpu_to_le16(blk_rq_bytes(req) - 4); psge = &mpi_request->SGL; - if (req->bio->bi_vcnt > 1) + if (bio_multiple_segments(req->bio)) ioc->build_sg(ioc, psge, pci_dma_out, (blk_rq_bytes(req) - 4), pci_dma_in, (blk_rq_bytes(rsp) + 4)); else @@ -2064,7 +2064,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, /* check if the resp needs to be copied from the allocated * pci mem */ - if (rsp->bio->bi_vcnt > 1) { + if (bio_multiple_segments(rsp->bio)) { u32 offset = 0; u32 bytes_to_copy = le16_to_cpu(mpi_reply->ResponseDataLength); diff --git a/fs/bio.c b/fs/bio.c index f61e59b3881..e32f2ffc3f3 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1733,7 +1733,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_iter.bi_sector + first_sectors); - BUG_ON(bio_segments(bi) > 1); + BUG_ON(bio_multiple_segments(bi)); atomic_set(&bp->cnt, 3); bp->error = 0; bp->bio1 = *bi; diff --git a/include/linux/bio.h b/include/linux/bio.h index 930cb73c894..aea9896a628 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -97,13 +97,46 @@ #define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter) #define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter) -#define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_iter.bi_idx) +#define bio_multiple_segments(bio) \ + ((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len) #define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9) #define bio_end_sector(bio) ((bio)->bi_iter.bi_sector + bio_sectors((bio))) +/* + * Check whether this bio carries any data or not. A NULL bio is allowed. + */ +static inline bool bio_has_data(struct bio *bio) +{ + if (bio && + bio->bi_iter.bi_size && + !(bio->bi_rw & REQ_DISCARD)) + return true; + + return false; +} + +static inline bool bio_is_rw(struct bio *bio) +{ + if (!bio_has_data(bio)) + return false; + + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) + return false; + + return true; +} + +static inline bool bio_mergeable(struct bio *bio) +{ + if (bio->bi_rw & REQ_NOMERGE_FLAGS) + return false; + + return true; +} + static inline unsigned int bio_cur_bytes(struct bio *bio) { - if (bio->bi_vcnt) + if (bio_has_data(bio)) return bio_iovec(bio).bv_len; else /* dataless requests such as discard */ return bio->bi_iter.bi_size; @@ -111,7 +144,7 @@ static inline unsigned int bio_cur_bytes(struct bio *bio) static inline void *bio_data(struct bio *bio) { - if (bio->bi_vcnt) + if (bio_has_data(bio)) return page_address(bio_page(bio)) + bio_offset(bio); return NULL; @@ -221,6 +254,18 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) +static inline unsigned bio_segments(struct bio *bio) +{ + unsigned segs = 0; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio, iter) + segs++; + + return segs; +} + /* * get a reference to a bio, so it won't disappear. the intended use is * something like: @@ -434,36 +479,6 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, __bio_kmap_irq((bio), (bio)->bi_iter.bi_idx, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) -/* - * Check whether this bio carries any data or not. A NULL bio is allowed. 
- */ -static inline bool bio_has_data(struct bio *bio) -{ - if (bio && bio->bi_vcnt) - return true; - - return false; -} - -static inline bool bio_is_rw(struct bio *bio) -{ - if (!bio_has_data(bio)) - return false; - - if (bio->bi_rw & REQ_WRITE_SAME) - return false; - - return true; -} - -static inline bool bio_mergeable(struct bio *bio) -{ - if (bio->bi_rw & REQ_NOMERGE_FLAGS) - return false; - - return true; -} - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * -- cgit v1.2.3-70-g09d2 From 003b5c5719f159f4f4bf97511c4702a0638313dd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2013 15:45:43 -0700 Subject: block: Convert drivers to immutable biovecs Now that we've got a mechanism for immutable biovecs - bi_iter.bi_bvec_done - we need to convert drivers to use primitives that respect it instead of using the bvec array directly. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: NeilBrown Cc: Alasdair Kergon Cc: dm-devel@redhat.com --- drivers/block/umem.c | 50 ++++++++++++++++++++++-------------------------- drivers/md/dm-crypt.c | 49 +++++++++++++++++------------------------------ drivers/md/dm-io.c | 31 ++++++++++++++++-------------- drivers/md/dm-raid1.c | 8 ++++---- drivers/md/dm-verity.c | 52 ++++++++++++++------------------------------------ fs/bio.c | 14 +++++++++++--- include/linux/dm-io.h | 4 ++-- 7 files changed, 89 insertions(+), 119 deletions(-) (limited to 'fs') diff --git a/drivers/block/umem.c b/drivers/block/umem.c index dab4f1afeae..4cf81b5bf0f 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -108,8 +108,7 @@ struct cardinfo { * have been written */ struct bio *bio, *currentbio, **biotail; - int current_idx; - sector_t current_sector; + struct bvec_iter current_iter; struct request_queue *queue; @@ -118,7 +117,7 @@ struct cardinfo { struct mm_dma_desc *desc; int cnt, headcnt; struct bio *bio, **biotail; - int idx; + struct bvec_iter iter; } mm_pages[2]; #define DESC_PER_PAGE ((PAGE_SIZE*2)/sizeof(struct mm_dma_desc)) @@ -344,16 +343,13 @@ static int add_bio(struct cardinfo *card) dma_addr_t dma_handle; int offset; struct bio *bio; - struct bio_vec *vec; - int idx; + struct bio_vec vec; int rw; - int len; bio = card->currentbio; if (!bio && card->bio) { card->currentbio = card->bio; - card->current_idx = card->bio->bi_iter.bi_idx; - card->current_sector = card->bio->bi_iter.bi_sector; + card->current_iter = card->bio->bi_iter; card->bio = card->bio->bi_next; if (card->bio == NULL) card->biotail = &card->bio; @@ -362,18 +358,17 @@ static int add_bio(struct cardinfo *card) } if (!bio) return 0; - idx = card->current_idx; rw = bio_rw(bio); if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE) return 0; - vec = bio_iovec_idx(bio, idx); - len = vec->bv_len; + vec = bio_iter_iovec(bio, card->current_iter); + dma_handle = pci_map_page(card->dev, - vec->bv_page, - vec->bv_offset, - len, + vec.bv_page, + vec.bv_offset, + vec.bv_len, (rw == READ) ? 
PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); @@ -381,7 +376,7 @@ static int add_bio(struct cardinfo *card) desc = &p->desc[p->cnt]; p->cnt++; if (p->bio == NULL) - p->idx = idx; + p->iter = card->current_iter; if ((p->biotail) != &bio->bi_next) { *(p->biotail) = bio; p->biotail = &(bio->bi_next); @@ -391,8 +386,8 @@ static int add_bio(struct cardinfo *card) desc->data_dma_handle = dma_handle; desc->pci_addr = cpu_to_le64((u64)desc->data_dma_handle); - desc->local_addr = cpu_to_le64(card->current_sector << 9); - desc->transfer_size = cpu_to_le32(len); + desc->local_addr = cpu_to_le64(card->current_iter.bi_sector << 9); + desc->transfer_size = cpu_to_le32(vec.bv_len); offset = (((char *)&desc->sem_control_bits) - ((char *)p->desc)); desc->sem_addr = cpu_to_le64((u64)(p->page_dma+offset)); desc->zero1 = desc->zero2 = 0; @@ -407,10 +402,9 @@ static int add_bio(struct cardinfo *card) desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ); desc->sem_control_bits = desc->control_bits; - card->current_sector += (len >> 9); - idx++; - card->current_idx = idx; - if (idx >= bio->bi_vcnt) + + bio_advance_iter(bio, &card->current_iter, vec.bv_len); + if (!card->current_iter.bi_size) card->currentbio = NULL; return 1; @@ -439,23 +433,25 @@ static void process_page(unsigned long data) struct mm_dma_desc *desc = &page->desc[page->headcnt]; int control = le32_to_cpu(desc->sem_control_bits); int last = 0; - int idx; + struct bio_vec vec; if (!(control & DMASCR_DMA_COMPLETE)) { control = dma_status; last = 1; } + page->headcnt++; - idx = page->idx; - page->idx++; - if (page->idx >= bio->bi_vcnt) { + vec = bio_iter_iovec(bio, page->iter); + bio_advance_iter(bio, &page->iter, vec.bv_len); + + if (!page->iter.bi_size) { page->bio = bio->bi_next; if (page->bio) - page->idx = page->bio->bi_iter.bi_idx; + page->iter = page->bio->bi_iter; } pci_unmap_page(card->dev, desc->data_dma_handle, - bio_iovec_idx(bio, idx)->bv_len, + vec.bv_len, (control & DMASCR_TRANSFER_READ) ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); if (control & DMASCR_HARD_ERROR) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1e2e5465d28..784695d22fd 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -39,10 +39,8 @@ struct convert_context { struct completion restart; struct bio *bio_in; struct bio *bio_out; - unsigned int offset_in; - unsigned int offset_out; - unsigned int idx_in; - unsigned int idx_out; + struct bvec_iter iter_in; + struct bvec_iter iter_out; sector_t cc_sector; atomic_t cc_pending; }; @@ -826,10 +824,10 @@ static void crypt_convert_init(struct crypt_config *cc, { ctx->bio_in = bio_in; ctx->bio_out = bio_out; - ctx->offset_in = 0; - ctx->offset_out = 0; - ctx->idx_in = bio_in ? bio_in->bi_iter.bi_idx : 0; - ctx->idx_out = bio_out ? 
bio_out->bi_iter.bi_idx : 0; + if (bio_in) + ctx->iter_in = bio_in->bi_iter; + if (bio_out) + ctx->iter_out = bio_out->bi_iter; ctx->cc_sector = sector + cc->iv_offset; init_completion(&ctx->restart); } @@ -857,8 +855,8 @@ static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, struct ablkcipher_request *req) { - struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); - struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); + struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); + struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; u8 *iv; int r; @@ -869,24 +867,15 @@ static int crypt_convert_block(struct crypt_config *cc, dmreq->iv_sector = ctx->cc_sector; dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); - sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, - bv_in->bv_offset + ctx->offset_in); + sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT, + bv_in.bv_offset); sg_init_table(&dmreq->sg_out, 1); - sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, - bv_out->bv_offset + ctx->offset_out); + sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT, + bv_out.bv_offset); - ctx->offset_in += 1 << SECTOR_SHIFT; - if (ctx->offset_in >= bv_in->bv_len) { - ctx->offset_in = 0; - ctx->idx_in++; - } - - ctx->offset_out += 1 << SECTOR_SHIFT; - if (ctx->offset_out >= bv_out->bv_len) { - ctx->offset_out = 0; - ctx->idx_out++; - } + bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT); if (cc->iv_gen_ops) { r = cc->iv_gen_ops->generator(cc, iv, dmreq); @@ -937,8 +926,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_set(&ctx->cc_pending, 1); - while(ctx->idx_in < ctx->bio_in->bi_vcnt && - ctx->idx_out < ctx->bio_out->bi_vcnt) { + while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { crypt_alloc_req(cc, ctx); @@ -1207,7 +1195,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) } /* crypt_convert should have filled the clone bio */ - BUG_ON(io->ctx.idx_out < clone->bi_vcnt); + BUG_ON(io->ctx.iter_out.bi_size); clone->bi_iter.bi_sector = cc->start + io->sector; @@ -1246,7 +1234,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) } io->ctx.bio_out = clone; - io->ctx.idx_out = 0; + io->ctx.iter_out = clone->bi_iter; remaining -= clone->bi_iter.bi_size; sector += bio_sectors(clone); @@ -1290,8 +1278,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_inc_pending(new_io); crypt_convert_init(cc, &new_io->ctx, NULL, io->base_bio, sector); - new_io->ctx.idx_in = io->ctx.idx_in; - new_io->ctx.offset_in = io->ctx.offset_in; + new_io->ctx.iter_in = io->ctx.iter_in; /* * Fragments after the first use the base_io diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 01558b09330..b2b8a10e842 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -201,26 +201,29 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse /* * Functions for getting the pages from a bvec. 
*/ -static void bvec_get_page(struct dpages *dp, +static void bio_get_page(struct dpages *dp, struct page **p, unsigned long *len, unsigned *offset) { - struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; - *p = bvec->bv_page; - *len = bvec->bv_len; - *offset = bvec->bv_offset; + struct bio *bio = dp->context_ptr; + struct bio_vec bvec = bio_iovec(bio); + *p = bvec.bv_page; + *len = bvec.bv_len; + *offset = bvec.bv_offset; } -static void bvec_next_page(struct dpages *dp) +static void bio_next_page(struct dpages *dp) { - struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; - dp->context_ptr = bvec + 1; + struct bio *bio = dp->context_ptr; + struct bio_vec bvec = bio_iovec(bio); + + bio_advance(bio, bvec.bv_len); } -static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) +static void bio_dp_init(struct dpages *dp, struct bio *bio) { - dp->get_page = bvec_get_page; - dp->next_page = bvec_next_page; - dp->context_ptr = bvec; + dp->get_page = bio_get_page; + dp->next_page = bio_next_page; + dp->context_ptr = bio; } /* @@ -457,8 +460,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); break; - case DM_IO_BVEC: - bvec_dp_init(dp, io_req->mem.ptr.bvec); + case DM_IO_BIO: + bio_dp_init(dp, io_req->mem.ptr.bio); break; case DM_IO_VMA: diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9f6d8e6baa7..f284e0bfb25 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -526,8 +526,8 @@ static void read_async_bio(struct mirror *m, struct bio *bio) struct dm_io_region io; struct dm_io_request io_req = { .bi_rw = READ, - .mem.type = DM_IO_BVEC, - .mem.ptr.bvec = bio->bi_io_vec + bio->bi_iter.bi_idx, + .mem.type = DM_IO_BIO, + .mem.ptr.bio = bio, .notify.fn = read_callback, .notify.context = bio, .client = m->ms->io_client, @@ -629,8 +629,8 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct mirror *m; struct dm_io_request io_req = { .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), - .mem.type = DM_IO_BVEC, - .mem.ptr.bvec = bio->bi_io_vec + bio->bi_iter.bi_idx, + .mem.type = DM_IO_BIO, + .mem.ptr.bio = bio, .notify.fn = write_callback, .notify.context = bio, .client = ms->io_client, diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 5392135924c..ac35e959d49 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -73,15 +73,10 @@ struct dm_verity_io { sector_t block; unsigned n_blocks; - /* saved bio vector */ - struct bio_vec *io_vec; - unsigned io_vec_size; + struct bvec_iter iter; struct work_struct work; - /* A space for short vectors; longer vectors are allocated separately. 
*/ - struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE]; - /* * Three variably-size fields follow this struct: * @@ -284,9 +279,10 @@ release_ret_r: static int verity_verify_io(struct dm_verity_io *io) { struct dm_verity *v = io->v; + struct bio *bio = dm_bio_from_per_bio_data(io, + v->ti->per_bio_data_size); unsigned b; int i; - unsigned vector = 0, offset = 0; for (b = 0; b < io->n_blocks; b++) { struct shash_desc *desc; @@ -336,31 +332,22 @@ test_block_hash: } todo = 1 << v->data_dev_block_bits; - do { - struct bio_vec *bv; + while (io->iter.bi_size) { u8 *page; - unsigned len; - - BUG_ON(vector >= io->io_vec_size); - bv = &io->io_vec[vector]; - page = kmap_atomic(bv->bv_page); - len = bv->bv_len - offset; - if (likely(len >= todo)) - len = todo; - r = crypto_shash_update(desc, - page + bv->bv_offset + offset, len); + struct bio_vec bv = bio_iter_iovec(bio, io->iter); + + page = kmap_atomic(bv.bv_page); + r = crypto_shash_update(desc, page + bv.bv_offset, + bv.bv_len); kunmap_atomic(page); + if (r < 0) { DMERR("crypto_shash_update failed: %d", r); return r; } - offset += len; - if (likely(offset == bv->bv_len)) { - offset = 0; - vector++; - } - todo -= len; - } while (todo); + + bio_advance_iter(bio, &io->iter, bv.bv_len); + } if (!v->version) { r = crypto_shash_update(desc, v->salt, v->salt_size); @@ -383,8 +370,6 @@ test_block_hash: return -EIO; } } - BUG_ON(vector != io->io_vec_size); - BUG_ON(offset); return 0; } @@ -400,9 +385,6 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio->bi_end_io = io->orig_bi_end_io; bio->bi_private = io->orig_bi_private; - if (io->io_vec != io->io_vec_inline) - mempool_free(io->io_vec, v->vec_mempool); - bio_endio(bio, error); } @@ -519,13 +501,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_end_io = verity_end_io; bio->bi_private = io; - io->io_vec_size = bio_segments(bio); - if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) - io->io_vec = io->io_vec_inline; - else - io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); - memcpy(io->io_vec, __bio_iovec(bio), - io->io_vec_size * sizeof(struct bio_vec)); + io->iter = bio->bi_iter; verity_submit_prefetch(v, io); diff --git a/fs/bio.c b/fs/bio.c index e32f2ffc3f3..a082ce2d197 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -525,8 +525,17 @@ EXPORT_SYMBOL(bio_phys_segments); */ void __bio_clone(struct bio *bio, struct bio *bio_src) { - memcpy(bio->bi_io_vec, bio_src->bi_io_vec, - bio_src->bi_max_vecs * sizeof(struct bio_vec)); + if (bio_is_rw(bio_src)) { + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + } else if (bio_has_data(bio_src)) { + memcpy(bio->bi_io_vec, bio_src->bi_io_vec, + bio_src->bi_max_vecs * sizeof(struct bio_vec)); + bio->bi_vcnt = bio_src->bi_vcnt; + } /* * most users will be overriding ->bi_bdev with a new target, @@ -535,7 +544,6 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) bio->bi_bdev = bio_src->bi_bdev; bio->bi_flags |= 1 << BIO_CLONED; bio->bi_rw = bio_src->bi_rw; - bio->bi_vcnt = bio_src->bi_vcnt; bio->bi_iter = bio_src->bi_iter; } EXPORT_SYMBOL(__bio_clone); diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h index f4b0aa3126f..a68cbe59e6a 100644 --- a/include/linux/dm-io.h +++ b/include/linux/dm-io.h @@ -29,7 +29,7 @@ typedef void (*io_notify_fn)(unsigned long error, void *context); enum dm_io_mem_type { DM_IO_PAGE_LIST,/* Page list */ - DM_IO_BVEC, /* Bio vector */ + DM_IO_BIO, /* Bio vector */ DM_IO_VMA, /* Virtual memory area */ DM_IO_KMEM, 
/* Kernel memory */ }; @@ -41,7 +41,7 @@ struct dm_io_memory { union { struct page_list *pl; - struct bio_vec *bvec; + struct bio *bio; void *vma; void *addr; } ptr; -- cgit v1.2.3-70-g09d2 From bdb53207411ae39b8a80dda0a66d1b468cbe1380 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2013 17:26:46 -0800 Subject: block: Refactor bio_clone_bioset() for immutable biovecs bio_clone() needs to produce a bio that's suitable for the caller to munge with the biovec. Part of the immutable biovec patch series is fixing stuff up so that submitting partially completed bios is safe and works: thus, we now need bio_clone() on a partially completed bio to produce a bio for which bi_idx and bi_bvec done are 0 - like they would be if the caller had just allocated a new bio. Signed-off-by: Kent Overstreet Cc: Jens Axboe --- fs/bio.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index a082ce2d197..1628917e262 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -549,36 +549,70 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) EXPORT_SYMBOL(__bio_clone); /** - * bio_clone_bioset - clone a bio - * @bio: bio to clone + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone * @gfp_mask: allocation priority * @bs: bio_set to allocate from * - * Like __bio_clone, only also allocates the returned bio + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. */ -struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, struct bio_set *bs) { - struct bio *b; + unsigned nr_iovecs = 0; + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + /* + * Pre immutable biovecs, __bio_clone() used to just do a memcpy from + * bio_src->bi_io_vec to bio->bi_io_vec. + * + * We can't do that anymore, because: + * + * - The point of cloning the biovec is to produce a bio with a biovec + * the caller can modify: bi_idx and bi_bvec_done should be 0. + * + * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * we tried to clone the whole thing bio_alloc_bioset() would fail. + * But the clone should succeed as long as the number of biovecs we + * actually need to allocate is fewer than BIO_MAX_PAGES. + * + * - Lastly, bi_vcnt should not be looked at or relied upon by code + * that does not own the bio - reason being drivers don't use it for + * iterating over the biovec anymore, so expecting it to be kept up + * to date (i.e. for clones that share the parent biovec) is just + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. 
+ */ + + bio_for_each_segment(bv, bio_src, iter) + nr_iovecs++; - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); - if (!b) + bio = bio_alloc_bioset(gfp_mask, nr_iovecs, bs); + if (!bio) return NULL; - __bio_clone(b, bio); + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_rw = bio_src->bi_rw; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - if (bio_integrity(bio)) { - int ret; + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; - ret = bio_integrity_clone(b, bio, gfp_mask); + if (bio_integrity(bio_src)) { + int ret; + ret = bio_integrity_clone(bio, bio_src, gfp_mask); if (ret < 0) { - bio_put(b); + bio_put(bio); return NULL; } } - return b; + return bio; } EXPORT_SYMBOL(bio_clone_bioset); -- cgit v1.2.3-70-g09d2 From 59d276fe02d7e887a4825ef05c80b8f8c54ba60a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2013 18:19:27 -0800 Subject: block: Add bio_clone_fast() bio_clone() just got more expensive - however, most users of bio_clone() don't actually need to modify the biovec. If they aren't modifying the biovec, and they can guarantee that the original bio isn't freed before the clone (also true in most cases), we can just point the clone at the original bio's biovec. Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 8 ++---- fs/bio.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 2 ++ 3 files changed, 64 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 4c0a422fd49..63451c72478 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -613,7 +613,6 @@ struct search { struct btree_op op; struct data_insert_op iop; - struct bio_vec bv[BIO_MAX_PAGES]; }; static void bch_cache_read_endio(struct bio *bio, int error) @@ -761,9 +760,7 @@ static void do_bio_hook(struct search *s) struct bio *bio = &s->bio.bio; bio_init(bio); - bio->bi_io_vec = s->bv; - bio->bi_max_vecs = BIO_MAX_PAGES; - __bio_clone(bio, s->orig_bio); + __bio_clone_fast(bio, s->orig_bio); bio->bi_end_io = request_endio; bio->bi_private = &s->cl; @@ -1065,8 +1062,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s->d); } } else { - s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); closure_bio_submit(bio, cl, s->d); } diff --git a/fs/bio.c b/fs/bio.c index 1628917e262..00dc1893c6e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -548,6 +548,66 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) } EXPORT_SYMBOL(__bio_clone); +/** + * __bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: destination bio + * @bio_src: bio to clone + * + * Clone a &bio. Caller will own the returned bio, but not + * the actual data it points to. Reference count of returned + * bio will be one. + * + * Caller must ensure that @bio_src is not freed before @bio. 
+ */ +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +{ + BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); + + /* + * most users will be overriding ->bi_bdev with a new target, + * so we don't set nor calculate new physical/hw segment counts here + */ + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_flags |= 1 << BIO_CLONED; + bio->bi_rw = bio_src->bi_rw; + bio->bi_iter = bio_src->bi_iter; + bio->bi_io_vec = bio_src->bi_io_vec; +} +EXPORT_SYMBOL(__bio_clone_fast); + +/** + * bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Like __bio_clone_fast, only also allocates the returned bio + */ +struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +{ + struct bio *b; + + b = bio_alloc_bioset(gfp_mask, 0, bs); + if (!b) + return NULL; + + __bio_clone_fast(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask); + + if (ret < 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL(bio_clone_fast); + /** * bio_clone_bioset - clone a bio * @bio_src: bio to clone diff --git a/include/linux/bio.h b/include/linux/bio.h index 1a31f9d9e05..1f83f4a3083 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -328,6 +328,8 @@ extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); +extern void __bio_clone_fast(struct bio *, struct bio *); +extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern void __bio_clone(struct bio *, struct bio *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); -- cgit v1.2.3-70-g09d2 From 1c3b13e64cf70d652fb04e32d13ae3e36810c2e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 29 Oct 2013 17:17:49 -0700 Subject: dm: Refactor for new bio cloning/splitting We need to convert the dm code to the new bvec_iter primitives which respect bi_bvec_done; they also allow us to drastically simplify dm's bio splitting code. Also, it's no longer necessary to save/restore the bvec array anymore - driver conversions for immutable bvecs are done, so drivers should never be modifying it. Also kill bio_sector_offset(), dm was the only user and it doesn't make much sense anymore. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Alasdair Kergon Cc: dm-devel@redhat.com Reviewed-by: Mike Snitzer --- drivers/md/dm-bio-record.h | 25 ------- drivers/md/dm.c | 174 ++++++--------------------------------------- fs/bio.c | 72 ------------------- include/linux/bio.h | 2 - 4 files changed, 20 insertions(+), 253 deletions(-) (limited to 'fs') diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index 4f46e8e528d..dd364611156 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -17,49 +17,24 @@ * original bio state. 
*/ -struct dm_bio_vec_details { -#if PAGE_SIZE < 65536 - __u16 bv_len; - __u16 bv_offset; -#else - unsigned bv_len; - unsigned bv_offset; -#endif -}; - struct dm_bio_details { struct block_device *bi_bdev; unsigned long bi_flags; struct bvec_iter bi_iter; - struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - unsigned i; - bd->bi_bdev = bio->bi_bdev; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; - - for (i = 0; i < bio->bi_vcnt; i++) { - bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len; - bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset; - } } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - unsigned i; - bio->bi_bdev = bd->bi_bdev; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; - - for (i = 0; i < bio->bi_vcnt; i++) { - bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len; - bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset; - } } #endif diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ccd064ea4fe..44a2fa6814c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1155,7 +1155,6 @@ struct clone_info { struct dm_io *io; sector_t sector; sector_t sector_count; - unsigned short idx; }; static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) @@ -1164,68 +1163,24 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) bio->bi_iter.bi_size = to_bytes(len); } -static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) -{ - bio->bi_iter.bi_idx = idx; - bio->bi_vcnt = idx + bv_count; - bio->bi_flags &= ~(1 << BIO_SEG_VALID); -} - -static void clone_bio_integrity(struct bio *bio, struct bio *clone, - unsigned short idx, unsigned len, unsigned offset, - unsigned trim) -{ - if (!bio_integrity(bio)) - return; - - bio_integrity_clone(clone, bio, GFP_NOIO); - - if (trim) - bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); -} - -/* - * Creates a little bio that just does part of a bvec. - */ -static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, - unsigned offset, unsigned len) -{ - struct bio *clone = &tio->clone; - struct bio_vec *bv = bio->bi_io_vec + idx; - - *clone->bi_io_vec = *bv; - - bio_setup_sector(clone, sector, len); - - clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; - clone->bi_vcnt = 1; - clone->bi_io_vec->bv_offset = offset; - clone->bi_io_vec->bv_len = clone->bi_iter.bi_size; - clone->bi_flags |= 1 << BIO_CLONED; - - clone_bio_integrity(bio, clone, idx, len, offset, 1); -} - /* * Creates a bio that consists of range of complete bvecs. 
*/ static void clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, - unsigned short bv_count, unsigned len) + sector_t sector, unsigned len) { struct bio *clone = &tio->clone; - unsigned trim = 0; - __bio_clone(clone, bio); - bio_setup_sector(clone, sector, len); - bio_setup_bv(clone, idx, bv_count); + __bio_clone_fast(clone, bio); + + if (bio_integrity(bio)) + bio_integrity_clone(clone, bio, GFP_NOIO); + + bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); + clone->bi_iter.bi_size = to_bytes(len); - if (idx != bio->bi_iter.bi_idx || - clone->bi_iter.bi_size < bio->bi_iter.bi_size) - trim = 1; - clone_bio_integrity(bio, clone, idx, len, 0, trim); + if (bio_integrity(bio)) + bio_integrity_trim(clone, 0, len); } static struct dm_target_io *alloc_tio(struct clone_info *ci, @@ -1258,7 +1213,7 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and discard, so no need for concern about wasted bvec allocations. */ - __bio_clone(clone, ci->bio); + __bio_clone_fast(clone, ci->bio); if (len) bio_setup_sector(clone, ci->sector, len); @@ -1287,10 +1242,7 @@ static int __send_empty_flush(struct clone_info *ci) } static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, int nr_iovecs, - unsigned short idx, unsigned short bv_count, - unsigned offset, unsigned len, - unsigned split_bvec) + sector_t sector, unsigned len) { struct bio *bio = ci->bio; struct dm_target_io *tio; @@ -1304,11 +1256,8 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti num_target_bios = ti->num_write_bios(ti, bio); for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { - tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); - if (split_bvec) - clone_split_bio(tio, bio, sector, idx, offset, len); - else - clone_bio(tio, bio, sector, idx, bv_count, len); + tio = alloc_tio(ci, ti, 0, target_bio_nr); + clone_bio(tio, bio, sector, len); __map_bio(tio); } } @@ -1379,60 +1328,6 @@ static int __send_write_same(struct clone_info *ci) return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); } -/* - * Find maximum number of sectors / bvecs we can process with a single bio. - */ -static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx) -{ - struct bio *bio = ci->bio; - sector_t bv_len, total_len = 0; - - for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) { - bv_len = to_sector(bio->bi_io_vec[*idx].bv_len); - - if (bv_len > max) - break; - - max -= bv_len; - total_len += bv_len; - } - - return total_len; -} - -static int __split_bvec_across_targets(struct clone_info *ci, - struct dm_target *ti, sector_t max) -{ - struct bio *bio = ci->bio; - struct bio_vec *bv = bio->bi_io_vec + ci->idx; - sector_t remaining = to_sector(bv->bv_len); - unsigned offset = 0; - sector_t len; - - do { - if (offset) { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - max = max_io_len(ci->sector, ti); - } - - len = min(remaining, max); - - __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0, - bv->bv_offset + offset, len, 1); - - ci->sector += len; - ci->sector_count -= len; - offset += to_bytes(len); - } while (remaining -= len); - - ci->idx++; - - return 0; -} - /* * Select the correct strategy for processing a non-flush bio. 
*/ @@ -1440,8 +1335,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) { struct bio *bio = ci->bio; struct dm_target *ti; - sector_t len, max; - int idx; + unsigned len; if (unlikely(bio->bi_rw & REQ_DISCARD)) return __send_discard(ci); @@ -1452,41 +1346,14 @@ static int __split_and_process_non_flush(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->sector, ti); - - /* - * Optimise for the simple case where we can do all of - * the remaining io with a single clone. - */ - if (ci->sector_count <= max) { - __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, - ci->idx, bio->bi_vcnt - ci->idx, 0, - ci->sector_count, 0); - ci->sector_count = 0; - return 0; - } - - /* - * There are some bvecs that don't span targets. - * Do as many of these as possible. - */ - if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { - len = __len_within_target(ci, max, &idx); - - __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, - ci->idx, idx - ci->idx, 0, len, 0); + len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - ci->sector += len; - ci->sector_count -= len; - ci->idx = idx; + __clone_and_map_data_bio(ci, ti, ci->sector, len); - return 0; - } + ci->sector += len; + ci->sector_count -= len; - /* - * Handle a bvec that must be split between two or more targets. - */ - return __split_bvec_across_targets(ci, ti, max); + return 0; } /* @@ -1512,7 +1379,6 @@ static void __split_and_process_bio(struct mapped_device *md, ci.io->md = md; spin_lock_init(&ci.io->endio_lock); ci.sector = bio->bi_iter.bi_sector; - ci.idx = bio->bi_iter.bi_idx; start_io_acct(ci.io); diff --git a/fs/bio.c b/fs/bio.c index 00dc1893c6e..6e42b68ab0a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -514,40 +514,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) } EXPORT_SYMBOL(bio_phys_segments); -/** - * __bio_clone - clone a bio - * @bio: destination bio - * @bio_src: bio to clone - * - * Clone a &bio. Caller will own the returned bio, but not - * the actual data it points to. Reference count of returned - * bio will be one. - */ -void __bio_clone(struct bio *bio, struct bio *bio_src) -{ - if (bio_is_rw(bio_src)) { - struct bio_vec bv; - struct bvec_iter iter; - - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - } else if (bio_has_data(bio_src)) { - memcpy(bio->bi_io_vec, bio_src->bi_io_vec, - bio_src->bi_max_vecs * sizeof(struct bio_vec)); - bio->bi_vcnt = bio_src->bi_vcnt; - } - - /* - * most users will be overriding ->bi_bdev with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_flags |= 1 << BIO_CLONED; - bio->bi_rw = bio_src->bi_rw; - bio->bi_iter = bio_src->bi_iter; -} -EXPORT_SYMBOL(__bio_clone); - /** * __bio_clone_fast - clone a bio that shares the original bio's biovec * @bio: destination bio @@ -1921,44 +1887,6 @@ void bio_trim(struct bio *bio, int offset, int size) } EXPORT_SYMBOL_GPL(bio_trim); -/** - * bio_sector_offset - Find hardware sector offset in bio - * @bio: bio to inspect - * @index: bio_vec index - * @offset: offset in bv_page - * - * Return the number of hardware sectors between beginning of bio - * and an end point indicated by a bio_vec index and an offset - * within that vector's page. 
- */ -sector_t bio_sector_offset(struct bio *bio, unsigned short index, - unsigned int offset) -{ - unsigned int sector_sz; - struct bio_vec *bv; - sector_t sectors; - int i; - - sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); - sectors = 0; - - if (index >= bio->bi_iter.bi_idx) - index = bio->bi_vcnt - 1; - - bio_for_each_segment_all(bv, bio, i) { - if (i == index) { - if (offset > bv->bv_offset) - sectors += (offset - bv->bv_offset) / sector_sz; - break; - } - - sectors += bv->bv_len / sector_sz; - } - - return sectors; -} -EXPORT_SYMBOL(bio_sector_offset); - /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. diff --git a/include/linux/bio.h b/include/linux/bio.h index 1f83f4a3083..0c32a45a419 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -330,7 +330,6 @@ extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); -extern void __bio_clone(struct bio *, struct bio *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); extern struct bio_set *fs_bio_set; @@ -370,7 +369,6 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); extern int bio_get_nr_vecs(struct block_device *); -extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int); extern struct bio *bio_map_user(struct request_queue *, struct block_device *, unsigned long, unsigned int, int, gfp_t); struct sg_iovec; -- cgit v1.2.3-70-g09d2 From c8db444820a1e37ebdca38a0210bca85f832214f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Nov 2013 19:39:06 -0800 Subject: block: Don't save/copy bvec array anymore Now that drivers have been converted to the bvec_iter primitives, they shouldn't be modifying the biovec anymore and thus saving it is unnecessary - code that was previously making a backup of the bvec array can now just save bio->bi_iter. 
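As a minimal sketch of the pattern this describes (not part of the patch; the names my_req, my_bio_save and my_bio_restore are made up for illustration), a stacking driver that used to back up the whole bio_vec array now only needs to save the iterator before handing the bio down, and put it back on completion:

	/*
	 * Illustrative only: with immutable biovecs the bio_vec entries are
	 * never modified by lower layers, so a single struct bvec_iter is a
	 * complete backup of the bio's progress.
	 */
	struct my_req {
		struct bio	*orig_bio;
		struct bvec_iter saved_iter;	/* replaces a bio_vec[] copy */
	};

	static void my_bio_save(struct my_req *rq, struct bio *bio)
	{
		rq->orig_bio   = bio;
		rq->saved_iter = bio->bi_iter;	/* no per-bvec memcpy needed */
	}

	static void my_bio_restore(struct my_req *rq)
	{
		rq->orig_bio->bi_iter = rq->saved_iter;
	}

Completion processing and bio_advance() only ever touch bi_iter, never the bio_vec entries themselves, which is what makes saving the single struct sufficient.
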
Signed-off-by: Kent Overstreet Cc: Jens Axboe --- fs/bio.c | 54 +++++++++++++----------------------------------------- 1 file changed, 13 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 6e42b68ab0a..9cff939b1bc 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -967,60 +967,33 @@ void bio_copy_data(struct bio *dst, struct bio *src) EXPORT_SYMBOL(bio_copy_data); struct bio_map_data { - struct bio_vec *iovecs; - struct sg_iovec *sgvecs; int nr_sgvecs; int is_our_pages; + struct sg_iovec sgvecs[]; }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, struct sg_iovec *iov, int iov_count, int is_our_pages) { - memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); bmd->nr_sgvecs = iov_count; bmd->is_our_pages = is_our_pages; bio->bi_private = bmd; } -static void bio_free_map_data(struct bio_map_data *bmd) -{ - kfree(bmd->iovecs); - kfree(bmd->sgvecs); - kfree(bmd); -} - static struct bio_map_data *bio_alloc_map_data(int nr_segs, unsigned int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd; - if (iov_count > UIO_MAXIOV) return NULL; - bmd = kmalloc(sizeof(*bmd), gfp_mask); - if (!bmd) - return NULL; - - bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); - if (!bmd->iovecs) { - kfree(bmd); - return NULL; - } - - bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); - if (bmd->sgvecs) - return bmd; - - kfree(bmd->iovecs); - kfree(bmd); - return NULL; + return kmalloc(sizeof(struct bio_map_data) + + sizeof(struct sg_iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, - struct sg_iovec *iov, int iov_count, +static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, int to_user, int from_user, int do_free_page) { int ret = 0, i; @@ -1030,7 +1003,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = iovecs[i].bv_len; + unsigned int bv_len = bvec->bv_len; while (bv_len && iov_idx < iov_count) { unsigned int bytes; @@ -1090,14 +1063,14 @@ int bio_uncopy_user(struct bio *bio) * don't copy into a random user address space, just free. 
*/ if (current->mm) - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, bio_data_dir(bio) == READ, + ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, + bio_data_dir(bio) == READ, 0, bmd->is_our_pages); else if (bmd->is_our_pages) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } - bio_free_map_data(bmd); + kfree(bmd); bio_put(bio); return ret; } @@ -1211,7 +1184,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, */ if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); + ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); if (ret) goto cleanup; } @@ -1225,7 +1198,7 @@ cleanup: bio_put(bio); out_bmd: - bio_free_map_data(bmd); + kfree(bmd); return ERR_PTR(ret); } @@ -1542,16 +1515,15 @@ static void bio_copy_kern_endio(struct bio *bio, int err) bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); - int len = bmd->iovecs[i].bv_len; if (read) - memcpy(p, addr, len); + memcpy(p, addr, bvec->bv_len); __free_page(bvec->bv_page); - p += len; + p += bvec->bv_len; } - bio_free_map_data(bmd); + kfree(bmd); bio_put(bio); } -- cgit v1.2.3-70-g09d2 From e90abc8ec323c1fd2a25600097ef7ae1e91f39b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2013 14:31:42 -0700 Subject: block: Remove bi_idx hacks Now that drivers have been converted to the new bvec_iter primitives, there's no need to trim the bvec before we submit it; and we can't trim it once we start sharing bvecs. It used to be that passing a partially completed bio (i.e. one with nonzero bi_idx) to generic_make_request() was a dangerous thing - various drivers would choke on such things. But with immutable biovecs and our new bio splitting that shares the biovecs, submitting partially completed bios has to work (and should work, now that all the drivers have been completed to the new primitives) Signed-off-by: Kent Overstreet Cc: Jens Axboe --- drivers/md/bcache/io.c | 47 ++--------------------------------------------- fs/bio.c | 23 ----------------------- 2 files changed, 2 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 6e04f3bb028..0f0ab659914 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -11,49 +11,6 @@ #include -static void bch_bi_idx_hack_endio(struct bio *bio, int error) -{ - struct bio *p = bio->bi_private; - - bio_endio(p, error); - bio_put(bio); -} - -static void bch_generic_make_request_hack(struct bio *bio) -{ - if (bio->bi_iter.bi_idx) { - struct bio_vec bv; - struct bvec_iter iter; - unsigned segs = bio_segments(bio); - struct bio *clone = bio_alloc(GFP_NOIO, segs); - - bio_for_each_segment(bv, bio, iter) - clone->bi_io_vec[clone->bi_vcnt++] = bv; - - clone->bi_iter.bi_sector = bio->bi_iter.bi_sector; - clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; - clone->bi_vcnt = segs; - clone->bi_iter.bi_size = bio->bi_iter.bi_size; - - clone->bi_private = bio; - clone->bi_end_io = bch_bi_idx_hack_endio; - - bio = clone; - } - - /* - * Hack, since drivers that clone bios clone up to bi_max_vecs, but our - * bios might have had more than that (before we split them per device - * limitations). - * - * To be taken out once immutable bvec stuff is in. 
- */ - bio->bi_max_vecs = bio->bi_vcnt; - - generic_make_request(bio); -} - /** * bch_bio_split - split a bio * @bio: bio to split @@ -222,12 +179,12 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) n->bi_private = &s->cl; closure_get(&s->cl); - bch_generic_make_request_hack(n); + generic_make_request(n); } while (n != bio); continue_at(&s->cl, bch_bio_submit_split_done, NULL); submit: - bch_generic_make_request_hack(bio); + generic_make_request(bio); } /* Bios with headers */ diff --git a/fs/bio.c b/fs/bio.c index 9cff939b1bc..e6dfa06773a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1822,11 +1822,7 @@ void bio_trim(struct bio *bio, int offset, int size) { /* 'bio' is a cloned bio which we need to trim to match * the given offset and size. - * This requires adjusting bi_sector, bi_size, and bi_io_vec */ - int i; - struct bio_vec *bvec; - int sofar = 0; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) @@ -1837,25 +1833,6 @@ void bio_trim(struct bio *bio, int offset, int size) bio_advance(bio, offset << 9); bio->bi_iter.bi_size = size; - - /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_iter.bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_iter.bi_idx, - (bio->bi_vcnt - bio->bi_iter.bi_idx) * - sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_iter.bi_idx; - bio->bi_iter.bi_idx = 0; - } - /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment_all(bvec, bio, i) { - if (sofar + bvec->bv_len > size) - bvec->bv_len = size - sofar; - if (bvec->bv_len == 0) { - bio->bi_vcnt = i; - break; - } - sofar += bvec->bv_len; - } } EXPORT_SYMBOL_GPL(bio_trim); -- cgit v1.2.3-70-g09d2 From 196d38bccfcfa32faed8c561868336fdfa0fe8e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2013 18:34:15 -0800 Subject: block: Generic bio chaining This adds a generic mechanism for chaining bio completions. This is going to be used for a bio_split() replacement, and it turns out to be very useful in a fair amount of driver code - a fair number of drivers were implementing this in their own roundabout ways, often painfully. Note that this means it's no longer to call bio_endio() more than once on the same bio! This can cause problems for drivers that save/restore bi_end_io. Arguably they shouldn't be saving/restoring bi_end_io at all - in all but the simplest cases they'd be better off just cloning the bio, and immutable biovecs is making bio cloning cheaper. But for now, we add a bio_endio_nodec() for these cases. 
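A minimal usage sketch of the new primitive (assuming the new bio_split() introduced later in this series; my_max_sectors() is a made-up per-device limit, and my_make_request() a hypothetical make_request_fn): the driver peels pieces off the front of a bio, chains each piece to the parent, and submits them, and the parent's bi_end_io runs only once every chained piece has completed.

	static void my_make_request(struct request_queue *q, struct bio *bio)
	{
		struct bio *split;

		do {
			unsigned sectors = my_max_sectors(q, bio);

			if (sectors < bio_sectors(bio)) {
				/* clone the front, advance the parent */
				split = bio_split(bio, sectors, GFP_NOIO,
						  fs_bio_set);
				/* parent completes only after all pieces do */
				bio_chain(split, bio);
			} else {
				/* last piece: submit the parent itself */
				split = bio;
			}

			generic_make_request(split);
		} while (split != bio);
	}

Code that really must save and restore bi_end_io instead bumps bi_remaining (or calls bio_endio_nodec()) before ending the bio again, as the dm-cache, dm-snap and dm-thin hunks below do.
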
Signed-off-by: Kent Overstreet Cc: Jens Axboe --- drivers/md/bcache/io.c | 2 +- drivers/md/dm-cache-target.c | 6 ++++ drivers/md/dm-snap.c | 1 + drivers/md/dm-thin.c | 8 +++-- drivers/md/dm-verity.c | 2 +- fs/bio-integrity.c | 2 +- fs/bio.c | 76 ++++++++++++++++++++++++++++++++++++++++---- include/linux/bio.h | 2 ++ include/linux/blk_types.h | 2 ++ 9 files changed, 90 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 0f0ab659914..522f9577844 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -133,7 +133,7 @@ static void bch_bio_submit_split_done(struct closure *cl) s->bio->bi_end_io = s->bi_end_io; s->bio->bi_private = s->bi_private; - bio_endio(s->bio, 0); + bio_endio_nodec(s->bio, 0); closure_debug_destroy(&s->cl); mempool_free(s, s->p->bio_split_hook); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 86f9c83eb30..bf3a206abd7 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -765,6 +765,12 @@ static void writethrough_endio(struct bio *bio, int err) dm_unhook_bio(&pb->hook_info, bio); + /* + * Must bump bi_remaining to allow bio to complete with + * restored bi_end_io. + */ + atomic_inc(&bio->bi_remaining); + if (err) { bio_endio(bio, err); return; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3ded8c729df..80b5cabbea2 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1415,6 +1415,7 @@ out: if (full_bio) { full_bio->bi_end_io = pe->full_bio_end_io; full_bio->bi_private = pe->full_bio_private; + atomic_inc(&full_bio->bi_remaining); } free_pending_exception(pe); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a65402480c8..1abb4a24c33 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -611,8 +611,10 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { - if (m->bio) + if (m->bio) { m->bio->bi_end_io = m->saved_bi_end_io; + atomic_inc(&m->bio->bi_remaining); + } cell_error(m->tc->pool, m->cell); list_del(&m->list); mempool_free(m, m->tc->pool->mapping_pool); @@ -626,8 +628,10 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) int r; bio = m->bio; - if (bio) + if (bio) { bio->bi_end_io = m->saved_bi_end_io; + atomic_inc(&bio->bi_remaining); + } if (m->err) { cell_error(pool, m->cell); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index ac35e959d49..796007a5e0e 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -385,7 +385,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio->bi_end_io = io->orig_bi_end_io; bio->bi_private = io->orig_bi_private; - bio_endio(bio, error); + bio_endio_nodec(bio, error); } static void verity_work(struct work_struct *w) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index fed744b8c9e..9d547d2e357 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -502,7 +502,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio_nodec(bio, error); } /** diff --git a/fs/bio.c b/fs/bio.c index e6dfa06773a..b0a16dbc71e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -273,6 +273,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; + atomic_set(&bio->bi_remaining, 1); atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -295,9 
+296,35 @@ void bio_reset(struct bio *bio) memset(bio, 0, BIO_RESET_BYTES); bio->bi_flags = flags|(1 << BIO_UPTODATE); + atomic_set(&bio->bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); +static void bio_chain_endio(struct bio *bio, int error) +{ + bio_endio(bio->bi_private, error); + bio_put(bio); +} + +/** + * bio_chain - chain bio completions + * + * The caller won't have a bi_end_io called when @bio completes - instead, + * @parent's bi_end_io won't be called until both @parent and @bio have + * completed; the chained bio will also be freed when it completes. + * + * The caller must not set bi_private or bi_end_io in @bio. + */ +void bio_chain(struct bio *bio, struct bio *parent) +{ + BUG_ON(bio->bi_private || bio->bi_end_io); + + bio->bi_private = parent; + bio->bi_end_io = bio_chain_endio; + atomic_inc(&parent->bi_remaining); +} +EXPORT_SYMBOL(bio_chain); + static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); @@ -1719,16 +1746,53 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); **/ void bio_endio(struct bio *bio, int error) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + while (bio) { + BUG_ON(atomic_read(&bio->bi_remaining) <= 0); + + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (!atomic_dec_and_test(&bio->bi_remaining)) + return; - if (bio->bi_end_io) - bio->bi_end_io(bio, error); + /* + * Need to have a real endio function for chained bios, + * otherwise various corner cases will break (like stacking + * block devices that save/restore bi_end_io) - however, we want + * to avoid unbounded recursion and blowing the stack. Tail call + * optimization would handle this, but compiling with frame + * pointers also disables gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + struct bio *parent = bio->bi_private; + bio_put(bio); + bio = parent; + } else { + if (bio->bi_end_io) + bio->bi_end_io(bio, error); + bio = NULL; + } + } } EXPORT_SYMBOL(bio_endio); +/** + * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining + * @bio: bio + * @error: error, if any + * + * For code that has saved and restored bi_end_io; thing hard before using this + * function, probably you should've cloned the entire bio. 
+ **/ +void bio_endio_nodec(struct bio *bio, int error) +{ + atomic_inc(&bio->bi_remaining); + bio_endio(bio, error); +} +EXPORT_SYMBOL(bio_endio_nodec); + void bio_pair_release(struct bio_pair *bp) { if (atomic_dec_and_test(&bp->cnt)) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 0c32a45a419..64f5169c224 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -356,6 +356,7 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } extern void bio_endio(struct bio *, int); +extern void bio_endio_nodec(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -364,6 +365,7 @@ extern void bio_advance(struct bio *, unsigned); extern void bio_init(struct bio *); extern void bio_reset(struct bio *); +void bio_chain(struct bio *, struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d369f8f6af7..bbc3a6c88fc 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -65,6 +65,8 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; + atomic_t bi_remaining; + bio_end_io_t *bi_end_io; void *bi_private; -- cgit v1.2.3-70-g09d2 From ee67891bf132612feb7b999ee1f3350b40867cb4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2013 14:32:45 -0700 Subject: block: Rename bio_split() -> bio_pair_split() This is prep work for introducing a more general bio_split(). Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: NeilBrown Cc: Alasdair Kergon Cc: Lars Ellenberg Cc: Peter Osterlund Cc: Sage Weil --- drivers/block/pktcdvd.c | 2 +- drivers/md/linear.c | 2 +- drivers/md/raid0.c | 6 +++--- drivers/md/raid10.c | 2 +- fs/bio.c | 4 ++-- include/linux/bio.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ce986bacf7b..28789b82ae7 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2399,7 +2399,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) if (last_zone != zone) { BUG_ON(last_zone != zone + pd->settings.size); first_sectors = last_zone - bio->bi_iter.bi_sector; - bp = bio_split(bio, first_sectors); + bp = bio_pair_split(bio, first_sectors); BUG_ON(!bp); pkt_make_request(q, &bp->bio1); pkt_make_request(q, &bp->bio2); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index fb3b0d04edf..e9b53e9793b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -326,7 +326,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) rcu_read_unlock(); - bp = bio_split(bio, end_sector - bio->bi_iter.bi_sector); + bp = bio_pair_split(bio, end_sector - bio->bi_iter.bi_sector); linear_make_request(mddev, &bp->bio1); linear_make_request(mddev, &bp->bio2); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 8ee1a6c658b..ea754dd1a5f 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -534,11 +534,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) * refuse to split for us, so we need to split it. 
*/ if (likely(is_power_of_2(chunk_sects))) - bp = bio_split(bio, chunk_sects - (sector & + bp = bio_pair_split(bio, chunk_sects - (sector & (chunk_sects-1))); else - bp = bio_split(bio, chunk_sects - - sector_div(sector, chunk_sects)); + bp = bio_pair_split(bio, chunk_sects - + sector_div(sector, chunk_sects)); raid0_make_request(mddev, &bp->bio1); raid0_make_request(mddev, &bp->bio2); bio_pair_release(bp); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ac4bfa438c5..69c1bc8da88 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1193,7 +1193,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. */ - bp = bio_split(bio, chunk_sects - + bp = bio_pair_split(bio, chunk_sects - (bio->bi_iter.bi_sector & (chunk_sects - 1))); /* Each of these 'make_request' calls will call 'wait_barrier'. diff --git a/fs/bio.c b/fs/bio.c index b0a16dbc71e..a3e753f4d5a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1827,7 +1827,7 @@ static void bio_pair_end_2(struct bio *bi, int err) /* * split a bio - only worry about a bio with a single page in its iovec */ -struct bio_pair *bio_split(struct bio *bi, int first_sectors) +struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors) { struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); @@ -1874,7 +1874,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) return bp; } -EXPORT_SYMBOL(bio_split); +EXPORT_SYMBOL(bio_pair_split); /** * bio_trim - trim a bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 64f5169c224..aa67af0b31a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -317,7 +317,7 @@ struct bio_pair { atomic_t cnt; int error; }; -extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); +extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); extern void bio_trim(struct bio *bio, int offset, int size); -- cgit v1.2.3-70-g09d2 From 20d0189b1012a37d2533a87fb451f7852f2418d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2013 18:21:01 -0800 Subject: block: Introduce new bio_split() The new bio_split() can split arbitrary bios - it's not restricted to single page bios, like the old bio_split() (previously renamed to bio_pair_split()). It also has different semantics - it doesn't allocate a struct bio_pair, leaving it up to the caller to handle completions. Then convert the existing bio_pair_split() users to the new bio_split() - and also nvme, which was open coding bio splitting. (We have to take that BUG_ON() out of bio_integrity_trim() because this bio_split() needs to use it, and there's no reason it has to be used on bios marked as cloned; BIO_CLONED doesn't seem to have clearly documented semantics anyways.) Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Martin K. 
Petersen Cc: Matthew Wilcox Cc: Keith Busch Cc: Vishal Verma Cc: Jiri Kosina Cc: Neil Brown --- drivers/block/nvme-core.c | 106 +++------------------------------- drivers/block/pktcdvd.c | 136 ++++++++++++++++++++++++-------------------- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/io.c | 82 +------------------------- drivers/md/bcache/request.c | 12 ++-- drivers/md/linear.c | 96 +++++++++++++++---------------- drivers/md/raid0.c | 77 +++++++++---------------- drivers/md/raid10.c | 113 +++++++++++++++--------------------- fs/bio.c | 36 ++++++++++++ include/linux/bio.h | 22 +++++++ 10 files changed, 272 insertions(+), 409 deletions(-) (limited to 'fs') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 5539d292087..1f14ac40394 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -441,104 +441,19 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, return total_len; } -struct nvme_bio_pair { - struct bio b1, b2, *parent; - struct bio_vec *bv1, *bv2; - int err; - atomic_t cnt; -}; - -static void nvme_bio_pair_endio(struct bio *bio, int err) -{ - struct nvme_bio_pair *bp = bio->bi_private; - - if (err) - bp->err = err; - - if (atomic_dec_and_test(&bp->cnt)) { - bio_endio(bp->parent, bp->err); - kfree(bp->bv1); - kfree(bp->bv2); - kfree(bp); - } -} - -static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx, - int len, int offset) -{ - struct nvme_bio_pair *bp; - - BUG_ON(len > bio->bi_iter.bi_size); - BUG_ON(idx > bio->bi_vcnt); - - bp = kmalloc(sizeof(*bp), GFP_ATOMIC); - if (!bp) - return NULL; - bp->err = 0; - - bp->b1 = *bio; - bp->b2 = *bio; - - bp->b1.bi_iter.bi_size = len; - bp->b2.bi_iter.bi_size -= len; - bp->b1.bi_vcnt = idx; - bp->b2.bi_iter.bi_idx = idx; - bp->b2.bi_iter.bi_sector += len >> 9; - - if (offset) { - bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), - GFP_ATOMIC); - if (!bp->bv1) - goto split_fail_1; - - bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), - GFP_ATOMIC); - if (!bp->bv2) - goto split_fail_2; - - memcpy(bp->bv1, bio->bi_io_vec, - bio->bi_max_vecs * sizeof(struct bio_vec)); - memcpy(bp->bv2, bio->bi_io_vec, - bio->bi_max_vecs * sizeof(struct bio_vec)); - - bp->b1.bi_io_vec = bp->bv1; - bp->b2.bi_io_vec = bp->bv2; - bp->b2.bi_io_vec[idx].bv_offset += offset; - bp->b2.bi_io_vec[idx].bv_len -= offset; - bp->b1.bi_io_vec[idx].bv_len = offset; - bp->b1.bi_vcnt++; - } else - bp->bv1 = bp->bv2 = NULL; - - bp->b1.bi_private = bp; - bp->b2.bi_private = bp; - - bp->b1.bi_end_io = nvme_bio_pair_endio; - bp->b2.bi_end_io = nvme_bio_pair_endio; - - bp->parent = bio; - atomic_set(&bp->cnt, 2); - - return bp; - - split_fail_2: - kfree(bp->bv1); - split_fail_1: - kfree(bp); - return NULL; -} - static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, - int idx, int len, int offset) + int len) { - struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset); - if (!bp) + struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); + if (!split) return -ENOMEM; + bio_chain(split, bio); + if (bio_list_empty(&nvmeq->sq_cong)) add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, &bp->b1); - bio_list_add(&nvmeq->sq_cong, &bp->b2); + bio_list_add(&nvmeq->sq_cong, split); + bio_list_add(&nvmeq->sq_cong, bio); return 0; } @@ -568,8 +483,7 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, } else { if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec)) return nvme_split_and_submit(bio, nvmeq, - 
iter.bi_idx, - length, 0); + length); sg = sg ? sg + 1 : iod->sg; sg_set_page(sg, bvec.bv_page, @@ -578,9 +492,7 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, } if (split_len - length < bvec.bv_len) - return nvme_split_and_submit(bio, nvmeq, iter.bi_idx, - split_len, - split_len - length); + return nvme_split_and_submit(bio, nvmeq, split_len); length += bvec.bv_len; bvprv = bvec; first = 0; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 28789b82ae7..3dda09a5ec4 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2338,75 +2338,29 @@ static void pkt_end_io_read_cloned(struct bio *bio, int err) pkt_bio_finished(pd); } -static void pkt_make_request(struct request_queue *q, struct bio *bio) +static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) { - struct pktcdvd_device *pd; - char b[BDEVNAME_SIZE]; + struct bio *cloned_bio = bio_clone(bio, GFP_NOIO); + struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO); + + psd->pd = pd; + psd->bio = bio; + cloned_bio->bi_bdev = pd->bdev; + cloned_bio->bi_private = psd; + cloned_bio->bi_end_io = pkt_end_io_read_cloned; + pd->stats.secs_r += bio_sectors(bio); + pkt_queue_bio(pd, cloned_bio); +} + +static void pkt_make_request_write(struct request_queue *q, struct bio *bio) +{ + struct pktcdvd_device *pd = q->queuedata; sector_t zone; struct packet_data *pkt; int was_empty, blocked_bio; struct pkt_rb_node *node; - pd = q->queuedata; - if (!pd) { - pr_err("%s incorrect request queue\n", - bdevname(bio->bi_bdev, b)); - goto end_io; - } - - /* - * Clone READ bios so we can have our own bi_end_io callback. - */ - if (bio_data_dir(bio) == READ) { - struct bio *cloned_bio = bio_clone(bio, GFP_NOIO); - struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO); - - psd->pd = pd; - psd->bio = bio; - cloned_bio->bi_bdev = pd->bdev; - cloned_bio->bi_private = psd; - cloned_bio->bi_end_io = pkt_end_io_read_cloned; - pd->stats.secs_r += bio_sectors(bio); - pkt_queue_bio(pd, cloned_bio); - return; - } - - if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - pkt_notice(pd, "WRITE for ro device (%llu)\n", - (unsigned long long)bio->bi_iter.bi_sector); - goto end_io; - } - - if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { - pkt_err(pd, "wrong bio size\n"); - goto end_io; - } - - blk_queue_bounce(q, &bio); - zone = get_zone(bio->bi_iter.bi_sector, pd); - pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", - (unsigned long long)bio->bi_iter.bi_sector, - (unsigned long long)bio_end_sector(bio)); - - /* Check if we have to split the bio */ - { - struct bio_pair *bp; - sector_t last_zone; - int first_sectors; - - last_zone = get_zone(bio_end_sector(bio) - 1, pd); - if (last_zone != zone) { - BUG_ON(last_zone != zone + pd->settings.size); - first_sectors = last_zone - bio->bi_iter.bi_sector; - bp = bio_pair_split(bio, first_sectors); - BUG_ON(!bp); - pkt_make_request(q, &bp->bio1); - pkt_make_request(q, &bp->bio2); - bio_pair_release(bp); - return; - } - } /* * If we find a matching packet in state WAITING or READ_WAIT, we can @@ -2480,6 +2434,64 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) */ wake_up(&pd->wqueue); } +} + +static void pkt_make_request(struct request_queue *q, struct bio *bio) +{ + struct pktcdvd_device *pd; + char b[BDEVNAME_SIZE]; + struct bio *split; + + pd = q->queuedata; + if (!pd) { + pr_err("%s incorrect request queue\n", + bdevname(bio->bi_bdev, b)); + goto end_io; + } + + pkt_dbg(2, pd, "start = %6llx 
stop = %6llx\n", + (unsigned long long)bio->bi_iter.bi_sector, + (unsigned long long)bio_end_sector(bio)); + + /* + * Clone READ bios so we can have our own bi_end_io callback. + */ + if (bio_data_dir(bio) == READ) { + pkt_make_request_read(pd, bio); + return; + } + + if (!test_bit(PACKET_WRITABLE, &pd->flags)) { + pkt_notice(pd, "WRITE for ro device (%llu)\n", + (unsigned long long)bio->bi_iter.bi_sector); + goto end_io; + } + + if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { + pkt_err(pd, "wrong bio size\n"); + goto end_io; + } + + blk_queue_bounce(q, &bio); + + do { + sector_t zone = get_zone(bio->bi_iter.bi_sector, pd); + sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd); + + if (last_zone != zone) { + BUG_ON(last_zone != zone + pd->settings.size); + + split = bio_split(bio, last_zone - + bio->bi_iter.bi_sector, + GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } + + pkt_make_request_write(q, split); + } while (split != bio); + return; end_io: bio_io_error(bio); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6b6fe935be7..964353c5329 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -901,7 +901,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); void bch_bbio_free(struct bio *, struct cache_set *); struct bio *bch_bbio_alloc(struct cache_set *); -struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); void bch_generic_make_request(struct bio *, struct bio_split_pool *); void __bch_submit_bbio(struct bio *, struct cache_set *); void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 522f9577844..fa028fa82df 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -11,84 +11,6 @@ #include -/** - * bch_bio_split - split a bio - * @bio: bio to split - * @sectors: number of sectors to split from the front of @bio - * @gfp: gfp mask - * @bs: bio set to allocate from - * - * Allocates and returns a new bio which represents @sectors from the start of - * @bio, and updates @bio to represent the remaining sectors. - * - * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio - * unchanged. - * - * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a - * bvec boundry; it is the caller's responsibility to ensure that @bio is not - * freed before the split. 
- */ -struct bio *bch_bio_split(struct bio *bio, int sectors, - gfp_t gfp, struct bio_set *bs) -{ - unsigned vcnt = 0, nbytes = sectors << 9; - struct bio_vec bv; - struct bvec_iter iter; - struct bio *ret = NULL; - - BUG_ON(sectors <= 0); - - if (sectors >= bio_sectors(bio)) - return bio; - - if (bio->bi_rw & REQ_DISCARD) { - ret = bio_alloc_bioset(gfp, 1, bs); - if (!ret) - return NULL; - goto out; - } - - bio_for_each_segment(bv, bio, iter) { - vcnt++; - - if (nbytes <= bv.bv_len) - break; - - nbytes -= bv.bv_len; - } - - ret = bio_alloc_bioset(gfp, vcnt, bs); - if (!ret) - return NULL; - - bio_for_each_segment(bv, bio, iter) { - ret->bi_io_vec[ret->bi_vcnt++] = bv; - - if (ret->bi_vcnt == vcnt) - break; - } - - ret->bi_io_vec[ret->bi_vcnt - 1].bv_len = nbytes; -out: - ret->bi_bdev = bio->bi_bdev; - ret->bi_iter.bi_sector = bio->bi_iter.bi_sector; - ret->bi_iter.bi_size = sectors << 9; - ret->bi_rw = bio->bi_rw; - - if (bio_integrity(bio)) { - if (bio_integrity_clone(ret, bio, gfp)) { - bio_put(ret); - return NULL; - } - - bio_integrity_trim(ret, 0, bio_sectors(ret)); - } - - bio_advance(bio, ret->bi_iter.bi_size); - - return ret; -} - static unsigned bch_bio_max_sectors(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -172,8 +94,8 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) bio_get(bio); do { - n = bch_bio_split(bio, bch_bio_max_sectors(bio), - GFP_NOIO, s->p->bio_split); + n = bio_next_split(bio, bch_bio_max_sectors(bio), + GFP_NOIO, s->p->bio_split); n->bi_end_io = bch_bio_submit_split_endio; n->bi_private = &s->cl; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 63451c72478..5878cdb3952 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -371,7 +371,7 @@ static void bch_data_insert_start(struct closure *cl) op->writeback)) goto err; - n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); + n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split); n->bi_end_io = bch_data_insert_endio; n->bi_private = cl; @@ -679,9 +679,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) if (KEY_DIRTY(k)) s->read_dirty_data = true; - n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_iter.bi_sector), - GFP_NOIO, s->d->bio_split); + n = bio_next_split(bio, min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_iter.bi_sector), + GFP_NOIO, s->d->bio_split); bio_key = &container_of(n, struct bbio, bio)->key; bch_bkey_copy_single_ptr(bio_key, k, ptr); @@ -920,7 +920,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *miss, *cache_bio; if (s->cache_miss || s->iop.bypass) { - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; } @@ -943,7 +943,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->iop.replace = true; - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ ret = miss == bio ? 
MAP_DONE : -EINTR; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index e9b53e9793b..56f534b4a2d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -288,65 +288,65 @@ static int linear_stop (struct mddev *mddev) static void linear_make_request(struct mddev *mddev, struct bio *bio) { + char b[BDEVNAME_SIZE]; struct dev_info *tmp_dev; - sector_t start_sector; + struct bio *split; + sector_t start_sector, end_sector, data_offset; if (unlikely(bio->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bio); return; } - rcu_read_lock(); - tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); - start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; - - - if (unlikely(bio->bi_iter.bi_sector >= (tmp_dev->end_sector) - || (bio->bi_iter.bi_sector < start_sector))) { - char b[BDEVNAME_SIZE]; - - printk(KERN_ERR - "md/linear:%s: make_request: Sector %llu out of bounds on " - "dev %s: %llu sectors, offset %llu\n", - mdname(mddev), - (unsigned long long)bio->bi_iter.bi_sector, - bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->rdev->sectors, - (unsigned long long)start_sector); - rcu_read_unlock(); - bio_io_error(bio); - return; - } - if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) { - /* This bio crosses a device boundary, so we have to - * split it. - */ - struct bio_pair *bp; - sector_t end_sector = tmp_dev->end_sector; + do { + rcu_read_lock(); - rcu_read_unlock(); - - bp = bio_pair_split(bio, end_sector - bio->bi_iter.bi_sector); + tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + end_sector = tmp_dev->end_sector; + data_offset = tmp_dev->rdev->data_offset; + bio->bi_bdev = tmp_dev->rdev->bdev; - linear_make_request(mddev, &bp->bio1); - linear_make_request(mddev, &bp->bio2); - bio_pair_release(bp); - return; - } - - bio->bi_bdev = tmp_dev->rdev->bdev; - bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector - + tmp_dev->rdev->data_offset; - rcu_read_unlock(); + rcu_read_unlock(); - if (unlikely((bio->bi_rw & REQ_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { - /* Just ignore it */ - bio_endio(bio, 0); - return; - } + if (unlikely(bio->bi_iter.bi_sector >= end_sector || + bio->bi_iter.bi_sector < start_sector)) + goto out_of_bounds; + + if (unlikely(bio_end_sector(bio) > end_sector)) { + /* This bio crosses a device boundary, so we have to + * split it. 
+ */ + split = bio_split(bio, end_sector - + bio->bi_iter.bi_sector, + GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } - generic_make_request(bio); + split->bi_iter.bi_sector = split->bi_iter.bi_sector - + start_sector + data_offset; + + if (unlikely((split->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { + /* Just ignore it */ + bio_endio(split, 0); + } else + generic_make_request(split); + } while (split != bio); + return; + +out_of_bounds: + printk(KERN_ERR + "md/linear:%s: make_request: Sector %llu out of bounds on " + "dev %s: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_iter.bi_sector, + bdevname(tmp_dev->rdev->bdev, b), + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); + bio_io_error(bio); } static void linear_status (struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ea754dd1a5f..407a99e46f6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -513,65 +513,44 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, static void raid0_make_request(struct mddev *mddev, struct bio *bio) { - unsigned int chunk_sects; - sector_t sector_offset; struct strip_zone *zone; struct md_rdev *tmp_dev; + struct bio *split; if (unlikely(bio->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bio); return; } - chunk_sects = mddev->chunk_sectors; - if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { + do { sector_t sector = bio->bi_iter.bi_sector; - struct bio_pair *bp; - /* Sanity check -- queue functions should prevent this happening */ - if (bio_multiple_segments(bio)) - goto bad_map; - /* This is a one page bio that upper layers - * refuse to split for us, so we need to split it. - */ - if (likely(is_power_of_2(chunk_sects))) - bp = bio_pair_split(bio, chunk_sects - (sector & - (chunk_sects-1))); - else - bp = bio_pair_split(bio, chunk_sects - - sector_div(sector, chunk_sects)); - raid0_make_request(mddev, &bp->bio1); - raid0_make_request(mddev, &bp->bio2); - bio_pair_release(bp); - return; - } - - sector_offset = bio->bi_iter.bi_sector; - zone = find_zone(mddev->private, §or_offset); - tmp_dev = map_sector(mddev, zone, bio->bi_iter.bi_sector, - §or_offset); - bio->bi_bdev = tmp_dev->bdev; - bio->bi_iter.bi_sector = sector_offset + zone->dev_start + - tmp_dev->data_offset; - - if (unlikely((bio->bi_rw & REQ_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { - /* Just ignore it */ - bio_endio(bio, 0); - return; - } - - generic_make_request(bio); - return; - -bad_map: - printk("md/raid0:%s: make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", - mdname(mddev), chunk_sects / 2, - (unsigned long long)bio->bi_iter.bi_sector, - bio_sectors(bio) / 2); + unsigned chunk_sects = mddev->chunk_sectors; + + unsigned sectors = chunk_sects - + (likely(is_power_of_2(chunk_sects)) + ? 
(sector & (chunk_sects-1)) + : sector_div(sector, chunk_sects)); + + if (sectors < bio_sectors(bio)) { + split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } - bio_io_error(bio); - return; + zone = find_zone(mddev->private, §or); + tmp_dev = map_sector(mddev, zone, sector, §or); + split->bi_bdev = tmp_dev->bdev; + split->bi_iter.bi_sector = sector + zone->dev_start + + tmp_dev->data_offset; + + if (unlikely((split->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { + /* Just ignore it */ + bio_endio(split, 0); + } else + generic_make_request(split); + } while (split != bio); } static void raid0_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 69c1bc8da88..6d43d88657a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1152,14 +1152,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void make_request(struct mddev *mddev, struct bio * bio) +static void __make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; struct bio *read_bio; int i; - sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); - int chunk_sects = chunk_mask + 1; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); @@ -1174,69 +1172,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) int max_sectors; int sectors; - if (unlikely(bio->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bio); - return; - } - - /* If this request crosses a chunk boundary, we need to - * split it. This will only happen for 1 PAGE (or less) requests. - */ - if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + bio_sectors(bio) - > chunk_sects - && (conf->geo.near_copies < conf->geo.raid_disks - || conf->prev.near_copies < conf->prev.raid_disks))) { - struct bio_pair *bp; - /* Sanity check -- queue functions should prevent this happening */ - if (bio_multiple_segments(bio)) - goto bad_map; - /* This is a one page bio that upper layers - * refuse to split for us, so we need to split it. - */ - bp = bio_pair_split(bio, chunk_sects - - (bio->bi_iter.bi_sector & (chunk_sects - 1))); - - /* Each of these 'make_request' calls will call 'wait_barrier'. - * If the first succeeds but the second blocks due to the resync - * thread raising the barrier, we will deadlock because the - * IO to the underlying device will be queued in generic_make_request - * and will never complete, so will never reduce nr_pending. - * So increment nr_waiting here so no new raise_barriers will - * succeed, and so the second wait_barrier cannot block. - */ - spin_lock_irq(&conf->resync_lock); - conf->nr_waiting++; - spin_unlock_irq(&conf->resync_lock); - - make_request(mddev, &bp->bio1); - make_request(mddev, &bp->bio2); - - spin_lock_irq(&conf->resync_lock); - conf->nr_waiting--; - wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); - - bio_pair_release(bp); - return; - bad_map: - printk("md/raid10:%s: make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, - (unsigned long long)bio->bi_iter.bi_sector, - bio_sectors(bio) / 2); - - bio_io_error(bio); - return; - } - - md_write_start(mddev, bio); - - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. 
- * Continue immediately if no resync is active currently. - */ - wait_barrier(conf); - sectors = bio_sectors(bio); while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && @@ -1600,6 +1535,52 @@ retry_write: goto retry_write; } one_write_done(r10_bio); +} + +static void make_request(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); + int chunk_sects = chunk_mask + 1; + + struct bio *split; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; + } + + md_write_start(mddev, bio); + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + wait_barrier(conf); + + do { + + /* + * If this request crosses a chunk boundary, we need to split + * it. + */ + if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + + bio_sectors(bio) > chunk_sects + && (conf->geo.near_copies < conf->geo.raid_disks + || conf->prev.near_copies < + conf->prev.raid_disks))) { + split = bio_split(bio, chunk_sects - + (bio->bi_iter.bi_sector & + (chunk_sects - 1)), + GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } + + __make_request(mddev, split); + } while (split != bio); /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); diff --git a/fs/bio.c b/fs/bio.c index a3e753f4d5a..7b062befac8 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1793,6 +1793,42 @@ void bio_endio_nodec(struct bio *bio, int error) } EXPORT_SYMBOL(bio_endio_nodec); +/** + * bio_split - split a bio + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Allocates and returns a new bio which represents @sectors from the start of + * @bio, and updates @bio to represent the remaining sectors. + * + * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's + * responsibility to ensure that @bio is not freed before the split. + */ +struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) +{ + struct bio *split = NULL; + + BUG_ON(sectors <= 0); + BUG_ON(sectors >= bio_sectors(bio)); + + split = bio_clone_fast(bio, gfp, bs); + if (!split) + return NULL; + + split->bi_iter.bi_size = sectors << 9; + + if (bio_integrity(split)) + bio_integrity_trim(split, 0, sectors); + + bio_advance(bio, split->bi_iter.bi_size); + + return split; +} +EXPORT_SYMBOL(bio_split); + void bio_pair_release(struct bio_pair *bp) { if (atomic_dec_and_test(&bp->cnt)) { diff --git a/include/linux/bio.h b/include/linux/bio.h index aa67af0b31a..19e31b2f5b2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -321,6 +321,28 @@ extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); extern void bio_trim(struct bio *bio, int offset, int size); +extern struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs); + +/** + * bio_next_split - get next @sectors from a bio, splitting if necessary + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Returns a bio representing the next @sectors of @bio - if the bio is smaller + * than @sectors, returns the original bio unchanged. 
+ */ +static inline struct bio *bio_next_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) +{ + if (sectors >= bio_sectors(bio)) + return bio; + + return bio_split(bio, sectors, gfp, bs); +} + extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); -- cgit v1.2.3-70-g09d2 From 4b1faf931650d4a35b2a570318862821d6a962e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2013 14:33:00 -0700 Subject: block: Kill bio_pair_split() Signed-off-by: Kent Overstreet Cc: Jens Axboe --- fs/bio-integrity.c | 45 --------------------------- fs/bio.c | 90 ----------------------------------------------------- include/linux/bio.h | 30 ------------------ 3 files changed, 165 deletions(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 9d547d2e357..80d972d739e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -580,51 +580,6 @@ void bio_integrity_trim(struct bio *bio, unsigned int offset, } EXPORT_SYMBOL(bio_integrity_trim); -/** - * bio_integrity_split - Split integrity metadata - * @bio: Protected bio - * @bp: Resulting bio_pair - * @sectors: Offset - * - * Description: Splits an integrity page into a bio_pair. - */ -void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) -{ - struct blk_integrity *bi; - struct bio_integrity_payload *bip = bio->bi_integrity; - unsigned int nr_sectors; - - if (bio_integrity(bio) == 0) - return; - - bi = bdev_get_integrity(bio->bi_bdev); - BUG_ON(bi == NULL); - BUG_ON(bip->bip_vcnt != 1); - - nr_sectors = bio_integrity_hw_sectors(bi, sectors); - - bp->bio1.bi_integrity = &bp->bip1; - bp->bio2.bi_integrity = &bp->bip2; - - bp->iv1 = bip->bip_vec[bip->bip_iter.bi_idx]; - bp->iv2 = bip->bip_vec[bip->bip_iter.bi_idx]; - - bp->bip1.bip_vec = &bp->iv1; - bp->bip2.bip_vec = &bp->iv2; - - bp->iv1.bv_len = sectors * bi->tuple_size; - bp->iv2.bv_offset += sectors * bi->tuple_size; - bp->iv2.bv_len -= sectors * bi->tuple_size; - - bp->bip1.bip_iter.bi_sector = bio->bi_integrity->bip_iter.bi_sector; - bp->bip2.bip_iter.bi_sector = - bio->bi_integrity->bip_iter.bi_sector + nr_sectors; - - bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; - bp->bip1.bip_iter.bi_idx = bp->bip2.bip_iter.bi_idx = 0; -} -EXPORT_SYMBOL(bio_integrity_split); - /** * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio diff --git a/fs/bio.c b/fs/bio.c index 7b062befac8..75c49a38223 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -38,8 +38,6 @@ */ #define BIO_INLINE_VECS 4 -static mempool_t *bio_split_pool __read_mostly; - /* * if you change this list, also change bvec_alloc or things will * break badly! 
cannot be bigger than what you can fit into an @@ -1829,89 +1827,6 @@ struct bio *bio_split(struct bio *bio, int sectors, } EXPORT_SYMBOL(bio_split); -void bio_pair_release(struct bio_pair *bp) -{ - if (atomic_dec_and_test(&bp->cnt)) { - struct bio *master = bp->bio1.bi_private; - - bio_endio(master, bp->error); - mempool_free(bp, bp->bio2.bi_private); - } -} -EXPORT_SYMBOL(bio_pair_release); - -static void bio_pair_end_1(struct bio *bi, int err) -{ - struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); - - if (err) - bp->error = err; - - bio_pair_release(bp); -} - -static void bio_pair_end_2(struct bio *bi, int err) -{ - struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); - - if (err) - bp->error = err; - - bio_pair_release(bp); -} - -/* - * split a bio - only worry about a bio with a single page in its iovec - */ -struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors) -{ - struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); - - if (!bp) - return bp; - - trace_block_split(bdev_get_queue(bi->bi_bdev), bi, - bi->bi_iter.bi_sector + first_sectors); - - BUG_ON(bio_multiple_segments(bi)); - atomic_set(&bp->cnt, 3); - bp->error = 0; - bp->bio1 = *bi; - bp->bio2 = *bi; - bp->bio2.bi_iter.bi_sector += first_sectors; - bp->bio2.bi_iter.bi_size -= first_sectors << 9; - bp->bio1.bi_iter.bi_size = first_sectors << 9; - - if (bi->bi_vcnt != 0) { - bp->bv1 = bio_iovec(bi); - bp->bv2 = bio_iovec(bi); - - if (bio_is_rw(bi)) { - bp->bv2.bv_offset += first_sectors << 9; - bp->bv2.bv_len -= first_sectors << 9; - bp->bv1.bv_len = first_sectors << 9; - } - - bp->bio1.bi_io_vec = &bp->bv1; - bp->bio2.bi_io_vec = &bp->bv2; - - bp->bio1.bi_max_vecs = 1; - bp->bio2.bi_max_vecs = 1; - } - - bp->bio1.bi_end_io = bio_pair_end_1; - bp->bio2.bi_end_io = bio_pair_end_2; - - bp->bio1.bi_private = bi; - bp->bio2.bi_private = bio_split_pool; - - if (bio_integrity(bi)) - bio_integrity_split(bi, bp, first_sectors); - - return bp; -} -EXPORT_SYMBOL(bio_pair_split); - /** * bio_trim - trim a bio * @bio: bio to trim @@ -2113,11 +2028,6 @@ static int __init init_bio(void) if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) panic("bio: can't create integrity pool\n"); - bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, - sizeof(struct bio_pair)); - if (!bio_split_pool) - panic("bio: can't create split pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index 19e31b2f5b2..70654521dab 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -297,30 +297,7 @@ struct bio_integrity_payload { }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ -/* - * A bio_pair is used when we need to split a bio. 
- * This can only happen for a bio that refers to just one - * page of data, and in the unusual situation when the - * page crosses a chunk/device boundary - * - * The address of the master bio is stored in bio1.bi_private - * The address of the pool the pair was allocated from is stored - * in bio2.bi_private - */ -struct bio_pair { - struct bio bio1, bio2; - struct bio_vec bv1, bv2; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - struct bio_integrity_payload bip1, bip2; - struct bio_vec iv1, iv2; -#endif - atomic_t cnt; - int error; -}; -extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors); -extern void bio_pair_release(struct bio_pair *dbio); extern void bio_trim(struct bio *bio, int offset, int size); - extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); @@ -677,7 +654,6 @@ extern int bio_integrity_prep(struct bio *); extern void bio_integrity_endio(struct bio *, int); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); -extern void bio_integrity_split(struct bio *, struct bio_pair *, int); extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); extern int bioset_integrity_create(struct bio_set *, int); extern void bioset_integrity_free(struct bio_set *); @@ -721,12 +697,6 @@ static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } -static inline void bio_integrity_split(struct bio *bio, struct bio_pair *bp, - int sectors) -{ - return; -} - static inline void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { -- cgit v1.2.3-70-g09d2 From c84a3b27798dfce928b867fa1c9f3c3fd66f0a31 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 18:01:46 -0500 Subject: sysfs: drop kobj_ns_type handling, take #2 The way namespace tags are implemented in sysfs is more complicated than necessary. As each tag is a pointer value and required to be non-NULL under a namespace enabled parent, there's no need to record separately what type each tag is. If multiple namespace types are needed, which currently aren't, we can simply compare the tag to a set of allowed tags in the superblock assuming that the tags, being pointers, won't have the same value across multiple types. This patch rips out kobj_ns_type handling from sysfs. sysfs now has an enable switch to turn on namespace under a node. If enabled, all children are required to have non-NULL namespace tags and filtered against the super_block's tag. kobject namespace determination is now performed in lib/kobject.c::create_dir() making sysfs_read_ns_type() unnecessary. The sanity checks are also moved. create_dir() is restructured to ease such addition. This removes most kobject namespace knowledge from sysfs proper which will enable proper separation and layering of sysfs. This is the second try. The first one was cb26a311578e ("sysfs: drop kobj_ns_type handling") which tried to automatically enable namespace if there are children with non-NULL namespace tags; however, it was broken for symlinks as they should inherit the target's tag iff namespace is enabled in the parent. This led to namespace filtering enabled incorrectly for wireless net class devices through phy80211 symlinks and thus network configuration failure. a1212d278c05 ("Revert "sysfs: drop kobj_ns_type handling"") reverted the commit. This shouldn't introduce any behavior changes, for real. 
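To make the corrected rule concrete, here is a minimal sketch; it merely restates the symlink.c hunk further down in this patch and adds no new logic. A symlink inherits the target's namespace tag only when its parent directory has namespace filtering enabled:

    /* sketch only -- mirrors the symlink.c change below */
    if (parent_sd->s_flags & SYSFS_FLAG_NS)
            sd->s_ns = target_sd->s_ns;
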
v2: Dummy implementation of sysfs_enable_ns() for !CONFIG_SYSFS was missing and caused build failure. Reported by kbuild test robot. Signed-off-by: Tejun Heo Reported-by: Linus Torvalds Cc: Eric W. Biederman Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: kbuild test robot Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 92 ++++++++++++++++++++------------------------------- fs/sysfs/mount.c | 24 ++++---------- fs/sysfs/symlink.c | 26 ++++----------- fs/sysfs/sysfs.h | 25 ++++---------- include/linux/sysfs.h | 6 ++++ lib/kobject.c | 27 ++++++++++++--- 6 files changed, 83 insertions(+), 117 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5e73d6626e5..b3cf61dc57c 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -279,7 +279,6 @@ static int sysfs_dentry_delete(const struct dentry *dentry) static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct sysfs_dirent *sd; - int type; if (flags & LOOKUP_RCU) return -ECHILD; @@ -300,13 +299,9 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; /* The sysfs dirent has been moved to a different namespace */ - type = KOBJ_NS_TYPE_NONE; - if (sd->s_parent) { - type = sysfs_ns_type(sd->s_parent); - if (type != KOBJ_NS_TYPE_NONE && - sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns) - goto out_bad; - } + if (sd->s_parent && (sd->s_parent->s_flags & SYSFS_FLAG_NS) && + sysfs_info(dentry->d_sb)->ns != sd->s_ns) + goto out_bad; mutex_unlock(&sysfs_mutex); out_valid: @@ -423,13 +418,14 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd) { + bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS; struct sysfs_inode_attrs *ps_iattr; int ret; - if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) { + if (has_ns != (bool)sd->s_ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, sd->s_name); + has_ns ? "required" : "invalid", + parent_sd->s_name, sd->s_name); return -EINVAL; } @@ -610,12 +606,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, const void *ns) { struct rb_node *node = parent_sd->s_dir.children.rb_node; + bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS; unsigned int hash; - if (!!sysfs_ns_type(parent_sd) != !!ns) { + if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, name); + has_ns ? 
"required" : "invalid", + parent_sd->s_name, name); return NULL; } @@ -667,7 +664,6 @@ struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - enum kobj_ns_type type, const char *name, const void *ns, struct sysfs_dirent **p_sd) { @@ -681,7 +677,6 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, if (!sd) return -ENOMEM; - sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); sd->s_ns = ns; sd->s_dir.kobj = kobj; @@ -701,33 +696,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd) { - return create_dir(kobj, kobj->sd, - KOBJ_NS_TYPE_NONE, name, NULL, p_sd); -} - -/** - * sysfs_read_ns_type: return associated ns_type - * @kobj: the kobject being queried - * - * Each kobject can be tagged with exactly one namespace type - * (i.e. network or user). Return the ns_type associated with - * this object if any - */ -static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) -{ - const struct kobj_ns_type_operations *ops; - enum kobj_ns_type type; - - ops = kobj_child_ns_ops(kobj); - if (!ops) - return KOBJ_NS_TYPE_NONE; - - type = ops->type; - BUG_ON(type <= KOBJ_NS_TYPE_NONE); - BUG_ON(type >= KOBJ_NS_TYPES); - BUG_ON(!kobj_ns_type_registered(type)); - - return type; + return create_dir(kobj, kobj->sd, name, NULL, p_sd); } /** @@ -737,7 +706,6 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { - enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; int error = 0; @@ -751,9 +719,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) if (!parent_sd) return -ENOENT; - type = sysfs_read_ns_type(kobj); - - error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); + error = create_dir(kobj, parent_sd, kobject_name(kobj), ns, &sd); if (!error) kobj->sd = sd; return error; @@ -767,13 +733,12 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, struct sysfs_dirent *parent_sd = parent->d_fsdata; struct sysfs_dirent *sd; struct inode *inode; - enum kobj_ns_type type; - const void *ns; + const void *ns = NULL; mutex_lock(&sysfs_mutex); - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dir->i_sb)->ns[type]; + if (parent_sd->s_flags & SYSFS_FLAG_NS) + ns = sysfs_info(dir->i_sb)->ns; sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns); @@ -1029,6 +994,21 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); } +/** + * sysfs_enable_ns - enable namespace under a directory + * @sd: directory of interest, should be empty + * + * This is to be called right after @sd is created to enable namespace + * under it. All children of @sd must have non-NULL namespace tags and + * only the ones which match the super_block's tag will be visible. 
+ */ +void sysfs_enable_ns(struct sysfs_dirent *sd) +{ + WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children)); + sd->s_flags |= SYSFS_FLAG_NS; +} + /* Relationship between s_mode and the DT_xxx types */ static inline unsigned char dt_type(struct sysfs_dirent *sd) { @@ -1096,15 +1076,15 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file->f_path.dentry; struct sysfs_dirent *parent_sd = dentry->d_fsdata; struct sysfs_dirent *pos = file->private_data; - enum kobj_ns_type type; - const void *ns; - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dentry->d_sb)->ns[type]; + const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; mutex_lock(&sysfs_mutex); + + if (parent_sd->s_flags & SYSFS_FLAG_NS) + ns = sysfs_info(dentry->d_sb)->ns; + for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); pos; pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 834ec2cdb7a..8c24bce2f4a 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -36,7 +36,7 @@ static const struct super_operations sysfs_ops = { struct sysfs_dirent sysfs_root = { .s_name = "", .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), + .s_flags = SYSFS_DIR, .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, .s_ino = 1, }; @@ -77,14 +77,8 @@ static int sysfs_test_super(struct super_block *sb, void *data) { struct sysfs_super_info *sb_info = sysfs_info(sb); struct sysfs_super_info *info = data; - enum kobj_ns_type type; - int found = 1; - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (sb_info->ns[type] != info->ns[type]) - found = 0; - } - return found; + return sb_info->ns == info->ns; } static int sysfs_set_super(struct super_block *sb, void *data) @@ -98,9 +92,7 @@ static int sysfs_set_super(struct super_block *sb, void *data) static void free_sysfs_super_info(struct sysfs_super_info *info) { - int type; - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - kobj_ns_drop(type, info->ns[type]); + kobj_ns_drop(KOBJ_NS_TYPE_NET, info->ns); kfree(info); } @@ -108,7 +100,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct sysfs_super_info *info; - enum kobj_ns_type type; struct super_block *sb; int error; @@ -116,18 +107,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (!kobj_ns_current_may_mount(type)) - return ERR_PTR(-EPERM); - } + if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) + return ERR_PTR(-EPERM); } info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_grab_current(type); + info->ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); if (IS_ERR(sb) || sb->s_fs_info != info) diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3ae3f1bf1a0..c660363fdae 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -28,7 +28,6 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; - enum kobj_ns_type ns_type; int error; BUG_ON(!name || !parent_sd); @@ -52,29 +51,16 @@ static 
int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, if (!sd) goto out_put; - ns_type = sysfs_ns_type(parent_sd); - if (ns_type) + if (parent_sd->s_flags & SYSFS_FLAG_NS) sd->s_ns = target_sd->s_ns; sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt); - /* Symlinks must be between directories with the same ns_type */ - if (!ns_type || - (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { - if (warn) - error = sysfs_add_one(&acxt, sd, parent_sd); - else - error = __sysfs_add_one(&acxt, sd, parent_sd); - } else { - error = -EINVAL; - WARN(1, KERN_WARNING - "sysfs: symlink across ns_types %s/%s -> %s/%s\n", - parent_sd->s_name, - sd->s_name, - sd->s_symlink.target_sd->s_parent->s_name, - sd->s_symlink.target_sd->s_name); - } + if (warn) + error = sysfs_add_one(&acxt, sd, parent_sd); + else + error = __sysfs_add_one(&acxt, sd, parent_sd); sysfs_addrm_finish(&acxt); if (error) @@ -164,7 +150,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (targ->sd && sysfs_ns_type(kobj->sd)) + if (targ->sd && (kobj->sd->s_flags & SYSFS_FLAG_NS)) ns = targ->sd->s_ns; spin_unlock(&sysfs_symlink_target_lock); sysfs_hash_and_remove(kobj->sd, name, ns); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 0af09fbfb3f..e116c21a27b 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -90,11 +90,8 @@ struct sysfs_dirent { #define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) #define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) -/* identify any namespace tag on sysfs_dirents */ -#define SYSFS_NS_TYPE_MASK 0xf00 -#define SYSFS_NS_TYPE_SHIFT 8 - -#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) +#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK +#define SYSFS_FLAG_NS 0x01000 #define SYSFS_FLAG_REMOVED 0x02000 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) @@ -102,15 +99,6 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) return sd->s_flags & SYSFS_TYPE_MASK; } -/* - * Return any namespace tags on this dirent. - * enum kobj_ns_type is defined in linux/kobject.h - */ -static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) -{ - return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; -} - #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_dirent_init_lockdep(sd) \ @@ -155,12 +143,13 @@ struct sysfs_addrm_cxt { */ /* - * Each sb is associated with a set of namespace tags (i.e. - * the network namespace of the task which mounted this sysfs - * instance). + * Each sb is associated with one namespace tag, currently the network + * namespace of the task which mounted this sysfs instance. If multiple + * tags become necessary, make the following an array and compare + * sysfs_dirent tag against every entry. 
*/ struct sysfs_super_info { - void *ns[KOBJ_NS_TYPES]; + void *ns; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 6695040a031..362a34d27e6 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -220,6 +220,8 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); +void sysfs_enable_ns(struct sysfs_dirent *sd); + int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, @@ -353,6 +355,10 @@ static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, { } +static inline void sysfs_enable_ns(struct sysfs_dirent *sd) +{ +} + static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { diff --git a/lib/kobject.c b/lib/kobject.c index 5b4b8886435..16e9335b32d 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -65,13 +65,17 @@ static int populate_dir(struct kobject *kobj) static int create_dir(struct kobject *kobj) { + const struct kobj_ns_type_operations *ops; int error; error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj)); - if (!error) { - error = populate_dir(kobj); - if (error) - sysfs_remove_dir(kobj); + if (error) + return error; + + error = populate_dir(kobj); + if (error) { + sysfs_remove_dir(kobj); + return error; } /* @@ -80,7 +84,20 @@ static int create_dir(struct kobject *kobj) */ sysfs_get(kobj->sd); - return error; + /* + * If @kobj has ns_ops, its children need to be filtered based on + * their namespace tags. Enable namespace support on @kobj->sd. + */ + ops = kobj_child_ns_ops(kobj); + if (ops) { + BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE); + BUG_ON(ops->type >= KOBJ_NS_TYPES); + BUG_ON(!kobj_ns_type_registered(ops->type)); + + sysfs_enable_ns(kobj->sd); + } + + return 0; } static int get_kobj_path_length(struct kobject *kobj) -- cgit v1.2.3-70-g09d2 From ae2108ad32f5ca55e9895d5597e6552be1607569 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 17:21:47 -0500 Subject: sysfs: make __sysfs_add_one() fail if the parent isn't a directory Currently the kobject based interface guarantees that a parent sysfs_dirent is always a directory; however, the planned kernfs interface will be directly based on sysfs_dirents and the caller may specify non-directory node as the parent. Add an explicit check in __sysfs_add_one() so that such attempts fail with -EINVAL. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index b3cf61dc57c..98701c00b9b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -429,6 +429,9 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, return -EINVAL; } + if (sysfs_type(parent_sd) != SYSFS_DIR) + return -EINVAL; + sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); sd->s_parent = sysfs_get(parent_sd); -- cgit v1.2.3-70-g09d2 From b8441ed279bff09a0a5ddeacf8f4087d2fb424ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 24 Nov 2013 09:54:58 -0500 Subject: sysfs, kernfs: add skeletons for kernfs Core sysfs implementation will be separated into kernfs so that it can be used by other non-kobject users. This patch creates fs/kernfs/ directory and makes boilerplate changes. 
kernfs interface will be directly based on sysfs_dirent and its forward declaration is moved to include/linux/kernfs.h which is included from include/linux/sysfs.h. sysfs core implementation will be gradually separated out and moved to kernfs. This patch doesn't introduce any functional changes. v2: mount.c added. Signed-off-by: Tejun Heo Cc: linux-fsdevel@vger.kernel.org Cc: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/Makefile | 2 +- fs/kernfs/Makefile | 5 +++++ fs/kernfs/dir.c | 9 +++++++++ fs/kernfs/file.c | 9 +++++++++ fs/kernfs/inode.c | 9 +++++++++ fs/kernfs/mount.c | 9 +++++++++ fs/kernfs/symlink.c | 9 +++++++++ include/linux/kernfs.h | 12 ++++++++++++ include/linux/sysfs.h | 3 +-- 9 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 fs/kernfs/Makefile create mode 100644 fs/kernfs/dir.c create mode 100644 fs/kernfs/file.c create mode 100644 fs/kernfs/inode.c create mode 100644 fs/kernfs/mount.c create mode 100644 fs/kernfs/symlink.c create mode 100644 include/linux/kernfs.h (limited to 'fs') diff --git a/fs/Makefile b/fs/Makefile index 4fe6df3ec28..39a824f44e7 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_SYSFS) += sysfs/ kernfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile new file mode 100644 index 00000000000..674337c7667 --- /dev/null +++ b/fs/kernfs/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the kernfs pseudo filesystem +# + +obj-y := mount.o inode.o dir.o file.o symlink.o diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c new file mode 100644 index 00000000000..1061602ce81 --- /dev/null +++ b/fs/kernfs/dir.c @@ -0,0 +1,9 @@ +/* + * fs/kernfs/dir.c - kernfs directory implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c new file mode 100644 index 00000000000..90b1e88dad4 --- /dev/null +++ b/fs/kernfs/file.c @@ -0,0 +1,9 @@ +/* + * fs/kernfs/file.c - kernfs file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c new file mode 100644 index 00000000000..86bfeea07de --- /dev/null +++ b/fs/kernfs/inode.c @@ -0,0 +1,9 @@ +/* + * fs/kernfs/inode.c - kernfs inode implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c new file mode 100644 index 00000000000..872e262e516 --- /dev/null +++ b/fs/kernfs/mount.c @@ -0,0 +1,9 @@ +/* + * fs/kernfs/mount.c - kernfs mount implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. 
+ */ diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c new file mode 100644 index 00000000000..2578715baf0 --- /dev/null +++ b/fs/kernfs/symlink.c @@ -0,0 +1,9 @@ +/* + * fs/kernfs/symlink.c - kernfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h new file mode 100644 index 00000000000..254b9e872b0 --- /dev/null +++ b/include/linux/kernfs.h @@ -0,0 +1,12 @@ +/* + * kernfs.h - pseudo filesystem decoupled from vfs locking + * + * This file is released under the GPLv2. + */ + +#ifndef __LINUX_KERNFS_H +#define __LINUX_KERNFS_H + +struct sysfs_dirent; + +#endif /* __LINUX_KERNFS_H */ diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 362a34d27e6..e17381a92e1 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -12,6 +12,7 @@ #ifndef _SYSFS_H_ #define _SYSFS_H_ +#include #include #include #include @@ -175,8 +176,6 @@ struct sysfs_ops { ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; -struct sysfs_dirent; - #ifdef CONFIG_SYSFS int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), -- cgit v1.2.3-70-g09d2 From 879f40d193bb3c6c13930e88e3e9d5d7baf84d19 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 17:21:49 -0500 Subject: sysfs, kernfs: introduce kernfs_remove[_by_name[_ns]]() Introduce kernfs removal interfaces - kernfs_remove() and kernfs_remove_by_name[_ns](). These are just renames of sysfs_remove() and sysfs_hash_and_remove(). No functional changes. v2: Dummy kernfs_remove_by_name_ns() for !CONFIG_SYSFS updated to return -ENOSYS instead of 0. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 20 ++++++++++---------- fs/sysfs/file.c | 6 +++--- fs/sysfs/group.c | 15 +++++++-------- fs/sysfs/symlink.c | 4 ++-- fs/sysfs/sysfs.h | 3 --- include/linux/kernfs.h | 24 ++++++++++++++++++++++++ 6 files changed, 46 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 98701c00b9b..edbde4e6e5e 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -829,8 +829,8 @@ static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, return pos->s_parent; } -static void __sysfs_remove(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) +static void __kernfs_remove(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *sd) { struct sysfs_dirent *pos, *next; @@ -849,22 +849,22 @@ static void __sysfs_remove(struct sysfs_addrm_cxt *acxt, } /** - * sysfs_remove - remove a sysfs_dirent recursively + * kernfs_remove - remove a sysfs_dirent recursively * @sd: the sysfs_dirent to remove * * Remove @sd along with all its subdirectories and files. */ -void sysfs_remove(struct sysfs_dirent *sd) +void kernfs_remove(struct sysfs_dirent *sd) { struct sysfs_addrm_cxt acxt; sysfs_addrm_start(&acxt); - __sysfs_remove(&acxt, sd); + __kernfs_remove(&acxt, sd); sysfs_addrm_finish(&acxt); } /** - * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it + * kernfs_remove_by_name_ns - find a sysfs_dirent by name and remove it * @dir_sd: parent of the target * @name: name of the sysfs_dirent to remove * @ns: namespace tag of the sysfs_dirent to remove @@ -872,8 +872,8 @@ void sysfs_remove(struct sysfs_dirent *sd) * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove * it. 
Returns 0 on success, -ENOENT if such entry doesn't exist. */ -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, - const void *ns) +int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name, + const void *ns) { struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; @@ -888,7 +888,7 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, sd = sysfs_find_dirent(dir_sd, name, ns); if (sd) - __sysfs_remove(&acxt, sd); + __kernfs_remove(&acxt, sd); sysfs_addrm_finish(&acxt); @@ -928,7 +928,7 @@ void sysfs_remove_dir(struct kobject *kobj) if (sd) { WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); - sysfs_remove(sd); + kernfs_remove(sd); } } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 79b5da2acbe..5664410136b 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -952,7 +952,7 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, { struct sysfs_dirent *dir_sd = kobj->sd; - sysfs_hash_and_remove(dir_sd, attr->name, ns); + kernfs_remove_by_name_ns(dir_sd, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); @@ -980,7 +980,7 @@ void sysfs_remove_file_from_group(struct kobject *kobj, else dir_sd = sysfs_get(kobj->sd); if (dir_sd) { - sysfs_hash_and_remove(dir_sd, attr->name, NULL); + kernfs_remove_by_name(dir_sd, attr->name); sysfs_put(dir_sd); } } @@ -1008,7 +1008,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { - sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); + kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 1898a10e38c..4bd99734083 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -26,7 +26,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (grp->attrs) for (attr = grp->attrs; *attr; attr++) - sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); + kernfs_remove_by_name(dir_sd, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) sysfs_remove_bin_file(kobj, *bin_attr); @@ -49,8 +49,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * re-adding (if required) the file. 
*/ if (update) - sysfs_hash_and_remove(dir_sd, (*attr)->name, - NULL); + kernfs_remove_by_name(dir_sd, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) @@ -111,7 +110,7 @@ static int internal_create_group(struct kobject *kobj, int update, error = create_files(sd, kobj, grp, update); if (error) { if (grp->name) - sysfs_remove(sd); + kernfs_remove(sd); } sysfs_put(sd); return error; @@ -219,7 +218,7 @@ void sysfs_remove_group(struct kobject *kobj, remove_files(sd, kobj, grp); if (grp->name) - sysfs_remove(sd); + kernfs_remove(sd); sysfs_put(sd); } @@ -270,7 +269,7 @@ int sysfs_merge_group(struct kobject *kobj, error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); if (error) { while (--i >= 0) - sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); + kernfs_remove_by_name(dir_sd, (*--attr)->name); } sysfs_put(dir_sd); @@ -292,7 +291,7 @@ void sysfs_unmerge_group(struct kobject *kobj, dir_sd = sysfs_get_dirent(kobj->sd, grp->name); if (dir_sd) { for (attr = grp->attrs; *attr; ++attr) - sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); + kernfs_remove_by_name(dir_sd, (*attr)->name); sysfs_put(dir_sd); } } @@ -335,7 +334,7 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, dir_sd = sysfs_get_dirent(kobj->sd, group_name); if (dir_sd) { - sysfs_hash_and_remove(dir_sd, link_name, NULL); + kernfs_remove_by_name(dir_sd, link_name); sysfs_put(dir_sd); } } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c660363fdae..71583fc8100 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -153,7 +153,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, if (targ->sd && (kobj->sd->s_flags & SYSFS_FLAG_NS)) ns = targ->sd->s_ns; spin_unlock(&sysfs_symlink_target_lock); - sysfs_hash_and_remove(kobj->sd, name, ns); + kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** @@ -170,7 +170,7 @@ void sysfs_remove_link(struct kobject *kobj, const char *name) else parent_sd = kobj->sd; - sysfs_hash_and_remove(parent_sd, name, NULL); + kernfs_remove_by_name(parent_sd, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index e116c21a27b..97625b15ca0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -173,9 +173,6 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd); int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd); -void sysfs_remove(struct sysfs_dirent *sd); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, - const void *ns); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 254b9e872b0..83e151ad061 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -7,6 +7,30 @@ #ifndef __LINUX_KERNFS_H #define __LINUX_KERNFS_H +#include + struct sysfs_dirent; +#ifdef CONFIG_SYSFS + +void kernfs_remove(struct sysfs_dirent *sd); +int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, const char *name, + const void *ns); + +#else /* CONFIG_SYSFS */ + +static inline void kernfs_remove(struct sysfs_dirent *sd) { } + +static inline int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, + const char *name, const void *ns) +{ return -ENOSYS; } + +#endif /* CONFIG_SYSFS */ + +static inline int kernfs_remove_by_name(struct sysfs_dirent *parent, + const char *name) +{ + return 
kernfs_remove_by_name_ns(parent, name, NULL); +} + #endif /* __LINUX_KERNFS_H */ -- cgit v1.2.3-70-g09d2 From 5d0e26bb59a680a5d97db5b6629941603e8de229 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 17:21:50 -0500 Subject: sysfs, kernfs: introduce kernfs_create_link() Separate out kernfs symlink interface - kernfs_create_link() - which takes and returns sysfs_dirents, from sysfs_do_create_link_sd(). sysfs_do_create_link_sd() now just determines the parent and target sysfs_dirents and invokes the new interface and handles dup warning. This patch doesn't introduce behavior changes. v2: Dummy implementation for !CONFIG_SYSFS updated to return -ENOSYS. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/symlink.c | 76 ++++++++++++++++++++++++++++++-------------------- include/linux/kernfs.h | 9 ++++++ 2 files changed, 55 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 71583fc8100..41138e91947 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -21,14 +21,48 @@ #include "sysfs.h" +/** + * kernfs_create_link - create a symlink + * @parent: directory to create the symlink in + * @name: name of the symlink + * @target: target node for the symlink to point to + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, + const char *name, + struct sysfs_dirent *target) +{ + struct sysfs_dirent *sd; + struct sysfs_addrm_cxt acxt; + int error; + + sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + if (!sd) + return ERR_PTR(-ENOMEM); + + if (parent->s_flags & SYSFS_FLAG_NS) + sd->s_ns = target->s_ns; + sd->s_symlink.target_sd = target; + sysfs_get(target); /* ref owned by symlink */ + + sysfs_addrm_start(&acxt); + error = __sysfs_add_one(&acxt, sd, parent); + sysfs_addrm_finish(&acxt); + + if (!error) + return sd; + + sysfs_put(sd); + return ERR_PTR(error); +} + + static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, struct kobject *target, const char *name, int warn) { - struct sysfs_dirent *target_sd = NULL; - struct sysfs_dirent *sd = NULL; - struct sysfs_addrm_cxt acxt; - int error; + struct sysfs_dirent *sd, *target_sd = NULL; BUG_ON(!name || !parent_sd); @@ -42,36 +76,18 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, target_sd = sysfs_get(target->sd); spin_unlock(&sysfs_symlink_target_lock); - error = -ENOENT; if (!target_sd) - goto out_put; + return -ENOENT; - error = -ENOMEM; - sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!sd) - goto out_put; - - if (parent_sd->s_flags & SYSFS_FLAG_NS) - sd->s_ns = target_sd->s_ns; - sd->s_symlink.target_sd = target_sd; - target_sd = NULL; /* reference is now owned by the symlink */ - - sysfs_addrm_start(&acxt); - if (warn) - error = sysfs_add_one(&acxt, sd, parent_sd); - else - error = __sysfs_add_one(&acxt, sd, parent_sd); - sysfs_addrm_finish(&acxt); - - if (error) - goto out_put; + sd = kernfs_create_link(parent_sd, name, target_sd); + sysfs_put(target_sd); - return 0; + if (!IS_ERR(sd)) + return 0; - out_put: - sysfs_put(target_sd); - sysfs_put(sd); - return error; + if (warn && PTR_ERR(sd) == -EEXIST) + sysfs_warn_dup(parent_sd, name); + return PTR_ERR(sd); } /** diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 83e151ad061..fe6290d4177 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -8,17 +8,26 @@ #define __LINUX_KERNFS_H #include +#include struct 
sysfs_dirent; #ifdef CONFIG_SYSFS +struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, + const char *name, + struct sysfs_dirent *target); void kernfs_remove(struct sysfs_dirent *sd); int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, const char *name, const void *ns); #else /* CONFIG_SYSFS */ +static inline struct sysfs_dirent * +kernfs_create_link(struct sysfs_dirent *parent, const char *name, + struct sysfs_dirent *target) +{ return ERR_PTR(-ENOSYS); } + static inline void kernfs_remove(struct sysfs_dirent *sd) { } static inline int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, -- cgit v1.2.3-70-g09d2 From 890ece160c6465b49c42975d529c3481d89da8f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 17:21:51 -0500 Subject: sysfs, kernfs: introduce kernfs_rename[_ns]() Introduce kernfs rename interface, krenfs_rename[_ns](). This is just rename of sysfs_rename(). No functional changes. Function comment is added to kernfs_rename_ns() and @new_parent_sd is renamed to @new_parent for consistency with other kernfs interfaces. v2: Dummy implementation for !CONFIG_SYSFS updated to return -ENOSYS. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 23 +++++++++++++++-------- fs/sysfs/symlink.c | 2 +- fs/sysfs/sysfs.h | 3 --- include/linux/kernfs.h | 7 +++++++ 4 files changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index edbde4e6e5e..5ba896630d0 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -932,20 +932,27 @@ void sysfs_remove_dir(struct kobject *kobj) } } -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const char *new_name, const void *new_ns) +/** + * kernfs_rename_ns - move and rename a kernfs_node + * @sd: target node + * @new_parent: new parent to put @sd under + * @new_name: new name + * @new_ns: new namespace tag + */ +int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, + const char *new_name, const void *new_ns) { int error; mutex_lock(&sysfs_mutex); error = 0; - if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && + if ((sd->s_parent == new_parent) && (sd->s_ns == new_ns) && (strcmp(sd->s_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, new_name, new_ns)) + if (sysfs_find_dirent(new_parent, new_name, new_ns)) goto out; /* rename sysfs_dirent */ @@ -963,11 +970,11 @@ int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, * Move to the appropriate place in the appropriate directories rbtree. */ sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); + sysfs_get(new_parent); sysfs_put(sd->s_parent); sd->s_ns = new_ns; sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = new_parent_sd; + sd->s_parent = new_parent; sysfs_link_sibling(sd); error = 0; @@ -981,7 +988,7 @@ int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, { struct sysfs_dirent *parent_sd = kobj->sd->s_parent; - return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); + return kernfs_rename_ns(kobj->sd, parent_sd, new_name, new_ns); } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, @@ -994,7 +1001,7 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 
new_parent_kobj->sd : &sysfs_root; - return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); + return kernfs_rename_ns(sd, new_parent_sd, sd->s_name, new_ns); } /** diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 41138e91947..0922c53bd75 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -226,7 +226,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, if (sd->s_symlink.target_sd->s_dir.kobj != targ) goto out; - result = sysfs_rename(sd, parent_sd, new, new_ns); + result = kernfs_rename_ns(sd, parent_sd, new, new_ns); out: sysfs_put(sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 97625b15ca0..8b3fc210b90 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -185,9 +185,6 @@ void release_sysfs_dirent(struct sysfs_dirent *sd); int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const char *new_name, const void *new_ns); - static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { if (sd) { diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index fe6290d4177..803d9600cf7 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -20,6 +20,8 @@ struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, void kernfs_remove(struct sysfs_dirent *sd); int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, const char *name, const void *ns); +int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, + const char *new_name, const void *new_ns); #else /* CONFIG_SYSFS */ @@ -34,6 +36,11 @@ static inline int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, const char *name, const void *ns) { return -ENOSYS; } +static inline int kernfs_rename_ns(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent, + const char *new_name, const void *new_ns) +{ return -ENOSYS; } + #endif /* CONFIG_SYSFS */ static inline int kernfs_remove_by_name(struct sysfs_dirent *parent, -- cgit v1.2.3-70-g09d2 From 5d60418e54751c856f5aecc308620fde9572e481 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 17:21:52 -0500 Subject: sysfs, kernfs: introduce kernfs_setattr() Introduce kernfs setattr interface - kernfs_setattr(). sysfs_sd_setattr() is renamed to __kernfs_setattr() and kernfs_setattr() is a simple wrapper around it with sysfs_mutex locking. sysfs_chmod_file() is updated to get an explicit ref on kobj->sd and then invoke kernfs_setattr() so that it doesn't have to use internal interface. This patch doesn't introduce any behavior differences. v2: Dummy implementation for !CONFIG_SYSFS updated to return -ENOSYS. 
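For orientation, the reworked sysfs_chmod_file() path reduces to roughly the following; this is only a paraphrase of the file.c hunk below, not additional code:

    sd = sysfs_get_dirent(kobj->sd, attr->name);    /* takes its own ref */
    if (!sd)
            return -ENOENT;
    newattrs.ia_mode  = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO);
    newattrs.ia_valid = ATTR_MODE;
    rc = kernfs_setattr(sd, &newattrs);             /* locks sysfs_mutex internally */
    sysfs_put(sd);
    return rc;
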
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 13 +++++-------- fs/sysfs/inode.c | 21 +++++++++++++++++++-- fs/sysfs/sysfs.h | 1 - include/linux/kernfs.h | 8 ++++++++ 4 files changed, 32 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 5664410136b..564abd201af 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -922,19 +922,16 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, struct iattr newattrs; int rc; - mutex_lock(&sysfs_mutex); - - rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, attr->name, NULL); + sd = sysfs_get_dirent(kobj->sd, attr->name); if (!sd) - goto out; + return -ENOENT; newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; - rc = sysfs_sd_setattr(sd, &newattrs); - out: - mutex_unlock(&sysfs_mutex); + rc = kernfs_setattr(sd, &newattrs); + + sysfs_put(sd); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 1750f790af3..5f7e2afb345 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -67,7 +67,7 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) return attrs; } -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr) +static int __kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) { struct sysfs_inode_attrs *sd_attrs; struct iattr *iattrs; @@ -102,6 +102,23 @@ int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr) return 0; } +/** + * kernfs_setattr - set iattr on a node + * @sd: target node + * @iattr: iattr to set + * + * Returns 0 on success, -errno on failure. + */ +int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) +{ + int ret; + + mutex_lock(&sysfs_mutex); + ret = __kernfs_setattr(sd, iattr); + mutex_unlock(&sysfs_mutex); + return ret; +} + int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; @@ -116,7 +133,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) if (error) goto out; - error = sysfs_sd_setattr(sd, iattr); + error = __kernfs_setattr(sd, iattr); if (error) goto out; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 8b3fc210b90..2abccfdd932 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -207,7 +207,6 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) */ struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); void sysfs_evict_inode(struct inode *inode); -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); int sysfs_permission(struct inode *inode, int mask); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 803d9600cf7..8cb67387571 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -10,6 +10,9 @@ #include #include +struct file; +struct iattr; + struct sysfs_dirent; #ifdef CONFIG_SYSFS @@ -22,6 +25,7 @@ int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, const char *name, const void *ns); int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, const char *new_name, const void *new_ns); +int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr); #else /* CONFIG_SYSFS */ @@ -41,6 +45,10 @@ static inline int kernfs_rename_ns(struct sysfs_dirent *sd, const char *new_name, const void *new_ns) { return -ENOSYS; } +static inline int 
kernfs_setattr(struct sysfs_dirent *sd, + const struct iattr *iattr) +{ return -ENOSYS; } + #endif /* CONFIG_SYSFS */ static inline int kernfs_remove_by_name(struct sysfs_dirent *parent, -- cgit v1.2.3-70-g09d2 From 7c6e2d362c19f01e6d6c8be59d83a89722032884 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:14 -0500 Subject: sysfs, kernfs: replace sysfs_dirent->s_dir.kobj and ->s_attr.[bin_]attr with ->priv A directory sysfs_dirent points to the associated kobj. A regular or bin file points to the associated [bin_]attribute. This patch replaces sysfs_dirent->s_dir.kobj and ->s_attr.[bin_]attr with void * ->priv. This is to prepare for kernfs interface so that sysfs can specify the private data in the same way for directories and files. This lower debuggability but not by much - the whole thing was overlaid in a union anyway. If debuggability becomes an issue, we can later add ->priv accessors which explicitly check for the sysfs_dirent type and performs casting. This patch doesn't introduce any behavior difference. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 2 +- fs/sysfs/file.c | 26 +++++++++++++------------- fs/sysfs/inode.c | 2 +- fs/sysfs/symlink.c | 2 +- fs/sysfs/sysfs.h | 13 +++++-------- 5 files changed, 21 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5ba896630d0..aeb08bd3f3d 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -681,7 +681,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, return -ENOMEM; sd->s_ns = ns; - sd->s_dir.kobj = kobj; + sd->priv = kobj; /* link in */ sysfs_addrm_start(&acxt); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 0f3f0a252a5..9b58d874c82 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -74,7 +74,7 @@ static struct sysfs_open_file *sysfs_of(struct file *file) */ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) { - struct kobject *kobj = sd->s_parent->s_dir.kobj; + struct kobject *kobj = sd->s_parent->priv; if (!sysfs_ignore_lockdep(sd)) lockdep_assert_held(sd); @@ -89,7 +89,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) static int sysfs_seq_show(struct seq_file *sf, void *v) { struct sysfs_open_file *of = sf->private; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; + struct kobject *kobj = of->sd->s_parent->priv; const struct sysfs_ops *ops; char *buf; ssize_t count; @@ -120,7 +120,7 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) */ ops = sysfs_file_ops(of->sd); if (ops->show) - count = ops->show(kobj, of->sd->s_attr.attr, buf); + count = ops->show(kobj, of->sd->priv, buf); else count = 0; @@ -154,8 +154,8 @@ static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) { struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; + struct bin_attribute *battr = of->sd->priv; + struct kobject *kobj = of->sd->s_parent->priv; loff_t size = file_inode(file)->i_size; int count = min_t(size_t, bytes, PAGE_SIZE); loff_t offs = *off; @@ -221,7 +221,7 @@ static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf, static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off, size_t count) { - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; + struct kobject *kobj = of->sd->s_parent->priv; int rc = 0; /* @@ -236,7 +236,7 @@ static int flush_write_buffer(struct sysfs_open_file 
*of, char *buf, loff_t off, } if (sysfs_is_bin(of->sd)) { - struct bin_attribute *battr = of->sd->s_attr.bin_attr; + struct bin_attribute *battr = of->sd->priv; rc = -EIO; if (battr->write) @@ -245,7 +245,7 @@ static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off, } else { const struct sysfs_ops *ops = sysfs_file_ops(of->sd); - rc = ops->store(kobj, of->sd->s_attr.attr, buf, count); + rc = ops->store(kobj, of->sd->priv, buf, count); } sysfs_put_active(of->sd); @@ -466,8 +466,8 @@ static const struct vm_operations_struct sysfs_bin_vm_ops = { static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma) { struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; + struct bin_attribute *battr = of->sd->priv; + struct kobject *kobj = of->sd->s_parent->priv; int rc; mutex_lock(&of->mutex); @@ -607,7 +607,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, static int sysfs_open_file(struct inode *inode, struct file *file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + struct kobject *kobj = attr_sd->s_parent->priv; struct sysfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; @@ -617,7 +617,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) return -ENODEV; if (sysfs_is_bin(attr_sd)) { - struct bin_attribute *battr = attr_sd->s_attr.bin_attr; + struct bin_attribute *battr = attr_sd->priv; has_read = battr->read || battr->mmap; has_write = battr->write || battr->mmap; @@ -848,7 +848,7 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, return -ENOMEM; sd->s_ns = ns; - sd->s_attr.attr = (void *)attr; + sd->priv = (void *)attr; sysfs_dirent_init_lockdep(sd); sysfs_addrm_start(&acxt); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 5f7e2afb345..81cc8585b32 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -275,7 +275,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_fop = &sysfs_file_operations; break; case SYSFS_KOBJ_BIN_ATTR: - bin_attr = sd->s_attr.bin_attr; + bin_attr = sd->priv; inode->i_size = bin_attr->size; inode->i_fop = &sysfs_bin_operations; break; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 0922c53bd75..352fbbbc055 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -223,7 +223,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, result = -EINVAL; if (sysfs_type(sd) != SYSFS_KOBJ_LINK) goto out; - if (sd->s_symlink.target_sd->s_dir.kobj != targ) + if (sd->s_symlink.target_sd->priv != targ) goto out; result = kernfs_rename_ns(sd, parent_sd, new, new_ns); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 2abccfdd932..a6f3fa3f02f 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -17,8 +17,6 @@ struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ struct sysfs_elem_dir { - struct kobject *kobj; - unsigned long subdirs; /* children rbtree starts here and goes through sd->s_rb */ struct rb_root children; @@ -29,10 +27,6 @@ struct sysfs_elem_symlink { }; struct sysfs_elem_attr { - union { - struct attribute *attr; - struct bin_attribute *bin_attr; - }; struct sysfs_open_dirent *open; }; @@ -74,6 +68,8 @@ struct sysfs_dirent { struct sysfs_elem_attr s_attr; }; + void *priv; + unsigned short s_flags; umode_t s_mode; unsigned int s_ino; @@ -103,7 +99,7 @@ static inline unsigned int 
sysfs_type(struct sysfs_dirent *sd) #define sysfs_dirent_init_lockdep(sd) \ do { \ - struct attribute *attr = sd->s_attr.attr; \ + struct attribute *attr = sd->priv; \ struct lock_class_key *key = attr->key; \ if (!key) \ key = &attr->skey; \ @@ -114,10 +110,11 @@ do { \ /* Test for attributes that want to ignore lockdep for read-locking */ static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) { + struct attribute *attr = sd->priv; int type = sysfs_type(sd); return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) && - sd->s_attr.attr->ignore_lockdep; + attr->ignore_lockdep; } #else -- cgit v1.2.3-70-g09d2 From 93b2b8e4aa4317e3fe6414d117deb5f3c362e8bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:15 -0500 Subject: sysfs, kernfs: introduce kernfs_create_dir[_ns]() Introduce kernfs interface to manipulate a directory which takes and returns sysfs_dirents. create_dir() is renamed to kernfs_create_dir_ns() and its arguments and return value are updated. create_dir() usages are replaced with kernfs_create_dir_ns() and sysfs_create_subdir() usages are replaced with kernfs_create_dir(). Dup warnings are handled explicitly by sysfs users of the kernfs interface. sysfs_enable_ns() is renamed to kernfs_enable_ns(). This patch doesn't introduce any behavior changes. v2: Dummy implementation for !CONFIG_SYSFS updated to return -ENOSYS. v3: kernfs_enable_ns() added. v4: Refreshed on top of "sysfs: drop kobj_ns_type handling, take #2" so that this patch removes sysfs_enable_ns(). Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 54 ++++++++++++++++++++++++++++---------------------- fs/sysfs/group.c | 9 ++++++--- fs/sysfs/sysfs.h | 3 --- include/linux/kernfs.h | 17 ++++++++++++++++ include/linux/sysfs.h | 6 ------ lib/kobject.c | 2 +- 6 files changed, 54 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aeb08bd3f3d..cfbf4091fe5 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -666,9 +666,18 @@ struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, } EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns); -static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - const char *name, const void *ns, - struct sysfs_dirent **p_sd) +/** + * kernfs_create_dir_ns - create a directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * @priv: opaque data associated with the new directory + * @ns: optional namespace tag of the directory + * + * Returns the created node on success, ERR_PTR() value on failure.
+ */ +struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, + const char *name, void *priv, + const void *ns) { umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; struct sysfs_addrm_cxt acxt; @@ -678,28 +687,21 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, /* allocate */ sd = sysfs_new_dirent(name, mode, SYSFS_DIR); if (!sd) - return -ENOMEM; + return ERR_PTR(-ENOMEM); sd->s_ns = ns; - sd->priv = kobj; + sd->priv = priv; /* link in */ sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, parent_sd); + rc = __sysfs_add_one(&acxt, sd, parent); sysfs_addrm_finish(&acxt); - if (rc == 0) - *p_sd = sd; - else - sysfs_put(sd); + if (!rc) + return sd; - return rc; -} - -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd) -{ - return create_dir(kobj, kobj->sd, name, NULL, p_sd); + sysfs_put(sd); + return ERR_PTR(rc); } /** @@ -710,7 +712,6 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { struct sysfs_dirent *parent_sd, *sd; - int error = 0; BUG_ON(!kobj); @@ -722,10 +723,15 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) if (!parent_sd) return -ENOENT; - error = create_dir(kobj, parent_sd, kobject_name(kobj), ns, &sd); - if (!error) - kobj->sd = sd; - return error; + sd = kernfs_create_dir_ns(parent_sd, kobject_name(kobj), kobj, ns); + if (IS_ERR(sd)) { + if (PTR_ERR(sd) == -EEXIST) + sysfs_warn_dup(parent_sd, kobject_name(kobj)); + return PTR_ERR(sd); + } + + kobj->sd = sd; + return 0; } static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, @@ -1005,14 +1011,14 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, } /** - * sysfs_enable_ns - enable namespace under a directory + * kernfs_enable_ns - enable namespace under a directory * @sd: directory of interest, should be empty * * This is to be called right after @sd is created to enable namespace * under it. All children of @sd must have non-NULL namespace tags and * only the ones which match the super_block's tag will be visible. 
*/ -void sysfs_enable_ns(struct sysfs_dirent *sd) +void kernfs_enable_ns(struct sysfs_dirent *sd) { WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children)); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 4bd99734083..065689ddb4c 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -101,9 +101,12 @@ static int internal_create_group(struct kobject *kobj, int update, return -EINVAL; } if (grp->name) { - error = sysfs_create_subdir(kobj, grp->name, &sd); - if (error) - return error; + sd = kernfs_create_dir(kobj->sd, grp->name, kobj); + if (IS_ERR(sd)) { + if (PTR_ERR(sd) == -EEXIST) + sysfs_warn_dup(kobj->sd, grp->name); + return PTR_ERR(sd); + } } else sd = kobj->sd; sysfs_get(sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index a6f3fa3f02f..9ac234ef494 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -179,9 +179,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); void release_sysfs_dirent(struct sysfs_dirent *sd); -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd); - static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { if (sd) { diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 8cb67387571..e8b73d4a08d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -17,6 +17,9 @@ struct sysfs_dirent; #ifdef CONFIG_SYSFS +struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, + const char *name, void *priv, + const void *ns); struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, const char *name, struct sysfs_dirent *target); @@ -25,10 +28,16 @@ int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, const char *name, const void *ns); int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, const char *new_name, const void *new_ns); +void kernfs_enable_ns(struct sysfs_dirent *sd); int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr); #else /* CONFIG_SYSFS */ +static inline struct sysfs_dirent * +kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, + const void *ns) +{ return ERR_PTR(-ENOSYS); } + static inline struct sysfs_dirent * kernfs_create_link(struct sysfs_dirent *parent, const char *name, struct sysfs_dirent *target) @@ -45,12 +54,20 @@ static inline int kernfs_rename_ns(struct sysfs_dirent *sd, const char *new_name, const void *new_ns) { return -ENOSYS; } +static inline void kernfs_enable_ns(struct sysfs_dirent *sd) { } + static inline int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) { return -ENOSYS; } #endif /* CONFIG_SYSFS */ +static inline struct sysfs_dirent * +kernfs_create_dir(struct sysfs_dirent *parent, const char *name, void *priv) +{ + return kernfs_create_dir_ns(parent, name, priv, NULL); +} + static inline int kernfs_remove_by_name(struct sysfs_dirent *parent, const char *name) { diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index e17381a92e1..2bc735d3e93 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -219,8 +219,6 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); -void sysfs_enable_ns(struct sysfs_dirent *sd); - int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, @@ -354,10 +352,6 @@ static inline void sysfs_delete_link(struct 
kobject *k, struct kobject *t, { } -static inline void sysfs_enable_ns(struct sysfs_dirent *sd) -{ -} - static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { diff --git a/lib/kobject.c b/lib/kobject.c index 16e9335b32d..b8d848fb137 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -94,7 +94,7 @@ static int create_dir(struct kobject *kobj) BUG_ON(ops->type >= KOBJ_NS_TYPES); BUG_ON(!kobj_ns_type_registered(ops->type)); - sysfs_enable_ns(kobj->sd); + kernfs_enable_ns(kobj->sd); } return 0; -- cgit v1.2.3-70-g09d2 From c2b19daf6760fae9d5db9e9d1683644728888293 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:16 -0500 Subject: sysfs, kernfs: prepare read path for kernfs We're in the process of separating out core sysfs functionality into kernfs which will deal with sysfs_dirents directly. This patch rearranges read path so that the kernfs and sysfs parts are separate. * Regular file read path is refactored such that kernfs_seq_start/next/stop/show() handle all the boilerplate work including locking and updating event count for poll, while sysfs_kf_seq_show() deals with interaction with kobj show method. * Bin file read path is refactored such that kernfs_file_direct_read() handles all the boilerplate work including buffer management and locking, while sysfs_kf_bin_read() deals with interaction with bin_attribute read method. kernfs_file_read() is added. It invokes either the seq_file or direct read path depending on the file type. This will eventually allow using the same file_operations for both file types, which is necessary to separate out kernfs. While this patch changes the order of some operations, it shouldn't change any visible behavior. v2: Dropped unnecessary zeroing of @count from sysfs_kf_seq_show(). Add comments explaining single_open() behavior. Both suggested by Pavel. v3: seq_stop() is called even after seq_start() failed. kernfs_seq_start() updated so that it doesn't unlock sysfs_open_file->mutex on failure so that kernfs_seq_stop() doesn't try to unlock an already unlocked mutex. Reported by Fengguang. Signed-off-by: Tejun Heo Cc: Pavel Machek Cc: Fengguang Wu Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 191 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 126 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 9b58d874c82..b695b8b229f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -86,13 +86,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) * details like buffering and seeking. The following function pipes * sysfs_ops->show() result through seq_file. */ -static int sysfs_seq_show(struct seq_file *sf, void *v) +static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { struct sysfs_open_file *of = sf->private; struct kobject *kobj = of->sd->s_parent->priv; - const struct sysfs_ops *ops; - char *buf; + const struct sysfs_ops *ops = sysfs_file_ops(of->sd); ssize_t count; + char *buf; /* acquire buffer and ensure that it's >= PAGE_SIZE */ count = seq_get_buf(sf, &buf); @@ -102,33 +102,14 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) } /* - * Need @of->sd for attr and ops, its parent for kobj. @of->mutex - * nests outside active ref and is just to ensure that the ops - * aren't called concurrently for the same open file. + * Invoke show(). Control may reach here via seq file lseek even + * if @ops->show() isn't implemented. 
*/ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - return -ENODEV; - } - - of->event = atomic_read(&of->sd->s_attr.open->event); - - /* - * Lookup @ops and invoke show(). Control may reach here via seq - * file lseek even if @ops->show() isn't implemented. - */ - ops = sysfs_file_ops(of->sd); - if (ops->show) + if (ops->show) { count = ops->show(kobj, of->sd->priv, buf); - else - count = 0; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (count < 0) - return count; + if (count < 0) + return count; + } /* * The code works fine with PAGE_SIZE return but it's likely to @@ -144,68 +125,146 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) return 0; } -/* - * Read method for bin files. As reading a bin file can have side-effects, - * the exact offset and bytes specified in read(2) call should be passed to - * the read callback making it difficult to use seq_file. Implement - * simplistic custom buffering for bin files. - */ -static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf, - size_t bytes, loff_t *off) +static ssize_t sysfs_kf_bin_read(struct sysfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_file *of = sysfs_of(file); struct bin_attribute *battr = of->sd->priv; struct kobject *kobj = of->sd->s_parent->priv; - loff_t size = file_inode(file)->i_size; - int count = min_t(size_t, bytes, PAGE_SIZE); - loff_t offs = *off; - char *buf; + loff_t size = file_inode(of->file)->i_size; - if (!bytes) + if (!count) return 0; if (size) { - if (offs > size) + if (pos > size) return 0; - if (offs + count > size) - count = size - offs; + if (pos + count > size) + count = size - pos; } - buf = kmalloc(count, GFP_KERNEL); + if (!battr->read) + return -EIO; + + return battr->read(of->file, kobj, battr, buf, pos, count); +} + +static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) +{ + struct sysfs_open_file *of = sf->private; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!sysfs_get_active(of->sd)) + return ERR_PTR(-ENODEV); + + /* + * The same behavior and code as single_open(). Returns !NULL if + * pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; +} + +static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) +{ + /* + * The same behavior and code as single_open(), always terminate + * after the initial read. + */ + ++*ppos; + return NULL; +} + +static void kernfs_seq_stop(struct seq_file *sf, void *v) +{ + struct sysfs_open_file *of = sf->private; + + sysfs_put_active(of->sd); + mutex_unlock(&of->mutex); +} + +static int kernfs_seq_show(struct seq_file *sf, void *v) +{ + struct sysfs_open_file *of = sf->private; + + of->event = atomic_read(&of->sd->s_attr.open->event); + + return sysfs_kf_seq_show(sf, v); +} + +static const struct seq_operations kernfs_seq_ops = { + .start = kernfs_seq_start, + .next = kernfs_seq_next, + .stop = kernfs_seq_stop, + .show = kernfs_seq_show, +}; + +/* + * As reading a bin file can have side-effects, the exact offset and bytes + * specified in read(2) call should be passed to the read callback making + * it difficult to use seq_file. Implement simplistic custom buffering for + * bin files. 
+ */ +static ssize_t kernfs_file_direct_read(struct sysfs_open_file *of, + char __user *user_buf, size_t count, + loff_t *ppos) +{ + ssize_t len = min_t(size_t, count, PAGE_SIZE); + char *buf; + + buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; - /* need of->sd for battr, its parent for kobj */ + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ mutex_lock(&of->mutex); if (!sysfs_get_active(of->sd)) { - count = -ENODEV; + len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } - if (battr->read) - count = battr->read(file, kobj, battr, buf, offs, count); - else - count = -EIO; + len = sysfs_kf_bin_read(of, buf, len, *ppos); sysfs_put_active(of->sd); mutex_unlock(&of->mutex); - if (count < 0) + if (len < 0) goto out_free; - if (copy_to_user(userbuf, buf, count)) { - count = -EFAULT; + if (copy_to_user(user_buf, buf, len)) { + len = -EFAULT; goto out_free; } - pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); - - *off = offs + count; + *ppos += len; out_free: kfree(buf); - return count; + return len; +} + +/** + * kernfs_file_read - kernfs vfs read callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + */ +static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct sysfs_open_file *of = sysfs_of(file); + + if (sysfs_is_bin(of->sd)) + return kernfs_file_direct_read(of, user_buf, count, ppos); + else + return seq_read(file, user_buf, count, ppos); } /** @@ -677,12 +736,14 @@ static int sysfs_open_file(struct inode *inode, struct file *file) * and readable regular files are the vast majority anyway. */ if (sysfs_is_bin(attr_sd)) - error = single_open(file, NULL, of); + error = seq_open(file, NULL); else - error = single_open(file, sysfs_seq_show, of); + error = seq_open(file, &kernfs_seq_ops); if (error) goto err_free; + ((struct seq_file *)file->private_data)->private = of; + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; @@ -697,7 +758,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) return 0; err_close: - single_release(inode, file); + seq_release(inode, file); err_free: kfree(of); err_out: @@ -711,7 +772,7 @@ static int sysfs_release(struct inode *inode, struct file *filp) struct sysfs_open_file *of = sysfs_of(filp); sysfs_put_open_dirent(sd, of); - single_release(inode, filp); + seq_release(inode, filp); kfree(of); return 0; @@ -816,7 +877,7 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) EXPORT_SYMBOL_GPL(sysfs_notify); const struct file_operations sysfs_file_operations = { - .read = seq_read, + .read = kernfs_file_read, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, @@ -825,7 +886,7 @@ const struct file_operations sysfs_file_operations = { }; const struct file_operations sysfs_bin_operations = { - .read = sysfs_bin_read, + .read = kernfs_file_read, .write = sysfs_write_file, .llseek = generic_file_llseek, .mmap = sysfs_bin_mmap, -- cgit v1.2.3-70-g09d2 From 50b38ca086e4d9920eede98b871b971e9958d70d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:17 -0500 Subject: sysfs, kernfs: prepare write path for kernfs We're in the process of separating out core sysfs functionality into kernfs which will deal with sysfs_dirents directly. 
This patch rearranges write path so that the kernfs and sysfs parts are separate. kernfs_file_write() handles all boilerplate work including buffer management and locking and invokes sysfs_kf_write() or sysfs_kf_bin_write() depending on the file type which deals with the interaction with kobj store or bin_attribute write method. While this patch changes the order of some operations, it shouldn't change any visible behavior. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 103 +++++++++++++++++++++++++++----------------------------- 1 file changed, 50 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index b695b8b229f..2f849e82c0e 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -267,61 +267,50 @@ static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, return seq_read(file, user_buf, count, ppos); } -/** - * flush_write_buffer - push buffer to kobject - * @of: open file - * @buf: data buffer for file - * @off: file offset to write to - * @count: number of bytes - * - * Get the correct pointers for the kobject and the attribute we're dealing - * with, then call the store() method for it with @buf. - */ -static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off, - size_t count) +/* kernfs write callback for regular sysfs files */ +static ssize_t sysfs_kf_write(struct sysfs_open_file *of, char *buf, + size_t count, loff_t pos) { + const struct sysfs_ops *ops = sysfs_file_ops(of->sd); struct kobject *kobj = of->sd->s_parent->priv; - int rc = 0; - /* - * Need @of->sd for attr and ops, its parent for kobj. @of->mutex - * nests outside active ref and is just to ensure that the ops - * aren't called concurrently for the same open file. - */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - return -ENODEV; - } + if (!count) + return 0; - if (sysfs_is_bin(of->sd)) { - struct bin_attribute *battr = of->sd->priv; + return ops->store(kobj, of->sd->priv, buf, count); +} - rc = -EIO; - if (battr->write) - rc = battr->write(of->file, kobj, battr, buf, off, - count); - } else { - const struct sysfs_ops *ops = sysfs_file_ops(of->sd); +/* kernfs write callback for bin sysfs files */ +static ssize_t sysfs_kf_bin_write(struct sysfs_open_file *of, char *buf, + size_t count, loff_t pos) +{ + struct bin_attribute *battr = of->sd->priv; + struct kobject *kobj = of->sd->s_parent->priv; + loff_t size = file_inode(of->file)->i_size; - rc = ops->store(kobj, of->sd->priv, buf, count); + if (size) { + if (size <= pos) + return 0; + count = min_t(ssize_t, count, size - pos); } + if (!count) + return 0; - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); + if (!battr->write) + return -EIO; - return rc; + return battr->write(of->file, kobj, battr, buf, pos, count); } /** - * sysfs_write_file - write an attribute + * kernfs_file_write - kernfs vfs write callback * @file: file pointer * @user_buf: data to write * @count: number of bytes * @ppos: starting offset * - * Copy data in from userland and pass it to the matching - * sysfs_ops->store() by invoking flush_write_buffer(). + * Copy data in from userland and pass it to the matching kernfs write + * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. 
We expect the entire buffer to come on @@ -329,23 +318,13 @@ static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off, * modify only the the value you're changing, then write entire buffer * back. */ -static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { struct sysfs_open_file *of = sysfs_of(file); ssize_t len = min_t(size_t, count, PAGE_SIZE); - loff_t size = file_inode(file)->i_size; char *buf; - if (sysfs_is_bin(of->sd) && size) { - if (size <= *ppos) - return 0; - len = min_t(ssize_t, len, size - *ppos); - } - - if (!len) - return 0; - buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -356,7 +335,25 @@ static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf, } buf[len] = '\0'; /* guarantee string termination */ - len = flush_write_buffer(of, buf, *ppos, len); + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!sysfs_get_active(of->sd)) { + mutex_unlock(&of->mutex); + len = -ENODEV; + goto out_free; + } + + if (sysfs_is_bin(of->sd)) + len = sysfs_kf_bin_write(of, buf, len, *ppos); + else + len = sysfs_kf_write(of, buf, len, *ppos); + + sysfs_put_active(of->sd); + mutex_unlock(&of->mutex); + if (len > 0) *ppos += len; out_free: @@ -878,7 +875,7 @@ EXPORT_SYMBOL_GPL(sysfs_notify); const struct file_operations sysfs_file_operations = { .read = kernfs_file_read, - .write = sysfs_write_file, + .write = kernfs_file_write, .llseek = generic_file_llseek, .open = sysfs_open_file, .release = sysfs_release, @@ -887,7 +884,7 @@ const struct file_operations sysfs_file_operations = { const struct file_operations sysfs_bin_operations = { .read = kernfs_file_read, - .write = sysfs_write_file, + .write = kernfs_file_write, .llseek = generic_file_llseek, .mmap = sysfs_bin_mmap, .open = sysfs_open_file, -- cgit v1.2.3-70-g09d2 From fdbffaa478fc77e999cbe2ac0dcfbf609103e675 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:18 -0500 Subject: sysfs, kernfs: prepare mmap path for kernfs We're in the process of separating out core sysfs functionality into kernfs which will deal with sysfs_dirents directly. This patch rearranges mmap path so that the kernfs and sysfs parts are separate. sysfs_kf_bin_mmap() which handles the interaction with bin_attribute mmap method is factored out of sysfs_bin_mmap(), which is renamed to kernfs_file_mmap(). All vma ops are renamed accordingly. sysfs_bin_mmap() is updated such that it can be used for both file types. This will eventually allow using the same file_operations for both file types, which is necessary to separate out kernfs. This patch doesn't introduce any behavior changes. 
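
Illustration, not part of the patch: the driver-facing interface this path serves is unchanged. Below is a minimal sketch of a bin_attribute whose mmap method would be reached through the refactored kernfs_file_mmap() / sysfs_kf_bin_mmap() path; all example_* names are hypothetical and the mmap body is left as a stub.

#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/mm.h>

static int example_mmap(struct file *file, struct kobject *kobj,
                        struct bin_attribute *attr, struct vm_area_struct *vma)
{
        /* a real driver would remap device memory into @vma here */
        return -ENODEV;
}

static struct bin_attribute example_regs_attr = {
        .attr   = { .name = "example_regs", .mode = S_IRUSR },
        .size   = PAGE_SIZE,
        .mmap   = example_mmap,
};

/* registered from the owning kobject's setup code */
static int example_register(struct kobject *kobj)
{
        return sysfs_create_bin_file(kobj, &example_regs_attr);
}
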
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 69 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 2f849e82c0e..2e24e89bd92 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -361,7 +361,19 @@ out_free: return len; } -static void sysfs_bin_vma_open(struct vm_area_struct *vma) +static int sysfs_kf_bin_mmap(struct sysfs_open_file *of, + struct vm_area_struct *vma) +{ + struct bin_attribute *battr = of->sd->priv; + struct kobject *kobj = of->sd->s_parent->priv; + + if (!battr->mmap) + return -ENODEV; + + return battr->mmap(of->file, kobj, battr, vma); +} + +static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct sysfs_open_file *of = sysfs_of(file); @@ -378,7 +390,7 @@ static void sysfs_bin_vma_open(struct vm_area_struct *vma) sysfs_put_active(of->sd); } -static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct sysfs_open_file *of = sysfs_of(file); @@ -398,8 +410,8 @@ static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) { struct file *file = vma->vm_file; struct sysfs_open_file *of = sysfs_of(file); @@ -421,8 +433,8 @@ static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, return ret; } -static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write) +static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) { struct file *file = vma->vm_file; struct sysfs_open_file *of = sysfs_of(file); @@ -443,8 +455,8 @@ static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr, } #ifdef CONFIG_NUMA -static int sysfs_bin_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) +static int kernfs_vma_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) { struct file *file = vma->vm_file; struct sysfs_open_file *of = sysfs_of(file); @@ -464,8 +476,8 @@ static int sysfs_bin_set_policy(struct vm_area_struct *vma, return ret; } -static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma, - unsigned long addr) +static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, + unsigned long addr) { struct file *file = vma->vm_file; struct sysfs_open_file *of = sysfs_of(file); @@ -485,8 +497,9 @@ static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma, return pol; } -static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, - const nodemask_t *to, unsigned long flags) +static int kernfs_vma_migrate(struct vm_area_struct *vma, + const nodemask_t *from, const nodemask_t *to, + unsigned long flags) { struct file *file = vma->vm_file; struct sysfs_open_file *of = sysfs_of(file); @@ -507,36 +520,31 @@ static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, } #endif -static const struct vm_operations_struct sysfs_bin_vm_ops = { - .open = sysfs_bin_vma_open, - .fault = sysfs_bin_fault, - .page_mkwrite = sysfs_bin_page_mkwrite, - .access = sysfs_bin_access, +static const struct vm_operations_struct kernfs_vm_ops = { + .open = kernfs_vma_open, 
+ .fault = kernfs_vma_fault, + .page_mkwrite = kernfs_vma_page_mkwrite, + .access = kernfs_vma_access, #ifdef CONFIG_NUMA - .set_policy = sysfs_bin_set_policy, - .get_policy = sysfs_bin_get_policy, - .migrate = sysfs_bin_migrate, + .set_policy = kernfs_vma_set_policy, + .get_policy = kernfs_vma_get_policy, + .migrate = kernfs_vma_migrate, #endif }; -static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma) +static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->priv; - struct kobject *kobj = of->sd->s_parent->priv; int rc; mutex_lock(&of->mutex); - /* need of->sd for battr, its parent for kobj */ rc = -ENODEV; if (!sysfs_get_active(of->sd)) goto out_unlock; - if (!battr->mmap) - goto out_put; - - rc = battr->mmap(file, kobj, battr, vma); + if (sysfs_is_bin(of->sd)) + rc = sysfs_kf_bin_mmap(of, vma); if (rc) goto out_put; @@ -563,7 +571,7 @@ static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma) rc = 0; of->mmapped = 1; of->vm_ops = vma->vm_ops; - vma->vm_ops = &sysfs_bin_vm_ops; + vma->vm_ops = &kernfs_vm_ops; out_put: sysfs_put_active(of->sd); out_unlock: @@ -877,6 +885,7 @@ const struct file_operations sysfs_file_operations = { .read = kernfs_file_read, .write = kernfs_file_write, .llseek = generic_file_llseek, + .mmap = kernfs_file_mmap, .open = sysfs_open_file, .release = sysfs_release, .poll = sysfs_poll, @@ -886,7 +895,7 @@ const struct file_operations sysfs_bin_operations = { .read = kernfs_file_read, .write = kernfs_file_write, .llseek = generic_file_llseek, - .mmap = sysfs_bin_mmap, + .mmap = kernfs_file_mmap, .open = sysfs_open_file, .release = sysfs_release, .poll = sysfs_poll, -- cgit v1.2.3-70-g09d2 From c6fb449515f23edea828fb90a460d3622e261dba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:19 -0500 Subject: sysfs, kernfs: prepare open, release, poll paths for kernfs We're in the process of separating out core sysfs functionality into kernfs which will deal with sysfs_dirents directly. This patch prepares the rest - open, release and poll. There isn't much to do. Just renaming is enough. As sysfs_file_operations and sysfs_bin_operations are identical now, use the same file_operations for both - kernfs_file_operations. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 24 +++++++----------------- fs/sysfs/inode.c | 4 ++-- fs/sysfs/sysfs.h | 3 +-- 3 files changed, 10 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 2e24e89bd92..a43df04c81f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -668,7 +668,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, kfree(od); } -static int sysfs_open_file(struct inode *inode, struct file *file) +static int kernfs_file_open(struct inode *inode, struct file *file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->priv; @@ -771,7 +771,7 @@ err_out: return error; } -static int sysfs_release(struct inode *inode, struct file *filp) +static int kernfs_file_release(struct inode *inode, struct file *filp) { struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; struct sysfs_open_file *of = sysfs_of(filp); @@ -822,7 +822,7 @@ void sysfs_unmap_bin_file(struct sysfs_dirent *sd) * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). 
When in doubt, set a suitable timeout value. */ -static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) { struct sysfs_open_file *of = sysfs_of(filp); struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; @@ -881,24 +881,14 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) } EXPORT_SYMBOL_GPL(sysfs_notify); -const struct file_operations sysfs_file_operations = { +const struct file_operations kernfs_file_operations = { .read = kernfs_file_read, .write = kernfs_file_write, .llseek = generic_file_llseek, .mmap = kernfs_file_mmap, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, -}; - -const struct file_operations sysfs_bin_operations = { - .read = kernfs_file_read, - .write = kernfs_file_write, - .llseek = generic_file_llseek, - .mmap = kernfs_file_mmap, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, + .open = kernfs_file_open, + .release = kernfs_file_release, + .poll = kernfs_file_poll, }; int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 81cc8585b32..4c463dabfc6 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -272,12 +272,12 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) break; case SYSFS_KOBJ_ATTR: inode->i_size = PAGE_SIZE; - inode->i_fop = &sysfs_file_operations; + inode->i_fop = &kernfs_file_operations; break; case SYSFS_KOBJ_BIN_ATTR: bin_attr = sd->priv; inode->i_size = bin_attr->size; - inode->i_fop = &sysfs_bin_operations; + inode->i_fop = &kernfs_file_operations; break; case SYSFS_KOBJ_LINK: inode->i_op = &sysfs_symlink_inode_operations; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 9ac234ef494..619250d2d7c 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -212,8 +212,7 @@ int sysfs_inode_init(void); /* * file.c */ -extern const struct file_operations sysfs_file_operations; -extern const struct file_operations sysfs_bin_operations; +extern const struct file_operations kernfs_file_operations; int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type); -- cgit v1.2.3-70-g09d2 From dd8a5b036b6e8d50854e130555f90f062c5eacec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:20 -0500 Subject: sysfs, kernfs: move sysfs_open_file to include/linux/kernfs.h sysfs_open_file will be used as the primary handle for kernfs methods. Move its definition from fs/sysfs/file.c to include/linux/kernfs.h and mark the public and private fields. This is pure relocation. 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 11 ----------- include/linux/kernfs.h | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index a43df04c81f..acba5835577 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -47,17 +47,6 @@ struct sysfs_open_dirent { struct list_head files; /* goes through sysfs_open_file.list */ }; -struct sysfs_open_file { - struct sysfs_dirent *sd; - struct file *file; - struct mutex mutex; - int event; - struct list_head list; - - bool mmapped; - const struct vm_operations_struct *vm_ops; -}; - static bool sysfs_is_bin(struct sysfs_dirent *sd) { return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index e8b73d4a08d..b923052c29d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -9,12 +9,30 @@ #include #include +#include +#include struct file; struct iattr; +struct seq_file; +struct vm_area_struct; struct sysfs_dirent; +struct sysfs_open_file { + /* published fields */ + struct sysfs_dirent *sd; + struct file *file; + + /* private fields, do not use outside kernfs proper */ + struct mutex mutex; + int event; + struct list_head list; + + bool mmapped; + const struct vm_operations_struct *vm_ops; +}; + #ifdef CONFIG_SYSFS struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, -- cgit v1.2.3-70-g09d2 From f6acf8bb6a40ba3bfcf542e4c4c9e8968c8cb57a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:21 -0500 Subject: sysfs, kernfs: introduce kernfs_ops We're in the process of separating out core sysfs functionality into kernfs which will deal with sysfs_dirents directly. This patch introduces kernfs_ops which hosts methods kernfs users implement and updates fs/sysfs/file.c such that sysfs_kf_*() functions populate kernfs_ops and kernfs_file_*() functions call the matching entries from kernfs_ops. kernfs_ops contains the following groups of methods. * seq_show() - for kernfs files which use seq_file for reads. * read() - for direct read implementations. Used iff seq_show() is not implemented. * write() - for writes. * mmap() - for mmaps. Notes: * sysfs_elem_attr->ops is added so that kernfs_ops can be accessed from sysfs_dirent. kernfs_ops() helper is added to verify locking and access the field. * SYSFS_FLAG_HAS_(SEQ_SHOW|MMAP) added. sd->s_attr->ops is accessible only while holding active_ref and there are cases where we want to take different actions depending on which ops are implemented. These two flags cache whether the two ops are implemented for those. * kernfs_file_*() no longer test sysfs type but chooses different behaviors depending on which methods in kernfs_ops are implemented. The conversions are trivial except for the open path. As kernfs_file_open() now decides whether to allow read/write accesses depending on the kernfs_ops implemented, the presence of methods in kobjs and attribute_bin should be propagated to kernfs_ops. sysfs_add_file_mode_ns() is updated so that it propagates presence / absence of the callbacks through _empty, _ro, _wo, _rw kernfs_ops. 
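
Illustration, not part of the patch: a minimal sketch of what a kernfs_ops provider looks like after this change, assuming a read-only file rendered through ->seq_show; the example_* names are hypothetical. At this point in the series the ops are still attached internally by sysfs_add_file_mode_ns(); a public way to hand them to kernfs arrives with kernfs_create_file_ns() later in the series.

#include <linux/kernfs.h>
#include <linux/seq_file.h>

static int example_seq_show(struct seq_file *sf, void *v)
{
        /* sf->private is the sysfs_open_file set up by kernfs */
        struct sysfs_open_file *of = sf->private;

        seq_printf(sf, "node %p opened via file %p\n", of->sd, of->file);
        return 0;
}

static const struct kernfs_ops example_kf_ops = {
        .seq_show       = example_seq_show,
};
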
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 146 +++++++++++++++++++++++++++++++++++++------------ fs/sysfs/sysfs.h | 3 + include/linux/kernfs.h | 26 +++++++++ 3 files changed, 141 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index acba5835577..cbebc335af8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -57,6 +57,17 @@ static struct sysfs_open_file *sysfs_of(struct file *file) return ((struct seq_file *)file->private_data)->private; } +/* + * Determine the kernfs_ops for the given sysfs_dirent. This function must + * be called while holding an active reference. + */ +static const struct kernfs_ops *kernfs_ops(struct sysfs_dirent *sd) +{ + if (!sysfs_ignore_lockdep(sd)) + lockdep_assert_held(sd); + return sd->s_attr.ops; +} + /* * Determine ktype->sysfs_ops for the given sysfs_dirent. This function * must be called while holding an active reference. @@ -180,7 +191,7 @@ static int kernfs_seq_show(struct seq_file *sf, void *v) of->event = atomic_read(&of->sd->s_attr.open->event); - return sysfs_kf_seq_show(sf, v); + return of->sd->s_attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { @@ -201,6 +212,7 @@ static ssize_t kernfs_file_direct_read(struct sysfs_open_file *of, loff_t *ppos) { ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; char *buf; buf = kmalloc(len, GFP_KERNEL); @@ -218,7 +230,11 @@ static ssize_t kernfs_file_direct_read(struct sysfs_open_file *of, goto out_free; } - len = sysfs_kf_bin_read(of, buf, len, *ppos); + ops = kernfs_ops(of->sd); + if (ops->read) + len = ops->read(of, buf, len, *ppos); + else + len = -EINVAL; sysfs_put_active(of->sd); mutex_unlock(&of->mutex); @@ -250,10 +266,10 @@ static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, { struct sysfs_open_file *of = sysfs_of(file); - if (sysfs_is_bin(of->sd)) - return kernfs_file_direct_read(of, user_buf, count, ppos); - else + if (of->sd->s_flags & SYSFS_FLAG_HAS_SEQ_SHOW) return seq_read(file, user_buf, count, ppos); + else + return kernfs_file_direct_read(of, user_buf, count, ppos); } /* kernfs write callback for regular sysfs files */ @@ -312,6 +328,7 @@ static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, { struct sysfs_open_file *of = sysfs_of(file); ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; char *buf; buf = kmalloc(len + 1, GFP_KERNEL); @@ -335,10 +352,11 @@ static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, goto out_free; } - if (sysfs_is_bin(of->sd)) - len = sysfs_kf_bin_write(of, buf, len, *ppos); + ops = kernfs_ops(of->sd); + if (ops->write) + len = ops->write(of, buf, len, *ppos); else - len = sysfs_kf_write(of, buf, len, *ppos); + len = -EINVAL; sysfs_put_active(of->sd); mutex_unlock(&of->mutex); @@ -524,6 +542,7 @@ static const struct vm_operations_struct kernfs_vm_ops = { static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct sysfs_open_file *of = sysfs_of(file); + const struct kernfs_ops *ops; int rc; mutex_lock(&of->mutex); @@ -532,8 +551,9 @@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!sysfs_get_active(of->sd)) goto out_unlock; - if (sysfs_is_bin(of->sd)) - rc = sysfs_kf_bin_mmap(of, vma); + ops = kernfs_ops(of->sd); + if (ops->mmap) + rc = ops->mmap(of, vma); if (rc) goto out_put; @@ -660,34 +680,19 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, static int 
kernfs_file_open(struct inode *inode, struct file *file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->priv; + const struct kernfs_ops *ops; struct sysfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; - /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active(attr_sd)) return -ENODEV; - if (sysfs_is_bin(attr_sd)) { - struct bin_attribute *battr = attr_sd->priv; + ops = kernfs_ops(attr_sd); - has_read = battr->read || battr->mmap; - has_write = battr->write || battr->mmap; - has_mmap = battr->mmap; - } else { - const struct sysfs_ops *ops = sysfs_file_ops(attr_sd); - - /* every kobject with an attribute needs a ktype assigned */ - if (WARN(!ops, KERN_ERR - "missing sysfs attribute operations for kobject: %s\n", - kobject_name(kobj))) - goto err_out; - - has_read = ops->show; - has_write = ops->store; - has_mmap = false; - } + has_read = ops->seq_show || ops->read || ops->mmap; + has_write = ops->write || ops->mmap; + has_mmap = ops->mmap; /* check perms and supported operations */ if ((file->f_mode & FMODE_WRITE) && @@ -729,10 +734,10 @@ static int kernfs_file_open(struct inode *inode, struct file *file) * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ - if (sysfs_is_bin(attr_sd)) - error = seq_open(file, NULL); - else + if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); + else + error = seq_open(file, NULL); if (error) goto err_free; @@ -777,7 +782,7 @@ void sysfs_unmap_bin_file(struct sysfs_dirent *sd) struct sysfs_open_dirent *od; struct sysfs_open_file *of; - if (!sysfs_is_bin(sd)) + if (!(sd->s_flags & SYSFS_FLAG_HAS_MMAP)) return; spin_lock_irq(&sysfs_open_dirent_lock); @@ -880,23 +885,96 @@ const struct file_operations kernfs_file_operations = { .poll = kernfs_file_poll, }; +static const struct kernfs_ops sysfs_file_kfops_empty = { +}; + +static const struct kernfs_ops sysfs_file_kfops_ro = { + .seq_show = sysfs_kf_seq_show, +}; + +static const struct kernfs_ops sysfs_file_kfops_wo = { + .write = sysfs_kf_write, +}; + +static const struct kernfs_ops sysfs_file_kfops_rw = { + .seq_show = sysfs_kf_seq_show, + .write = sysfs_kf_write, +}; + +static const struct kernfs_ops sysfs_bin_kfops_ro = { + .read = sysfs_kf_bin_read, +}; + +static const struct kernfs_ops sysfs_bin_kfops_wo = { + .write = sysfs_kf_bin_write, +}; + +static const struct kernfs_ops sysfs_bin_kfops_rw = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, + .mmap = sysfs_kf_bin_mmap, +}; + int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type, umode_t amode, const void *ns) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; + const struct kernfs_ops *ops; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; int rc; + if (type == SYSFS_KOBJ_ATTR) { + struct kobject *kobj = dir_sd->priv; + const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; + + /* every kobject with an attribute needs a ktype assigned */ + if (WARN(!sysfs_ops, KERN_ERR + "missing sysfs attribute operations for kobject: %s\n", + kobject_name(kobj))) + return -EINVAL; + + if (sysfs_ops->show && sysfs_ops->store) + ops = &sysfs_file_kfops_rw; + else if (sysfs_ops->show) + ops = &sysfs_file_kfops_ro; + else if (sysfs_ops->store) + ops = &sysfs_file_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + } else { + struct bin_attribute *battr = (void *)attr; + + if ((battr->read && battr->write) || battr->mmap) + 
ops = &sysfs_bin_kfops_rw; + else if (battr->read) + ops = &sysfs_bin_kfops_ro; + else if (battr->write) + ops = &sysfs_bin_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + } + sd = sysfs_new_dirent(attr->name, mode, type); if (!sd) return -ENOMEM; + sd->s_attr.ops = ops; sd->s_ns = ns; sd->priv = (void *)attr; sysfs_dirent_init_lockdep(sd); + /* + * sd->s_attr.ops is accesible only while holding active ref. We + * need to know whether some ops are implemented outside active + * ref. Cache their existence in flags. + */ + if (ops->seq_show) + sd->s_flags |= SYSFS_FLAG_HAS_SEQ_SHOW; + if (ops->mmap) + sd->s_flags |= SYSFS_FLAG_HAS_MMAP; + sysfs_addrm_start(&acxt); rc = sysfs_add_one(&acxt, sd, dir_sd); sysfs_addrm_finish(&acxt); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 619250d2d7c..c05e0ddd026 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -27,6 +27,7 @@ struct sysfs_elem_symlink { }; struct sysfs_elem_attr { + const struct kernfs_ops *ops; struct sysfs_open_dirent *open; }; @@ -89,6 +90,8 @@ struct sysfs_dirent { #define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK #define SYSFS_FLAG_NS 0x01000 #define SYSFS_FLAG_REMOVED 0x02000 +#define SYSFS_FLAG_HAS_SEQ_SHOW 0x04000 +#define SYSFS_FLAG_HAS_MMAP 0x08000 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b923052c29d..97c6c0f9132 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -33,6 +33,32 @@ struct sysfs_open_file { const struct vm_operations_struct *vm_ops; }; +struct kernfs_ops { + /* + * Read is handled by either seq_file or raw_read(). + * + * If seq_show() is present, seq_file path is active. The behavior + * is equivalent to single_open(). @sf->private points to the + * associated sysfs_open_file. + * + * read() is bounced through kernel buffer and a read larger than + * PAGE_SIZE results in partial operation of PAGE_SIZE. + */ + int (*seq_show)(struct seq_file *sf, void *v); + + ssize_t (*read)(struct sysfs_open_file *of, char *buf, size_t bytes, + loff_t off); + + /* + * write() is bounced through kernel buffer and a write larger than + * PAGE_SIZE results in partial operation of PAGE_SIZE. + */ + ssize_t (*write)(struct sysfs_open_file *of, char *buf, size_t bytes, + loff_t off); + + int (*mmap)(struct sysfs_open_file *of, struct vm_area_struct *vma); +}; + #ifdef CONFIG_SYSFS struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, -- cgit v1.2.3-70-g09d2 From 471bd7b78bd56c580e91e00a0f656ca922ab3b3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:22 -0500 Subject: sysfs, kernfs: add sysfs_dirent->s_attr.size sysfs sets the size of regular files unconditionally at PAGE_SIZE and takes the size of bin files from bin_attribute. The latter is a pretty bad interface which forces bin_attribute users to create a separate copy of bin_attribute for each instance of the file - e.g. pci resource files. Add sysfs_dirent->s_attr.size so that the size can be specified separately. This unifies inode init paths of ATTR and BIN_ATTR identical and allows for generic size handling for kernfs. Unfortunately, this grows the size of sysfs_dirent by sizeof(loff_t). 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 6 ++++++ fs/sysfs/inode.c | 8 +------- fs/sysfs/sysfs.h | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index cbebc335af8..0b0cec8e9d8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -923,6 +923,7 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, const struct kernfs_ops *ops; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; + loff_t size; int rc; if (type == SYSFS_KOBJ_ATTR) { @@ -943,6 +944,8 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, ops = &sysfs_file_kfops_wo; else ops = &sysfs_file_kfops_empty; + + size = PAGE_SIZE; } else { struct bin_attribute *battr = (void *)attr; @@ -954,6 +957,8 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; + + size = battr->size; } sd = sysfs_new_dirent(attr->name, mode, type); @@ -961,6 +966,7 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, return -ENOMEM; sd->s_attr.ops = ops; + sd->s_attr.size = size; sd->s_ns = ns; sd->priv = (void *)attr; sysfs_dirent_init_lockdep(sd); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 4c463dabfc6..037a8925f56 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -254,8 +254,6 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { - struct bin_attribute *bin_attr; - inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; @@ -271,12 +269,8 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_fop = &sysfs_dir_operations; break; case SYSFS_KOBJ_ATTR: - inode->i_size = PAGE_SIZE; - inode->i_fop = &kernfs_file_operations; - break; case SYSFS_KOBJ_BIN_ATTR: - bin_attr = sd->priv; - inode->i_size = bin_attr->size; + inode->i_size = sd->s_attr.size; inode->i_fop = &kernfs_file_operations; break; case SYSFS_KOBJ_LINK: diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index c05e0ddd026..d40e85e8c2e 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -29,6 +29,7 @@ struct sysfs_elem_symlink { struct sysfs_elem_attr { const struct kernfs_ops *ops; struct sysfs_open_dirent *open; + loff_t size; }; struct sysfs_inode_attrs { -- cgit v1.2.3-70-g09d2 From a7dc66dfb4c6d6c1d7c14d5106ce467f1dbd4eba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:23 -0500 Subject: sysfs, kernfs: remove SYSFS_KOBJ_BIN_ATTR After kernfs_ops and sysfs_dirent->s_attr.size addition, the distinction between SYSFS_KOBJ_BIN_ATTR and SYSFS_KOBJ_ATTR is only necessary while creating files to decide which kernfs_ops to use. Afterwards, they behave exactly the same. This patch removes SYSFS_KOBJ_BIN_ATTR along with sysfs_is_bin(). sysfs_add_file[_mode_ns]() are updated to take bool @is_bin instead of @type. This patch doesn't introduce any behavior changes. This completely isolates the distinction between the two sysfs file types in the sysfs layer proper. 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 23 ++++++++--------------- fs/sysfs/group.c | 5 ++--- fs/sysfs/inode.c | 1 - fs/sysfs/sysfs.h | 11 ++++------- 4 files changed, 14 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 0b0cec8e9d8..e2ce6743113 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -47,11 +47,6 @@ struct sysfs_open_dirent { struct list_head files; /* goes through sysfs_open_file.list */ }; -static bool sysfs_is_bin(struct sysfs_dirent *sd) -{ - return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR; -} - static struct sysfs_open_file *sysfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; @@ -916,7 +911,7 @@ static const struct kernfs_ops sysfs_bin_kfops_rw = { }; int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, + const struct attribute *attr, bool is_bin, umode_t amode, const void *ns) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; @@ -926,7 +921,7 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, loff_t size; int rc; - if (type == SYSFS_KOBJ_ATTR) { + if (!is_bin) { struct kobject *kobj = dir_sd->priv; const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; @@ -961,7 +956,7 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, size = battr->size; } - sd = sysfs_new_dirent(attr->name, mode, type); + sd = sysfs_new_dirent(attr->name, mode, SYSFS_KOBJ_ATTR); if (!sd) return -ENOMEM; @@ -991,11 +986,10 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, return rc; } - int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, - int type) + bool is_bin) { - return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL); + return sysfs_add_file_mode_ns(dir_sd, attr, is_bin, attr->mode, NULL); } /** @@ -1009,8 +1003,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, - attr->mode, ns); + return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); @@ -1049,7 +1042,7 @@ int sysfs_add_file_to_group(struct kobject *kobj, if (!dir_sd) return -ENOENT; - error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); + error = sysfs_add_file(dir_sd, attr, false); sysfs_put(dir_sd); return error; @@ -1141,7 +1134,7 @@ int sysfs_create_bin_file(struct kobject *kobj, { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); + return sysfs_add_file(kobj->sd, &attr->attr, true); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 065689ddb4c..9f65cd97a2d 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -55,8 +55,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (!mode) continue; } - error = sysfs_add_file_mode_ns(dir_sd, *attr, - SYSFS_KOBJ_ATTR, + error = sysfs_add_file_mode_ns(dir_sd, *attr, false, (*attr)->mode | mode, NULL); if (unlikely(error)) @@ -269,7 +268,7 @@ int sysfs_merge_group(struct kobject *kobj, return -ENOENT; for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) - error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); + error = sysfs_add_file(dir_sd, *attr, false); if (error) { while (--i >= 0) kernfs_remove_by_name(dir_sd, (*--attr)->name); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 037a8925f56..b3c717ab349 100644 --- 
a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -269,7 +269,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_fop = &sysfs_dir_operations; break; case SYSFS_KOBJ_ATTR: - case SYSFS_KOBJ_BIN_ATTR: inode->i_size = sd->s_attr.size; inode->i_fop = &kernfs_file_operations; break; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d40e85e8c2e..28898fa551c 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -83,10 +83,9 @@ struct sysfs_dirent { #define SYSFS_TYPE_MASK 0x00ff #define SYSFS_DIR 0x0001 #define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_BIN_ATTR 0x0004 #define SYSFS_KOBJ_LINK 0x0008 #define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) -#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) +#define SYSFS_ACTIVE_REF SYSFS_KOBJ_ATTR #define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK #define SYSFS_FLAG_NS 0x01000 @@ -115,10 +114,8 @@ do { \ static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) { struct attribute *attr = sd->priv; - int type = sysfs_type(sd); - return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) && - attr->ignore_lockdep; + return sysfs_type(sd) == SYSFS_KOBJ_ATTR && attr->ignore_lockdep; } #else @@ -219,10 +216,10 @@ int sysfs_inode_init(void); extern const struct file_operations kernfs_file_operations; int sysfs_add_file(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type); + const struct attribute *attr, bool is_bin); int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, + const struct attribute *attr, bool is_bin, umode_t amode, const void *ns); void sysfs_unmap_bin_file(struct sysfs_dirent *sd); -- cgit v1.2.3-70-g09d2 From 496f73944a4a974f89d48920bf368aec8841b195 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:24 -0500 Subject: sysfs, kernfs: introduce kernfs_create_file[_ns]() Introduce kernfs interface to create a file which takes and returns sysfs_dirents. The actual file creation part is separated out from sysfs_add_file_mode_ns() into kernfs_create_file_ns(). The former now only decides the kernfs_ops to use and the file's size and invokes the latter. This patch doesn't introduce behavior changes. v2: Dummy implementation for !CONFIG_SYSFS updated to return -ENOSYS. 
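
Illustration, not part of the patch: a hedged sketch of the resulting interface, creating a read-only seq_show-backed file under an existing node via the kernfs_create_file() wrapper added below; the example_* names are hypothetical and error handling is left to the caller (a duplicate name comes back as ERR_PTR(-EEXIST)).

#include <linux/kernfs.h>
#include <linux/seq_file.h>
#include <linux/mm.h>

static int example_status_show(struct seq_file *sf, void *v)
{
        seq_puts(sf, "ok\n");
        return 0;
}

static const struct kernfs_ops example_status_ops = {
        .seq_show       = example_status_show,
};

static struct sysfs_dirent *example_add_status(struct sysfs_dirent *parent)
{
        /* NULL ns tag; returns the new node or an ERR_PTR() value */
        return kernfs_create_file(parent, "status", 0444, PAGE_SIZE,
                                  &example_status_ops, NULL);
}
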
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 53 +++++++++++++++++++++++++++++++++++++++----------- include/linux/kernfs.h | 18 +++++++++++++++++ 2 files changed, 60 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e2ce6743113..69cca0f4ccf 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -912,14 +912,11 @@ static const struct kernfs_ops sysfs_bin_kfops_rw = { int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, const struct attribute *attr, bool is_bin, - umode_t amode, const void *ns) + umode_t mode, const void *ns) { - umode_t mode = (amode & S_IALLUGO) | S_IFREG; const struct kernfs_ops *ops; - struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; loff_t size; - int rc; if (!is_bin) { struct kobject *kobj = dir_sd->priv; @@ -956,14 +953,47 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, size = battr->size; } - sd = sysfs_new_dirent(attr->name, mode, SYSFS_KOBJ_ATTR); + sd = kernfs_create_file_ns(dir_sd, attr->name, mode, size, + ops, (void *)attr, ns); + if (IS_ERR(sd)) { + if (PTR_ERR(sd) == -EEXIST) + sysfs_warn_dup(dir_sd, attr->name); + return PTR_ERR(sd); + } + return 0; +} + +/** + * kernfs_create_file_ns - create a file + * @parent: directory to create the file in + * @name: name of the file + * @mode: mode of the file + * @size: size of the file + * @ops: kernfs operations for the file + * @priv: private data for the file + * @ns: optional namespace tag of the file + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct sysfs_dirent *kernfs_create_file_ns(struct sysfs_dirent *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns) +{ + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + int rc; + + sd = sysfs_new_dirent(name, (mode & S_IALLUGO) | S_IFREG, + SYSFS_KOBJ_ATTR); if (!sd) - return -ENOMEM; + return ERR_PTR(-ENOMEM); sd->s_attr.ops = ops; sd->s_attr.size = size; sd->s_ns = ns; - sd->priv = (void *)attr; + sd->priv = priv; sysfs_dirent_init_lockdep(sd); /* @@ -977,13 +1007,14 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, sd->s_flags |= SYSFS_FLAG_HAS_MMAP; sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, dir_sd); + rc = __sysfs_add_one(&acxt, sd, parent); sysfs_addrm_finish(&acxt); - if (rc) + if (rc) { sysfs_put(sd); - - return rc; + return ERR_PTR(rc); + } + return sd; } int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 97c6c0f9132..d0912cf0208 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -64,6 +64,11 @@ struct kernfs_ops { struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, const void *ns); +struct sysfs_dirent *kernfs_create_file_ns(struct sysfs_dirent *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns); struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, const char *name, struct sysfs_dirent *target); @@ -82,6 +87,12 @@ kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } +static inline struct sysfs_dirent * +kernfs_create_file_ns(struct sysfs_dirent *parent, const char *name, + umode_t mode, loff_t size, const struct kernfs_ops *ops, + void *priv, const void *ns) +{ return ERR_PTR(-ENOSYS); } + static inline struct 
sysfs_dirent * kernfs_create_link(struct sysfs_dirent *parent, const char *name, struct sysfs_dirent *target) @@ -112,6 +123,13 @@ kernfs_create_dir(struct sysfs_dirent *parent, const char *name, void *priv) return kernfs_create_dir_ns(parent, name, priv, NULL); } +static inline struct sysfs_dirent * +kernfs_create_file(struct sysfs_dirent *parent, const char *name, umode_t mode, + loff_t size, const struct kernfs_ops *ops, void *priv) +{ + return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL); +} + static inline int kernfs_remove_by_name(struct sysfs_dirent *parent, const char *name) { -- cgit v1.2.3-70-g09d2 From 2d0cfbec2a95c16818960fda1dfa815fd1a62070 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:25 -0500 Subject: sysfs, kernfs: remove sysfs_add_one() sysfs_add_one() is a wrapper around __sysfs_add_one() which prints out duplicate name warning if __sysfs_add_one() fails with -EEXIST. The previous kernfs conversions moved all dup warnings to sysfs interface functions and sysfs_add_one() doesn't have any user left. Remove sysfs_add_one() and update __sysfs_add_one() to take its name. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 41 ++++------------------------------------- fs/sysfs/file.c | 2 +- fs/sysfs/symlink.c | 2 +- fs/sysfs/sysfs.h | 2 -- 4 files changed, 6 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index cfbf4091fe5..e88e9a94a08 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -395,7 +395,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) } /** - * __sysfs_add_one - add sysfs_dirent to parent without warning + * sysfs_add_one - add sysfs_dirent to parent without warning * @acxt: addrm context to use * @sd: sysfs_dirent to be added * @parent_sd: the parent sysfs_dirent to add @sd to @@ -415,8 +415,8 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) * 0 on success, -EEXIST if entry with the given name already * exists. */ -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd) { bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS; struct sysfs_inode_attrs *ps_iattr; @@ -487,39 +487,6 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) kfree(path); } -/** - * sysfs_add_one - add sysfs_dirent to parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to - * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children - * list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. 
- */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) -{ - int ret; - - ret = __sysfs_add_one(acxt, sd, parent_sd); - - if (ret == -EEXIST) - sysfs_warn_dup(parent_sd, sd->s_name); - return ret; -} - /** * sysfs_remove_one - remove sysfs_dirent from parent * @acxt: addrm context to use @@ -694,7 +661,7 @@ struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, /* link in */ sysfs_addrm_start(&acxt); - rc = __sysfs_add_one(&acxt, sd, parent); + rc = sysfs_add_one(&acxt, sd, parent); sysfs_addrm_finish(&acxt); if (!rc) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 69cca0f4ccf..9852450867c 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -1007,7 +1007,7 @@ struct sysfs_dirent *kernfs_create_file_ns(struct sysfs_dirent *parent, sd->s_flags |= SYSFS_FLAG_HAS_MMAP; sysfs_addrm_start(&acxt); - rc = __sysfs_add_one(&acxt, sd, parent); + rc = sysfs_add_one(&acxt, sd, parent); sysfs_addrm_finish(&acxt); if (rc) { diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 352fbbbc055..76efeab6db4 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -47,7 +47,7 @@ struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, sysfs_get(target); /* ref owned by symlink */ sysfs_addrm_start(&acxt); - error = __sysfs_add_one(&acxt, sd, parent); + error = sysfs_add_one(&acxt, sd, parent); sysfs_addrm_finish(&acxt); if (!error) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 28898fa551c..a6542d27bd9 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -167,8 +167,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); void sysfs_put_active(struct sysfs_dirent *sd); void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name); -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); -- cgit v1.2.3-70-g09d2 From d19b9846df64d8845be682b6318bd1aee246cf60 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:26 -0500 Subject: sysfs, kernfs: add kernfs_ops->seq_{start|next|stop}() kernfs_ops currently only supports single_open() behavior which is pretty restrictive. Add optional callbacks ->seq_{start|next|stop}() which, when implemented, are invoked for seq_file traversal. This allows full seq_file functionality for kernfs users. This currently doesn't have any user and doesn't change any behavior. v2: Refreshed on top of the updated "sysfs, kernfs: prepare read path for kernfs". 
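[Editorial illustration, not part of this patch: a hedged sketch of a kernfs_ops that uses the new optional seq callbacks to emit one line per element of a static table, following the callback signatures added above; the example_* names and the table contents are hypothetical.]

        /* hypothetical multi-record kernfs file */
        #include <linux/kernfs.h>
        #include <linux/seq_file.h>
        #include <linux/kernel.h>

        static const char *example_lines[] = { "alpha", "beta", "gamma" };

        static void *example_seq_start(struct seq_file *sf, loff_t *ppos)
        {
                if (*ppos >= (loff_t)ARRAY_SIZE(example_lines))
                        return NULL;            /* NULL terminates the walk */
                return &example_lines[*ppos];
        }

        static void *example_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
        {
                (*ppos)++;
                return example_seq_start(sf, ppos);
        }

        static void example_seq_stop(struct seq_file *sf, void *v)
        {
                /* nothing to release in this sketch */
        }

        static int example_seq_show(struct seq_file *sf, void *v)
        {
                seq_printf(sf, "%s\n", *(const char **)v);
                return 0;
        }

        static const struct kernfs_ops example_seq_ops = {
                .seq_start = example_seq_start,
                .seq_next  = example_seq_next,
                .seq_stop  = example_seq_stop,
                .seq_show  = example_seq_show,
        };

[If the three optional callbacks are left NULL, kernfs falls back to the single_open()-style behavior described above.]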
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 39 ++++++++++++++++++++++++++++----------- include/linux/kernfs.h | 9 +++++++-- 2 files changed, 35 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 9852450867c..74e3478d9cb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -146,6 +146,7 @@ static ssize_t sysfs_kf_bin_read(struct sysfs_open_file *of, char *buf, static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct sysfs_open_file *of = sf->private; + const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is just to ensure that @@ -155,26 +156,42 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) if (!sysfs_get_active(of->sd)) return ERR_PTR(-ENODEV); - /* - * The same behavior and code as single_open(). Returns !NULL if - * pos is at the beginning; otherwise, NULL. - */ - return NULL + !*ppos; + ops = kernfs_ops(of->sd); + if (ops->seq_start) { + return ops->seq_start(sf, ppos); + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; + } } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { - /* - * The same behavior and code as single_open(), always terminate - * after the initial read. - */ - ++*ppos; - return NULL; + struct sysfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->sd); + + if (ops->seq_next) { + return ops->seq_next(sf, v, ppos); + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct sysfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->sd); + + if (ops->seq_stop) + ops->seq_stop(sf, v); sysfs_put_active(of->sd); mutex_unlock(&of->mutex); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d0912cf0208..ba993ebcd81 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -37,8 +37,9 @@ struct kernfs_ops { /* * Read is handled by either seq_file or raw_read(). * - * If seq_show() is present, seq_file path is active. The behavior - * is equivalent to single_open(). @sf->private points to the + * If seq_show() is present, seq_file path is active. Other seq + * operations are optional and if not implemented, the behavior is + * equivalent to single_open(). @sf->private points to the * associated sysfs_open_file. * * read() is bounced through kernel buffer and a read larger than @@ -46,6 +47,10 @@ struct kernfs_ops { */ int (*seq_show)(struct seq_file *sf, void *v); + void *(*seq_start)(struct seq_file *sf, loff_t *ppos); + void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); + void (*seq_stop)(struct seq_file *sf, void *v); + ssize_t (*read)(struct sysfs_open_file *of, char *buf, size_t bytes, loff_t off); -- cgit v1.2.3-70-g09d2 From 024f647117d697165aaadf3f1af1343b7000149a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:27 -0500 Subject: sysfs, kernfs: introduce kernfs_notify() Introduce kernfs interface to wake up poll(2) which takes and returns sysfs_dirents. sysfs_notify_dirent() is renamed to kernfs_notify() and sysfs_notify() is updated so that it doesn't directly grab sysfs_mutex but acquires the target sysfs_dirents using sysfs_get_dirent(). sysfs_notify_dirent() is reimplemented as a dumb inline wrapper around kernfs_notify(). 
This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 33 ++++++++++++++++++++++----------- include/linux/kernfs.h | 3 +++ include/linux/sysfs.h | 9 +++++---- 3 files changed, 30 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 74e3478d9cb..a68cbef3a67 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -851,7 +851,13 @@ static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) return DEFAULT_POLLMASK|POLLERR|POLLPRI; } -void sysfs_notify_dirent(struct sysfs_dirent *sd) +/** + * kernfs_notify - notify a kernfs file + * @sd: file to notify + * + * Notify @sd such that poll(2) on @sd wakes up. + */ +void kernfs_notify(struct sysfs_dirent *sd) { struct sysfs_open_dirent *od; unsigned long flags; @@ -868,22 +874,27 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd) spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); } -EXPORT_SYMBOL_GPL(sysfs_notify_dirent); +EXPORT_SYMBOL_GPL(kernfs_notify); void sysfs_notify(struct kobject *k, const char *dir, const char *attr) { - struct sysfs_dirent *sd = k->sd; - - mutex_lock(&sysfs_mutex); + struct sysfs_dirent *sd = k->sd, *tmp; if (sd && dir) - sd = sysfs_find_dirent(sd, dir, NULL); - if (sd && attr) - sd = sysfs_find_dirent(sd, attr, NULL); - if (sd) - sysfs_notify_dirent(sd); + sd = sysfs_get_dirent(sd, dir); + else + sysfs_get(sd); - mutex_unlock(&sysfs_mutex); + if (sd && attr) { + tmp = sysfs_get_dirent(sd, attr); + sysfs_put(sd); + sd = tmp; + } + + if (sd) { + kernfs_notify(sd); + sysfs_put(sd); + } } EXPORT_SYMBOL_GPL(sysfs_notify); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index ba993ebcd81..f20796ecc76 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -84,6 +84,7 @@ int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, const char *new_name, const void *new_ns); void kernfs_enable_ns(struct sysfs_dirent *sd); int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr); +void kernfs_notify(struct sysfs_dirent *sd); #else /* CONFIG_SYSFS */ @@ -120,6 +121,8 @@ static inline int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) { return -ENOSYS; } +static inline void kernfs_notify(struct sysfs_dirent *sd) { } + #endif /* CONFIG_SYSFS */ static inline struct sysfs_dirent * diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 2bc735d3e93..0ab2b023b61 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -243,7 +243,6 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); -void sysfs_notify_dirent(struct sysfs_dirent *sd); struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, const unsigned char *name, const void *ns); @@ -418,9 +417,6 @@ static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } -static inline void sysfs_notify_dirent(struct sysfs_dirent *sd) -{ -} static inline struct sysfs_dirent * sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, const unsigned char *name, const void *ns) @@ -466,4 +462,9 @@ sysfs_get_dirent(struct sysfs_dirent *parent_sd, const unsigned char *name) return sysfs_get_dirent_ns(parent_sd, name, NULL); } +static inline void sysfs_notify_dirent(struct sysfs_dirent *sd) +{ + kernfs_notify(sd); +} + #endif /* _SYSFS_H_ */ -- cgit v1.2.3-70-g09d2 From 
2b25a62901a1af654c2604f19592b13742ad1601 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:28 -0500 Subject: sysfs, kernfs: reorganize SYSFS_* constants We want to add one more SYSFS_FLAG_* but we can't use the next higher bit, 0x10000, as the flag field is 16 bits wide. The flags are currently arranged weirdly - 8 bits are set aside for the type flags when only three are used, the first flag starts at 0x1000 instead of 0x0100 and flag literals have 5 digits (20 bits) when only 4 digits can be used. Rearrange them so that type bits are only the lowest four, flags start at 0x0010 and similar flags are grouped. This patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/sysfs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index a6542d27bd9..c86456c9b19 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -80,18 +80,18 @@ struct sysfs_dirent { #define SD_DEACTIVATED_BIAS INT_MIN -#define SYSFS_TYPE_MASK 0x00ff +#define SYSFS_TYPE_MASK 0x000f #define SYSFS_DIR 0x0001 #define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_LINK 0x0008 +#define SYSFS_KOBJ_LINK 0x0004 #define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) #define SYSFS_ACTIVE_REF SYSFS_KOBJ_ATTR #define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_NS 0x01000 -#define SYSFS_FLAG_REMOVED 0x02000 -#define SYSFS_FLAG_HAS_SEQ_SHOW 0x04000 -#define SYSFS_FLAG_HAS_MMAP 0x08000 +#define SYSFS_FLAG_REMOVED 0x0010 +#define SYSFS_FLAG_NS 0x0020 +#define SYSFS_FLAG_HAS_SEQ_SHOW 0x0040 +#define SYSFS_FLAG_HAS_MMAP 0x0080 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { return sd->s_flags & SYSFS_TYPE_MASK; } -- cgit v1.2.3-70-g09d2 From 517e64f57883bd63c5a4ab8b3d0d3ed68c55d0cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:29 -0500 Subject: sysfs, kernfs: revamp sysfs_dirent active_ref lockdep annotation Currently, sysfs_dirent active_ref lockdep annotation uses attribute->[s]key as the lockdep key, which forces kernfs_create_file_ns() to assume that sysfs_dirent->priv is pointing to a struct attribute which may not be true for non-sysfs users. This patch restructures the lockdep annotation such that * kernfs_ops contains lockdep_key which is used by default for files created by kernfs_create_file_ns(). * kernfs_create_file_ns_key() is introduced which takes an extra @key argument. The created file will use the specified key for active_ref lockdep annotation. If NULL is specified, lockdep for the file is disabled. * sysfs_add_file_mode_ns() is updated to use kernfs_create_file_ns_key() with the appropriate key from the attribute or NULL if ignore_lockdep is set. This makes the lockdep annotation properly contained in kernfs while allowing sysfs to cleanly keep its current behavior. This patch doesn't introduce any behavior differences.
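[Editorial illustration, not part of this patch: a sketch of how a non-sysfs caller might pass its own lock_class_key to kernfs_create_file_ns_key() per the signature introduced above; the example_* names, mode and size are hypothetical, and example_ops is assumed to be a kernfs_ops defined elsewhere.]

        /* hypothetical caller supplying its own lockdep class */
        #include <linux/kernfs.h>
        #include <linux/lockdep.h>
        #include <linux/err.h>

        static struct lock_class_key example_key;   /* one class for all such files */

        static struct sysfs_dirent *
        example_add_keyed_file(struct sysfs_dirent *parent,
                               const struct kernfs_ops *example_ops)
        {
                struct sysfs_dirent *sd;

                /* pass &example_key to annotate active_ref, or NULL to
                 * disable lockdep for this file (as sysfs does for
                 * attributes with ignore_lockdep set) */
                sd = kernfs_create_file_ns_key(parent, "example", 0644, 0,
                                               example_ops, NULL, NULL,
                                               &example_key);
                if (IS_ERR(sd))
                        return NULL;
                return sd;
        }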
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 ++-- fs/sysfs/file.c | 35 ++++++++++++++++++++++++----------- fs/sysfs/sysfs.h | 32 +------------------------------- include/linux/kernfs.h | 37 +++++++++++++++++++++++++++++-------- 4 files changed, 56 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e88e9a94a08..8f2d577b5f6 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -150,7 +150,7 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) if (!atomic_inc_unless_negative(&sd->s_active)) return NULL; - if (likely(!sysfs_ignore_lockdep(sd))) + if (sd->s_flags & SYSFS_FLAG_LOCKDEP) rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); return sd; } @@ -169,7 +169,7 @@ void sysfs_put_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return; - if (likely(!sysfs_ignore_lockdep(sd))) + if (sd->s_flags & SYSFS_FLAG_LOCKDEP) rwsem_release(&sd->dep_map, 1, _RET_IP_); v = atomic_dec_return(&sd->s_active); if (likely(v != SD_DEACTIVATED_BIAS)) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index a68cbef3a67..e4eca285b39 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -58,7 +58,7 @@ static struct sysfs_open_file *sysfs_of(struct file *file) */ static const struct kernfs_ops *kernfs_ops(struct sysfs_dirent *sd) { - if (!sysfs_ignore_lockdep(sd)) + if (sd->s_flags & SYSFS_FLAG_LOCKDEP) lockdep_assert_held(sd); return sd->s_attr.ops; } @@ -71,7 +71,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) { struct kobject *kobj = sd->s_parent->priv; - if (!sysfs_ignore_lockdep(sd)) + if (sd->s_flags & SYSFS_FLAG_LOCKDEP) lockdep_assert_held(sd); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } @@ -942,6 +942,7 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, const struct attribute *attr, bool is_bin, umode_t mode, const void *ns) { + struct lock_class_key *key = NULL; const struct kernfs_ops *ops; struct sysfs_dirent *sd; loff_t size; @@ -981,8 +982,12 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, size = battr->size; } - sd = kernfs_create_file_ns(dir_sd, attr->name, mode, size, - ops, (void *)attr, ns); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!attr->ignore_lockdep) + key = attr->key ?: (struct lock_class_key *)&attr->skey; +#endif + sd = kernfs_create_file_ns_key(dir_sd, attr->name, mode, size, + ops, (void *)attr, ns, key); if (IS_ERR(sd)) { if (PTR_ERR(sd) == -EEXIST) sysfs_warn_dup(dir_sd, attr->name); @@ -992,7 +997,7 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, } /** - * kernfs_create_file_ns - create a file + * kernfs_create_file_ns_key - create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file @@ -1000,14 +1005,16 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file + * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. 
*/ -struct sysfs_dirent *kernfs_create_file_ns(struct sysfs_dirent *parent, - const char *name, - umode_t mode, loff_t size, - const struct kernfs_ops *ops, - void *priv, const void *ns) +struct sysfs_dirent *kernfs_create_file_ns_key(struct sysfs_dirent *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + struct lock_class_key *key) { struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; @@ -1022,7 +1029,13 @@ struct sysfs_dirent *kernfs_create_file_ns(struct sysfs_dirent *parent, sd->s_attr.size = size; sd->s_ns = ns; sd->priv = priv; - sysfs_dirent_init_lockdep(sd); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (key) { + lockdep_init_map(&sd->dep_map, "s_active", key, 0); + sd->s_flags |= SYSFS_FLAG_LOCKDEP; + } +#endif /* * sd->s_attr.ops is accesible only while holding active ref. We diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index c86456c9b19..e93f8b84561 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -92,43 +92,13 @@ struct sysfs_dirent { #define SYSFS_FLAG_NS 0x0020 #define SYSFS_FLAG_HAS_SEQ_SHOW 0x0040 #define SYSFS_FLAG_HAS_MMAP 0x0080 +#define SYSFS_FLAG_LOCKDEP 0x0100 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { return sd->s_flags & SYSFS_TYPE_MASK; } -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -#define sysfs_dirent_init_lockdep(sd) \ -do { \ - struct attribute *attr = sd->priv; \ - struct lock_class_key *key = attr->key; \ - if (!key) \ - key = &attr->skey; \ - \ - lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ -} while (0) - -/* Test for attributes that want to ignore lockdep for read-locking */ -static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) -{ - struct attribute *attr = sd->priv; - - return sysfs_type(sd) == SYSFS_KOBJ_ATTR && attr->ignore_lockdep; -} - -#else - -#define sysfs_dirent_init_lockdep(sd) do {} while (0) - -static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) -{ - return true; -} - -#endif - /* * Context structure to be used while adding/removing nodes. 
*/ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index f20796ecc76..105d09dcb06 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -11,6 +11,7 @@ #include #include #include +#include struct file; struct iattr; @@ -62,6 +63,10 @@ struct kernfs_ops { loff_t off); int (*mmap)(struct sysfs_open_file *of, struct vm_area_struct *vma); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lockdep_key; +#endif }; #ifdef CONFIG_SYSFS @@ -69,11 +74,12 @@ struct kernfs_ops { struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, const void *ns); -struct sysfs_dirent *kernfs_create_file_ns(struct sysfs_dirent *parent, - const char *name, - umode_t mode, loff_t size, - const struct kernfs_ops *ops, - void *priv, const void *ns); +struct sysfs_dirent *kernfs_create_file_ns_key(struct sysfs_dirent *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + struct lock_class_key *key); struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, const char *name, struct sysfs_dirent *target); @@ -94,9 +100,10 @@ kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, { return ERR_PTR(-ENOSYS); } static inline struct sysfs_dirent * -kernfs_create_file_ns(struct sysfs_dirent *parent, const char *name, - umode_t mode, loff_t size, const struct kernfs_ops *ops, - void *priv, const void *ns) +kernfs_create_file_ns_key(struct sysfs_dirent *parent, const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, void *priv, + const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct sysfs_dirent * @@ -131,6 +138,20 @@ kernfs_create_dir(struct sysfs_dirent *parent, const char *name, void *priv) return kernfs_create_dir_ns(parent, name, priv, NULL); } +static inline struct sysfs_dirent * +kernfs_create_file_ns(struct sysfs_dirent *parent, const char *name, + umode_t mode, loff_t size, const struct kernfs_ops *ops, + void *priv, const void *ns) +{ + struct lock_class_key *key = NULL; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + key = (struct lock_class_key *)&ops->lockdep_key; +#endif + return kernfs_create_file_ns_key(parent, name, mode, size, ops, priv, + ns, key); +} + static inline struct sysfs_dirent * kernfs_create_file(struct sysfs_dirent *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv) -- cgit v1.2.3-70-g09d2 From ccf73cf336dc55bc52748205dee998d2fd4a8808 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:30 -0500 Subject: sysfs, kernfs: introduce kernfs[_find_and]_get() and kernfs_put() Introduce kernfs interface for finding, getting and putting sysfs_dirents. * sysfs_find_dirent() is renamed to kernfs_find_ns() and lockdep assertion for sysfs_mutex is added. * sysfs_get_dirent_ns() is renamed to kernfs_find_and_get(). * Macro inline dancing around __sysfs_get/put() are removed and kernfs_get/put() are made proper functions implemented in fs/sysfs/dir.c. While the conversions are mostly equivalent, there's one difference - kernfs_get() doesn't return the input param as its return value. This change is intentional. While passing through the input increases writability in some areas, it is unnecessary and has been shown to cause confusion regarding how the last ref is handled. 
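[Editorial illustration, not part of this patch: a sketch of the new lookup and reference-counting calls, highlighting that kernfs_get() - unlike the old sysfs_get() - does not return its argument; the example_* names and the looked-up file name are hypothetical.]

        /* hypothetical user of kernfs_find_and_get()/kernfs_put() */
        #include <linux/kernfs.h>

        static void example_poke(struct sysfs_dirent *parent)
        {
                struct sysfs_dirent *sd;

                /* takes a reference on success, returns NULL if not found */
                sd = kernfs_find_and_get(parent, "example");
                if (!sd)
                        return;

                kernfs_notify(sd);      /* e.g. wake up poll(2) waiters */
                kernfs_put(sd);         /* drop the reference from find_and_get */
        }

        static struct sysfs_dirent *example_pin(struct sysfs_dirent *sd)
        {
                /* kernfs_get() returns void; pass the pointer on explicitly */
                kernfs_get(sd);
                return sd;
        }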
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 117 ++++++++++++++++++++++++++++--------------------- fs/sysfs/file.c | 41 +++++++++-------- fs/sysfs/group.c | 30 +++++++------ fs/sysfs/inode.c | 5 ++- fs/sysfs/mount.c | 14 ------ fs/sysfs/symlink.c | 16 ++++--- fs/sysfs/sysfs.h | 22 ---------- include/linux/kernfs.h | 19 ++++++++ include/linux/sysfs.h | 35 ++++++--------- 9 files changed, 151 insertions(+), 148 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 8f2d577b5f6..0d806efcc9a 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -240,10 +240,31 @@ static void sysfs_free_ino(unsigned int ino) spin_unlock(&sysfs_ino_lock); } -void release_sysfs_dirent(struct sysfs_dirent *sd) +/** + * kernfs_get - get a reference count on a sysfs_dirent + * @sd: the target sysfs_dirent + */ +void kernfs_get(struct sysfs_dirent *sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } +} +EXPORT_SYMBOL_GPL(kernfs_get); + +/** + * kernfs_put - put a reference count on a sysfs_dirent + * @sd: the target sysfs_dirent + * + * Put a reference count of @sd and destroy it if it reached zero. + */ +void kernfs_put(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd; + if (!sd || !atomic_dec_and_test(&sd->s_count)) + return; repeat: /* Moving/renaming is always done while holding reference. * sd->s_parent won't change beneath us. @@ -255,7 +276,7 @@ void release_sysfs_dirent(struct sysfs_dirent *sd) parent_sd ? parent_sd->s_name : "", sd->s_name); if (sysfs_type(sd) == SYSFS_KOBJ_LINK) - sysfs_put(sd->s_symlink.target_sd); + kernfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) kfree(sd->s_name); if (sd->s_iattr && sd->s_iattr->ia_secdata) @@ -269,6 +290,7 @@ void release_sysfs_dirent(struct sysfs_dirent *sd) if (sd && atomic_dec_and_test(&sd->s_count)) goto repeat; } +EXPORT_SYMBOL_GPL(kernfs_put); static int sysfs_dentry_delete(const struct dentry *dentry) { @@ -331,7 +353,7 @@ out_bad: static void sysfs_dentry_release(struct dentry *dentry) { - sysfs_put(dentry->d_fsdata); + kernfs_put(dentry->d_fsdata); } const struct dentry_operations sysfs_dentry_ops = { @@ -433,7 +455,8 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, return -EINVAL; sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = sysfs_get(parent_sd); + sd->s_parent = parent_sd; + kernfs_get(parent_sd); ret = sysfs_link_sibling(sd); if (ret) @@ -553,36 +576,33 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) sysfs_deactivate(sd); sysfs_unmap_bin_file(sd); - sysfs_put(sd); + kernfs_put(sd); } } /** - * sysfs_find_dirent - find sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent_sd. - * - * LOCKING: - * mutex_lock(sysfs_mutex) + * kernfs_find_ns - find sysfs_dirent with the given name + * @parent: sysfs_dirent to search under + * @name: name to look for + * @ns: the namespace tag to use * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. + * Look for sysfs_dirent with name @name under @parent. Returns pointer to + * the found sysfs_dirent on success, %NULL on failure. 
*/ -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns) +static struct sysfs_dirent *kernfs_find_ns(struct sysfs_dirent *parent, + const unsigned char *name, + const void *ns) { - struct rb_node *node = parent_sd->s_dir.children.rb_node; - bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS; + struct rb_node *node = parent->s_dir.children.rb_node; + bool has_ns = parent->s_flags & SYSFS_FLAG_NS; unsigned int hash; + lockdep_assert_held(&sysfs_mutex); + if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", - parent_sd->s_name, name); + parent->s_name, name); return NULL; } @@ -604,34 +624,28 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, } /** - * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent_sd and get - * it if found. - * - * LOCKING: - * Kernel thread context (may sleep). Grabs sysfs_mutex. + * kernfs_find_and_get_ns - find and get sysfs_dirent with the given name + * @parent: sysfs_dirent to search under + * @name: name to look for + * @ns: the namespace tag to use * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. + * Look for sysfs_dirent with name @name under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * sysfs_dirent on success, %NULL on failure. */ -struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns) +struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, + const char *name, const void *ns) { struct sysfs_dirent *sd; mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, name, ns); - sysfs_get(sd); + sd = kernfs_find_ns(parent, name, ns); + kernfs_get(sd); mutex_unlock(&sysfs_mutex); return sd; } -EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns); +EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_create_dir_ns - create a directory @@ -667,7 +681,7 @@ struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, if (!rc) return sd; - sysfs_put(sd); + kernfs_put(sd); return ERR_PTR(rc); } @@ -716,14 +730,15 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, if (parent_sd->s_flags & SYSFS_FLAG_NS) ns = sysfs_info(dir->i_sb)->ns; - sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns); + sd = kernfs_find_ns(parent_sd, dentry->d_name.name, ns); /* no such entry */ if (!sd) { ret = ERR_PTR(-ENOENT); goto out_unlock; } - dentry->d_fsdata = sysfs_get(sd); + kernfs_get(sd); + dentry->d_fsdata = sd; /* attach dentry and inode */ inode = sysfs_get_inode(dir->i_sb, sd); @@ -859,7 +874,7 @@ int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name, sysfs_addrm_start(&acxt); - sd = sysfs_find_dirent(dir_sd, name, ns); + sd = kernfs_find_ns(dir_sd, name, ns); if (sd) __kernfs_remove(&acxt, sd); @@ -925,7 +940,7 @@ int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, goto out; /* nothing to rename */ error = -EEXIST; - if (sysfs_find_dirent(new_parent, new_name, new_ns)) + if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; /* rename sysfs_dirent */ @@ -943,8 +958,8 @@ int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, * Move to the appropriate place in the appropriate directories 
rbtree. */ sysfs_unlink_sibling(sd); - sysfs_get(new_parent); - sysfs_put(sd->s_parent); + kernfs_get(new_parent); + kernfs_put(sd->s_parent); sd->s_ns = new_ns; sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); sd->s_parent = new_parent; @@ -1000,7 +1015,7 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd) static int sysfs_dir_release(struct inode *inode, struct file *filp) { - sysfs_put(filp->private_data); + kernfs_put(filp->private_data); return 0; } @@ -1011,7 +1026,7 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && pos->s_parent == parent_sd && hash == pos->s_hash; - sysfs_put(pos); + kernfs_put(pos); if (!valid) pos = NULL; } @@ -1075,8 +1090,10 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) unsigned int type = dt_type(pos); int len = strlen(name); ino_t ino = pos->s_ino; + ctx->pos = pos->s_hash; - file->private_data = sysfs_get(pos); + file->private_data = pos; + kernfs_get(pos); mutex_unlock(&sysfs_mutex); if (!dir_emit(ctx, name, len, ino, type)) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e4eca285b39..7f0a79fa2ed 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -881,19 +881,19 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) struct sysfs_dirent *sd = k->sd, *tmp; if (sd && dir) - sd = sysfs_get_dirent(sd, dir); + sd = kernfs_find_and_get(sd, dir); else - sysfs_get(sd); + kernfs_get(sd); if (sd && attr) { - tmp = sysfs_get_dirent(sd, attr); - sysfs_put(sd); + tmp = kernfs_find_and_get(sd, attr); + kernfs_put(sd); sd = tmp; } if (sd) { kernfs_notify(sd); - sysfs_put(sd); + kernfs_put(sd); } } EXPORT_SYMBOL_GPL(sysfs_notify); @@ -1052,7 +1052,7 @@ struct sysfs_dirent *kernfs_create_file_ns_key(struct sysfs_dirent *parent, sysfs_addrm_finish(&acxt); if (rc) { - sysfs_put(sd); + kernfs_put(sd); return ERR_PTR(rc); } return sd; @@ -1106,16 +1106,18 @@ int sysfs_add_file_to_group(struct kobject *kobj, struct sysfs_dirent *dir_sd; int error; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); - else - dir_sd = sysfs_get(kobj->sd); + if (group) { + dir_sd = kernfs_find_and_get(kobj->sd, group); + } else { + dir_sd = kobj->sd; + kernfs_get(dir_sd); + } if (!dir_sd) return -ENOENT; error = sysfs_add_file(dir_sd, attr, false); - sysfs_put(dir_sd); + kernfs_put(dir_sd); return error; } @@ -1135,7 +1137,7 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, struct iattr newattrs; int rc; - sd = sysfs_get_dirent(kobj->sd, attr->name); + sd = kernfs_find_and_get(kobj->sd, attr->name); if (!sd) return -ENOENT; @@ -1144,7 +1146,7 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, rc = kernfs_setattr(sd, &newattrs); - sysfs_put(sd); + kernfs_put(sd); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); @@ -1185,13 +1187,16 @@ void sysfs_remove_file_from_group(struct kobject *kobj, { struct sysfs_dirent *dir_sd; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); - else - dir_sd = sysfs_get(kobj->sd); + if (group) { + dir_sd = kernfs_find_and_get(kobj->sd, group); + } else { + dir_sd = kobj->sd; + kernfs_get(dir_sd); + } + if (dir_sd) { kernfs_remove_by_name(dir_sd, attr->name); - sysfs_put(dir_sd); + kernfs_put(dir_sd); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 9f65cd97a2d..7177532b8f7 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -108,13 +108,13 @@ static int internal_create_group(struct kobject *kobj, int update, } } 
else sd = kobj->sd; - sysfs_get(sd); + kernfs_get(sd); error = create_files(sd, kobj, grp, update); if (error) { if (grp->name) kernfs_remove(sd); } - sysfs_put(sd); + kernfs_put(sd); return error; } @@ -208,21 +208,23 @@ void sysfs_remove_group(struct kobject *kobj, struct sysfs_dirent *sd; if (grp->name) { - sd = sysfs_get_dirent(dir_sd, grp->name); + sd = kernfs_find_and_get(dir_sd, grp->name); if (!sd) { WARN(!sd, KERN_WARNING "sysfs group %p not found for kobject '%s'\n", grp, kobject_name(kobj)); return; } - } else - sd = sysfs_get(dir_sd); + } else { + sd = dir_sd; + kernfs_get(sd); + } remove_files(sd, kobj, grp); if (grp->name) kernfs_remove(sd); - sysfs_put(sd); + kernfs_put(sd); } EXPORT_SYMBOL_GPL(sysfs_remove_group); @@ -263,7 +265,7 @@ int sysfs_merge_group(struct kobject *kobj, struct attribute *const *attr; int i; - dir_sd = sysfs_get_dirent(kobj->sd, grp->name); + dir_sd = kernfs_find_and_get(kobj->sd, grp->name); if (!dir_sd) return -ENOENT; @@ -273,7 +275,7 @@ int sysfs_merge_group(struct kobject *kobj, while (--i >= 0) kernfs_remove_by_name(dir_sd, (*--attr)->name); } - sysfs_put(dir_sd); + kernfs_put(dir_sd); return error; } @@ -290,11 +292,11 @@ void sysfs_unmerge_group(struct kobject *kobj, struct sysfs_dirent *dir_sd; struct attribute *const *attr; - dir_sd = sysfs_get_dirent(kobj->sd, grp->name); + dir_sd = kernfs_find_and_get(kobj->sd, grp->name); if (dir_sd) { for (attr = grp->attrs; *attr; ++attr) kernfs_remove_by_name(dir_sd, (*attr)->name); - sysfs_put(dir_sd); + kernfs_put(dir_sd); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); @@ -312,12 +314,12 @@ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct sysfs_dirent *dir_sd; int error = 0; - dir_sd = sysfs_get_dirent(kobj->sd, group_name); + dir_sd = kernfs_find_and_get(kobj->sd, group_name); if (!dir_sd) return -ENOENT; error = sysfs_create_link_sd(dir_sd, target, link_name); - sysfs_put(dir_sd); + kernfs_put(dir_sd); return error; } @@ -334,10 +336,10 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, { struct sysfs_dirent *dir_sd; - dir_sd = sysfs_get_dirent(kobj->sd, group_name); + dir_sd = kernfs_find_and_get(kobj->sd, group_name); if (dir_sd) { kernfs_remove_by_name(dir_sd, link_name); - sysfs_put(dir_sd); + kernfs_put(dir_sd); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index b3c717ab349..bfe4478f82b 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -254,7 +254,8 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { - inode->i_private = sysfs_get(sd); + kernfs_get(sd); + inode->i_private = sd; inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; @@ -321,7 +322,7 @@ void sysfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); - sysfs_put(sd); + kernfs_put(sd); } int sysfs_permission(struct inode *inode, int mask) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8c24bce2f4a..852d11519f9 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -184,17 +184,3 @@ out_err: sysfs_dir_cachep = NULL; goto out; } - -#undef sysfs_get -struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) -{ - return __sysfs_get(sd); -} -EXPORT_SYMBOL_GPL(sysfs_get); - -#undef sysfs_put -void sysfs_put(struct sysfs_dirent *sd) -{ - __sysfs_put(sd); -} -EXPORT_SYMBOL_GPL(sysfs_put); diff 
--git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 76efeab6db4..b137aa3a486 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -44,7 +44,7 @@ struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, if (parent->s_flags & SYSFS_FLAG_NS) sd->s_ns = target->s_ns; sd->s_symlink.target_sd = target; - sysfs_get(target); /* ref owned by symlink */ + kernfs_get(target); /* ref owned by symlink */ sysfs_addrm_start(&acxt); error = sysfs_add_one(&acxt, sd, parent); @@ -53,7 +53,7 @@ struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, if (!error) return sd; - sysfs_put(sd); + kernfs_put(sd); return ERR_PTR(error); } @@ -72,15 +72,17 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (target->sd) - target_sd = sysfs_get(target->sd); + if (target->sd) { + target_sd = target->sd; + kernfs_get(target_sd); + } spin_unlock(&sysfs_symlink_target_lock); if (!target_sd) return -ENOENT; sd = kernfs_create_link(parent_sd, name, target_sd); - sysfs_put(target_sd); + kernfs_put(target_sd); if (!IS_ERR(sd)) return 0; @@ -216,7 +218,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, old_ns = targ->sd->s_ns; result = -ENOENT; - sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); + sd = kernfs_find_and_get_ns(parent_sd, old, old_ns); if (!sd) goto out; @@ -229,7 +231,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, result = kernfs_rename_ns(sd, parent_sd, new, new_ns); out: - sysfs_put(sd); + kernfs_put(sd); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index e93f8b84561..85315e22840 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -141,30 +141,8 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns); struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); -void release_sysfs_dirent(struct sysfs_dirent *sd); - -static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) -{ - if (sd) { - WARN_ON(!atomic_read(&sd->s_count)); - atomic_inc(&sd->s_count); - } - return sd; -} -#define sysfs_get(sd) __sysfs_get(sd) - -static inline void __sysfs_put(struct sysfs_dirent *sd) -{ - if (sd && atomic_dec_and_test(&sd->s_count)) - release_sysfs_dirent(sd); -} -#define sysfs_put(sd) __sysfs_put(sd) - /* * inode.c */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 105d09dcb06..fd8f574ef2f 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -71,6 +71,11 @@ struct kernfs_ops { #ifdef CONFIG_SYSFS +struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, + const char *name, const void *ns); +void kernfs_get(struct sysfs_dirent *sd); +void kernfs_put(struct sysfs_dirent *sd); + struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, const void *ns); @@ -94,6 +99,14 @@ void kernfs_notify(struct sysfs_dirent *sd); #else /* CONFIG_SYSFS */ +static inline struct sysfs_dirent * +kernfs_find_and_get_ns(struct sysfs_dirent *parent, const char *name, + const void *ns) +{ return NULL; } + +static inline void kernfs_get(struct sysfs_dirent *sd) { } +static inline void kernfs_put(struct sysfs_dirent *sd) { } + static inline struct sysfs_dirent 
* kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, const void *ns) @@ -132,6 +145,12 @@ static inline void kernfs_notify(struct sysfs_dirent *sd) { } #endif /* CONFIG_SYSFS */ +static inline struct sysfs_dirent * +kernfs_find_and_get(struct sysfs_dirent *sd, const char *name) +{ + return kernfs_find_and_get_ns(sd, name, NULL); +} + static inline struct sysfs_dirent * kernfs_create_dir(struct sysfs_dirent *parent, const char *name, void *priv) { diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 0ab2b023b61..cd8f90bf51a 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -243,11 +243,6 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); -struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns); -struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd); -void sysfs_put(struct sysfs_dirent *sd); int __must_check sysfs_init(void); @@ -417,19 +412,6 @@ static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } -static inline struct sysfs_dirent * -sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, const unsigned char *name, - const void *ns) -{ - return NULL; -} -static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) -{ - return NULL; -} -static inline void sysfs_put(struct sysfs_dirent *sd) -{ -} static inline int __must_check sysfs_init(void) { @@ -456,15 +438,26 @@ static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } +static inline void sysfs_notify_dirent(struct sysfs_dirent *sd) +{ + kernfs_notify(sd); +} + static inline struct sysfs_dirent * sysfs_get_dirent(struct sysfs_dirent *parent_sd, const unsigned char *name) { - return sysfs_get_dirent_ns(parent_sd, name, NULL); + return kernfs_find_and_get(parent_sd, name); } -static inline void sysfs_notify_dirent(struct sysfs_dirent *sd) +static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) { - kernfs_notify(sd); + kernfs_get(sd); + return sd; +} + +static inline void sysfs_put(struct sysfs_dirent *sd) +{ + kernfs_put(sd); } #endif /* _SYSFS_H_ */ -- cgit v1.2.3-70-g09d2 From ae6621b0716852146e4655fef7f74a181faa6c81 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:31 -0500 Subject: sysfs, kernfs: move internal decls to fs/kernfs/kernfs-internal.h Move data structure, constant and basic accessor declarations from fs/sysfs/sysfs.h to fs/kernfs/kernfs-internal.h. The two files currently include each other. Once kernfs / sysfs separation is complete, the cross inclusions will be removed. Inclusion protectors are added to fs/sysfs/sysfs.h to allow cross-inclusion. This patch doesn't introduce any functional changes. 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/kernfs-internal.h | 115 ++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/sysfs.h | 102 +++------------------------------------ 2 files changed, 121 insertions(+), 96 deletions(-) create mode 100644 fs/kernfs/kernfs-internal.h (limited to 'fs') diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h new file mode 100644 index 00000000000..5a2c3a17d7c --- /dev/null +++ b/fs/kernfs/kernfs-internal.h @@ -0,0 +1,115 @@ +/* + * fs/kernfs/kernfs-internal.h - kernfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo + * + * This file is released under the GPLv2. + */ + +#ifndef __KERNFS_INTERNAL_H +#define __KERNFS_INTERNAL_H + +#include +#include +#include + +#include + +struct sysfs_open_dirent; + +/* type-specific structures for sysfs_dirent->s_* union members */ +struct sysfs_elem_dir { + unsigned long subdirs; + /* children rbtree starts here and goes through sd->s_rb */ + struct rb_root children; +}; + +struct sysfs_elem_symlink { + struct sysfs_dirent *target_sd; +}; + +struct sysfs_elem_attr { + const struct kernfs_ops *ops; + struct sysfs_open_dirent *open; + loff_t size; +}; + +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + +/* + * sysfs_dirent - the building block of sysfs hierarchy. Each and + * every sysfs node is represented by single sysfs_dirent. + * + * As long as s_count reference is held, the sysfs_dirent itself is + * accessible. Dereferencing s_elem or any other outer entity + * requires s_active reference. + */ +struct sysfs_dirent { + atomic_t s_count; + atomic_t s_active; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif + struct sysfs_dirent *s_parent; + const char *s_name; + + struct rb_node s_rb; + + union { + struct completion *completion; + struct sysfs_dirent *removed_list; + } u; + + const void *s_ns; /* namespace tag */ + unsigned int s_hash; /* ns + name hash */ + union { + struct sysfs_elem_dir s_dir; + struct sysfs_elem_symlink s_symlink; + struct sysfs_elem_attr s_attr; + }; + + void *priv; + + unsigned short s_flags; + umode_t s_mode; + unsigned int s_ino; + struct sysfs_inode_attrs *s_iattr; +}; + +#define SD_DEACTIVATED_BIAS INT_MIN + +#define SYSFS_TYPE_MASK 0x000f +#define SYSFS_DIR 0x0001 +#define SYSFS_KOBJ_ATTR 0x0002 +#define SYSFS_KOBJ_LINK 0x0004 +#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) +#define SYSFS_ACTIVE_REF SYSFS_KOBJ_ATTR + +#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK +#define SYSFS_FLAG_REMOVED 0x0010 +#define SYSFS_FLAG_NS 0x0020 +#define SYSFS_FLAG_HAS_SEQ_SHOW 0x0040 +#define SYSFS_FLAG_HAS_MMAP 0x0080 +#define SYSFS_FLAG_LOCKDEP 0x0100 + +static inline unsigned int sysfs_type(struct sysfs_dirent *sd) +{ + return sd->s_flags & SYSFS_TYPE_MASK; +} + +/* + * Context structure to be used while adding/removing nodes. + */ +struct sysfs_addrm_cxt { + struct sysfs_dirent *removed; +}; + +#include "../sysfs/sysfs.h" + +#endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 85315e22840..f8c936f31b3 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,103 +8,11 @@ * This file is released under the GPLv2. 
*/ -#include -#include -#include -#include +#ifndef __SYSFS_INTERNAL_H +#define __SYSFS_INTERNAL_H -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - unsigned long subdirs; - /* children rbtree starts here and goes through sd->s_rb */ - struct rb_root children; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - const struct kernfs_ops *ops; - struct sysfs_open_dirent *open; - loff_t size; -}; - -struct sysfs_inode_attrs { - struct iattr ia_iattr; - void *ia_secdata; - u32 ia_secdata_len; -}; - -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. - * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. - */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - struct sysfs_dirent *s_parent; - const char *s_name; - - struct rb_node s_rb; - - union { - struct completion *completion; - struct sysfs_dirent *removed_list; - } u; - - const void *s_ns; /* namespace tag */ - unsigned int s_hash; /* ns + name hash */ - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - }; - - void *priv; - - unsigned short s_flags; - umode_t s_mode; - unsigned int s_ino; - struct sysfs_inode_attrs *s_iattr; -}; - -#define SD_DEACTIVATED_BIAS INT_MIN - -#define SYSFS_TYPE_MASK 0x000f -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_LINK 0x0004 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) -#define SYSFS_ACTIVE_REF SYSFS_KOBJ_ATTR - -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_REMOVED 0x0010 -#define SYSFS_FLAG_NS 0x0020 -#define SYSFS_FLAG_HAS_SEQ_SHOW 0x0040 -#define SYSFS_FLAG_HAS_MMAP 0x0080 -#define SYSFS_FLAG_LOCKDEP 0x0100 - -static inline unsigned int sysfs_type(struct sysfs_dirent *sd) -{ - return sd->s_flags & SYSFS_TYPE_MASK; -} - -/* - * Context structure to be used while adding/removing nodes. - */ -struct sysfs_addrm_cxt { - struct sysfs_dirent *removed; -}; +#include "../kernfs/kernfs-internal.h" +#include /* * mount.c @@ -175,3 +83,5 @@ void sysfs_unmap_bin_file(struct sysfs_dirent *sd); extern const struct inode_operations sysfs_symlink_inode_operations; int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, const char *name); + +#endif /* __SYSFS_INTERNAL_H */ -- cgit v1.2.3-70-g09d2 From ffed24e22845a3da0ae01095ae3f11c8d16e889d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:32 -0500 Subject: sysfs, kernfs: move inode code to fs/kernfs/inode.c There's nothing sysfs-specific in fs/sysfs/inode.c. Move everything in it to fs/kernfs/inode.c. The respective declarations in fs/sysfs/sysfs.h are moved to fs/kernfs/kernfs-internal.h. This is pure relocation. 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/inode.c | 327 ++++++++++++++++++++++++++++++++++++++++++ fs/kernfs/kernfs-internal.h | 13 ++ fs/sysfs/Makefile | 2 +- fs/sysfs/inode.c | 342 -------------------------------------------- fs/sysfs/sysfs.h | 13 -- 5 files changed, 341 insertions(+), 356 deletions(-) delete mode 100644 fs/sysfs/inode.c (limited to 'fs') diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 86bfeea07de..9d4fab49ea2 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -7,3 +7,330 @@ * * This file is released under the GPLv2. */ + +#include +#include +#include +#include +#include +#include +#include + +#include "kernfs-internal.h" + +static const struct address_space_operations sysfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info sysfs_backing_dev_info = { + .name = "sysfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations sysfs_inode_operations = { + .permission = sysfs_permission, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .setxattr = sysfs_setxattr, +}; + +int __init sysfs_inode_init(void) +{ + return bdi_init(&sysfs_backing_dev_info); +} + +static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *attrs; + struct iattr *iattrs; + + attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!attrs) + return NULL; + iattrs = &attrs->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = sd->s_mode; + iattrs->ia_uid = GLOBAL_ROOT_UID; + iattrs->ia_gid = GLOBAL_ROOT_GID; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + return attrs; +} + +static int __kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) +{ + struct sysfs_inode_attrs *sd_attrs; + struct iattr *iattrs; + unsigned int ia_valid = iattr->ia_valid; + + sd_attrs = sd->s_iattr; + + if (!sd_attrs) { + /* setting attributes for the first time, allocate now */ + sd_attrs = sysfs_init_inode_attrs(sd); + if (!sd_attrs) + return -ENOMEM; + sd->s_iattr = sd_attrs; + } + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = sd->s_mode = mode; + } + return 0; +} + +/** + * kernfs_setattr - set iattr on a node + * @sd: target node + * @iattr: iattr to set + * + * Returns 0 on success, -errno on failure. 
+ */ +int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) +{ + int ret; + + mutex_lock(&sysfs_mutex); + ret = __kernfs_setattr(sd, iattr); + mutex_unlock(&sysfs_mutex); + return ret; +} + +int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct sysfs_dirent *sd = dentry->d_fsdata; + int error; + + if (!sd) + return -EINVAL; + + mutex_lock(&sysfs_mutex); + error = inode_change_ok(inode, iattr); + if (error) + goto out; + + error = __kernfs_setattr(sd, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(inode, iattr); + +out: + mutex_unlock(&sysfs_mutex); + return error; +} + +static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, + u32 *secdata_len) +{ + struct sysfs_inode_attrs *iattrs; + void *old_secdata; + size_t old_secdata_len; + + if (!sd->s_iattr) { + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + } + + iattrs = sd->s_iattr; + old_secdata = iattrs->ia_secdata; + old_secdata_len = iattrs->ia_secdata_len; + + iattrs->ia_secdata = *secdata; + iattrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + void *secdata; + int error; + u32 secdata_len = 0; + + if (!sd) + return -EINVAL; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + goto out; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + goto out; + + mutex_lock(&sysfs_mutex); + error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); + mutex_unlock(&sysfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + } else + return -EINVAL; +out: + return error; +} + +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) +{ + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) +{ + struct sysfs_inode_attrs *iattrs = sd->s_iattr; + + inode->i_mode = sd->s_mode; + if (iattrs) { + /* sysfs_dirent has non-default attributes + * get them from persistent copy in sysfs_dirent + */ + set_inode_attr(inode, &iattrs->ia_iattr); + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); + } + + if (sysfs_type(sd) == SYSFS_DIR) + set_nlink(inode, sd->s_dir.subdirs + 2); +} + +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + +static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) +{ + kernfs_get(sd); + inode->i_private = sd; + inode->i_mapping->a_ops = &sysfs_aops; + inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; + 
inode->i_op = &sysfs_inode_operations; + + set_default_inode_attr(inode, sd->s_mode); + sysfs_refresh_inode(sd, inode); + + /* initialize inode according to type */ + switch (sysfs_type(sd)) { + case SYSFS_DIR: + inode->i_op = &sysfs_dir_inode_operations; + inode->i_fop = &sysfs_dir_operations; + break; + case SYSFS_KOBJ_ATTR: + inode->i_size = sd->s_attr.size; + inode->i_fop = &kernfs_file_operations; + break; + case SYSFS_KOBJ_LINK: + inode->i_op = &sysfs_symlink_inode_operations; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * sysfs_get_inode - get inode for sysfs_dirent + * @sb: super block + * @sd: sysfs_dirent to allocate inode for + * + * Get inode for @sd. If such inode doesn't exist, a new inode + * is allocated and basics are initialized. New inode is + * returned locked. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) +{ + struct inode *inode; + + inode = iget_locked(sb, sd->s_ino); + if (inode && (inode->i_state & I_NEW)) + sysfs_init_inode(sd, inode); + + return inode; +} + +/* + * The sysfs_dirent serves as both an inode and a directory entry for sysfs. + * To prevent the sysfs inode numbers from being freed prematurely we take a + * reference to sysfs_dirent from the sysfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void sysfs_evict_inode(struct inode *inode) +{ + struct sysfs_dirent *sd = inode->i_private; + + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + kernfs_put(sd); +} + +int sysfs_permission(struct inode *inode, int mask) +{ + struct sysfs_dirent *sd; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + sd = inode->i_private; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + return generic_permission(inode, mask); +} diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 5a2c3a17d7c..933ac8d5d07 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -112,4 +112,17 @@ struct sysfs_addrm_cxt { #include "../sysfs/sysfs.h" +/* + * inode.c + */ +struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); +void sysfs_evict_inode(struct inode *inode); +int sysfs_permission(struct inode *inode, int mask); +int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int sysfs_inode_init(void); + #endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile index 8876ac18337..6eff6e1205a 100644 --- a/fs/sysfs/Makefile +++ b/fs/sysfs/Makefile @@ -2,4 +2,4 @@ # Makefile for the sysfs virtual filesystem # -obj-y := inode.o file.o dir.o symlink.o mount.o group.o +obj-y := file.o dir.o symlink.o mount.o group.o diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c deleted file mode 100644 index bfe4478f82b..00000000000 --- a/fs/sysfs/inode.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * fs/sysfs/inode.c - basic sysfs inode and dentry operations - * - * Copyright (c) 2001-3 Patrick Mochel - * Copyright (c) 2007 SUSE Linux Products GmbH - * Copyright (c) 2007 Tejun Heo - * - * This file is released under the GPLv2. 
- * - * Please see Documentation/filesystems/sysfs.txt for more information. - */ - -#undef DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sysfs.h" - -static const struct address_space_operations sysfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - -static struct backing_dev_info sysfs_backing_dev_info = { - .name = "sysfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static const struct inode_operations sysfs_inode_operations = { - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -int __init sysfs_inode_init(void) -{ - return bdi_init(&sysfs_backing_dev_info); -} - -static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *attrs; - struct iattr *iattrs; - - attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); - if (!attrs) - return NULL; - iattrs = &attrs->ia_iattr; - - /* assign default attributes */ - iattrs->ia_mode = sd->s_mode; - iattrs->ia_uid = GLOBAL_ROOT_UID; - iattrs->ia_gid = GLOBAL_ROOT_GID; - iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; - - return attrs; -} - -static int __kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) -{ - struct sysfs_inode_attrs *sd_attrs; - struct iattr *iattrs; - unsigned int ia_valid = iattr->ia_valid; - - sd_attrs = sd->s_iattr; - - if (!sd_attrs) { - /* setting attributes for the first time, allocate now */ - sd_attrs = sysfs_init_inode_attrs(sd); - if (!sd_attrs) - return -ENOMEM; - sd->s_iattr = sd_attrs; - } - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; - - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = iattr->ia_atime; - if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = iattr->ia_mtime; - if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = iattr->ia_ctime; - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - iattrs->ia_mode = sd->s_mode = mode; - } - return 0; -} - -/** - * kernfs_setattr - set iattr on a node - * @sd: target node - * @iattr: iattr to set - * - * Returns 0 on success, -errno on failure. 
- */ -int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) -{ - int ret; - - mutex_lock(&sysfs_mutex); - ret = __kernfs_setattr(sd, iattr); - mutex_unlock(&sysfs_mutex); - return ret; -} - -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - struct sysfs_dirent *sd = dentry->d_fsdata; - int error; - - if (!sd) - return -EINVAL; - - mutex_lock(&sysfs_mutex); - error = inode_change_ok(inode, iattr); - if (error) - goto out; - - error = __kernfs_setattr(sd, iattr); - if (error) - goto out; - - /* this ignores size changes */ - setattr_copy(inode, iattr); - -out: - mutex_unlock(&sysfs_mutex); - return error; -} - -static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, - u32 *secdata_len) -{ - struct sysfs_inode_attrs *iattrs; - void *old_secdata; - size_t old_secdata_len; - - if (!sd->s_iattr) { - sd->s_iattr = sysfs_init_inode_attrs(sd); - if (!sd->s_iattr) - return -ENOMEM; - } - - iattrs = sd->s_iattr; - old_secdata = iattrs->ia_secdata; - old_secdata_len = iattrs->ia_secdata_len; - - iattrs->ia_secdata = *secdata; - iattrs->ia_secdata_len = *secdata_len; - - *secdata = old_secdata; - *secdata_len = old_secdata_len; - return 0; -} - -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - void *secdata; - int error; - u32 secdata_len = 0; - - if (!sd) - return -EINVAL; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { - const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(dentry->d_inode, suffix, - value, size, flags); - if (error) - goto out; - error = security_inode_getsecctx(dentry->d_inode, - &secdata, &secdata_len); - if (error) - goto out; - - mutex_lock(&sysfs_mutex); - error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); - mutex_unlock(&sysfs_mutex); - - if (secdata) - security_release_secctx(secdata, secdata_len); - } else - return -EINVAL; -out: - return error; -} - -static inline void set_default_inode_attr(struct inode *inode, umode_t mode) -{ - inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -} - -static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) -{ - inode->i_uid = iattr->ia_uid; - inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; -} - -static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct sysfs_inode_attrs *iattrs = sd->s_iattr; - - inode->i_mode = sd->s_mode; - if (iattrs) { - /* sysfs_dirent has non-default attributes - * get them from persistent copy in sysfs_dirent - */ - set_inode_attr(inode, &iattrs->ia_iattr); - security_inode_notifysecctx(inode, - iattrs->ia_secdata, - iattrs->ia_secdata_len); - } - - if (sysfs_type(sd) == SYSFS_DIR) - set_nlink(inode, sd->s_dir.subdirs + 2); -} - -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct inode *inode = dentry->d_inode; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - generic_fillattr(inode, stat); - return 0; -} - -static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - kernfs_get(sd); - inode->i_private = sd; - inode->i_mapping->a_ops = &sysfs_aops; - inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; - 
inode->i_op = &sysfs_inode_operations; - - set_default_inode_attr(inode, sd->s_mode); - sysfs_refresh_inode(sd, inode); - - /* initialize inode according to type */ - switch (sysfs_type(sd)) { - case SYSFS_DIR: - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - break; - case SYSFS_KOBJ_ATTR: - inode->i_size = sd->s_attr.size; - inode->i_fop = &kernfs_file_operations; - break; - case SYSFS_KOBJ_LINK: - inode->i_op = &sysfs_symlink_inode_operations; - break; - default: - BUG(); - } - - unlock_new_inode(inode); -} - -/** - * sysfs_get_inode - get inode for sysfs_dirent - * @sb: super block - * @sd: sysfs_dirent to allocate inode for - * - * Get inode for @sd. If such inode doesn't exist, a new inode - * is allocated and basics are initialized. New inode is - * returned locked. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * Pointer to allocated inode on success, NULL on failure. - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) -{ - struct inode *inode; - - inode = iget_locked(sb, sd->s_ino); - if (inode && (inode->i_state & I_NEW)) - sysfs_init_inode(sd, inode); - - return inode; -} - -/* - * The sysfs_dirent serves as both an inode and a directory entry for sysfs. - * To prevent the sysfs inode numbers from being freed prematurely we take a - * reference to sysfs_dirent from the sysfs inode. A - * super_operations.evict_inode() implementation is needed to drop that - * reference upon inode destruction. - */ -void sysfs_evict_inode(struct inode *inode) -{ - struct sysfs_dirent *sd = inode->i_private; - - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); - kernfs_put(sd); -} - -int sysfs_permission(struct inode *inode, int mask) -{ - struct sysfs_dirent *sd; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - sd = inode->i_private; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - return generic_permission(inode, mask); -} diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index f8c936f31b3..93c1910783f 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -51,19 +51,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); -/* - * inode.c - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); -void sysfs_evict_inode(struct inode *inode); -int sysfs_permission(struct inode *inode, int mask); -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int sysfs_inode_init(void); - /* * file.c */ -- cgit v1.2.3-70-g09d2 From fd7b9f7b9776b11df629e9dd3865320bf57ce588 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:33 -0500 Subject: sysfs, kernfs: move dir core code to fs/kernfs/dir.c Move core dir code to fs/kernfs/dir.c. fs/sysfs/dir.c now only contains sysfs_warn_dup() and sysfs wrappers around kernfs interfaces. The respective declarations in fs/sysfs/sysfs.h are moved to fs/kernfs/kernfs-internal.h. This is pure relocation. v2: sysfs_symlink_target_lock was mistakenly relocated to kernfs. It should remain with sysfs. Fixed. 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 988 ++++++++++++++++++++++++++++++++++++++++++++ fs/kernfs/kernfs-internal.h | 17 + fs/sysfs/dir.c | 986 ------------------------------------------- fs/sysfs/sysfs.h | 13 - 4 files changed, 1005 insertions(+), 999 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1061602ce81..a4ca4de3cb2 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -7,3 +7,991 @@ * * This file is released under the GPLv2. */ + +#include +#include +#include +#include +#include +#include + +#include "kernfs-internal.h" + +DEFINE_MUTEX(sysfs_mutex); + +#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb) + +static DEFINE_SPINLOCK(sysfs_ino_lock); +static DEFINE_IDA(sysfs_ino_ida); + +/** + * sysfs_name_hash + * @name: Null terminated string to hash + * @ns: Namespace tag to hash + * + * Returns 31 bit hash of ns + name (so it fits in an off_t ) + */ +static unsigned int sysfs_name_hash(const char *name, const void *ns) +{ + unsigned long hash = init_name_hash(); + unsigned int len = strlen(name); + while (len--) + hash = partial_name_hash(*name++, hash); + hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); + hash &= 0x7fffffffU; + /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ + if (hash < 1) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + return hash; +} + +static int sysfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct sysfs_dirent *sd) +{ + if (hash != sd->s_hash) + return hash - sd->s_hash; + if (ns != sd->s_ns) + return ns - sd->s_ns; + return strcmp(name, sd->s_name); +} + +static int sysfs_sd_compare(const struct sysfs_dirent *left, + const struct sysfs_dirent *right) +{ + return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns, + right); +} + +/** + * sysfs_link_sibling - link sysfs_dirent into sibling rbtree + * @sd: sysfs_dirent of interest + * + * Link @sd into its sibling rbtree which starts from + * sd->s_parent->s_dir.children. + * + * Locking: + * mutex_lock(sysfs_mutex) + * + * RETURNS: + * 0 on susccess -EEXIST on failure. + */ +static int sysfs_link_sibling(struct sysfs_dirent *sd) +{ + struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; + struct rb_node *parent = NULL; + + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs++; + + while (*node) { + struct sysfs_dirent *pos; + int result; + + pos = to_sysfs_dirent(*node); + parent = *node; + result = sysfs_sd_compare(sd, pos); + if (result < 0) + node = &pos->s_rb.rb_left; + else if (result > 0) + node = &pos->s_rb.rb_right; + else + return -EEXIST; + } + /* add new node and rebalance the tree */ + rb_link_node(&sd->s_rb, parent, node); + rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); + return 0; +} + +/** + * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree + * @sd: sysfs_dirent of interest + * + * Unlink @sd from its sibling rbtree which starts from + * sd->s_parent->s_dir.children. + * + * Locking: + * mutex_lock(sysfs_mutex) + */ +static void sysfs_unlink_sibling(struct sysfs_dirent *sd) +{ + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs--; + + rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); +} + +/** + * sysfs_get_active - get an active reference to sysfs_dirent + * @sd: sysfs_dirent to get an active reference to + * + * Get an active reference of @sd. This function is noop if @sd + * is NULL. + * + * RETURNS: + * Pointer to @sd on success, NULL on failure. 
+ */ +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) +{ + if (unlikely(!sd)) + return NULL; + + if (!atomic_inc_unless_negative(&sd->s_active)) + return NULL; + + if (sd->s_flags & SYSFS_FLAG_LOCKDEP) + rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); + return sd; +} + +/** + * sysfs_put_active - put an active reference to sysfs_dirent + * @sd: sysfs_dirent to put an active reference to + * + * Put an active reference to @sd. This function is noop if @sd + * is NULL. + */ +void sysfs_put_active(struct sysfs_dirent *sd) +{ + int v; + + if (unlikely(!sd)) + return; + + if (sd->s_flags & SYSFS_FLAG_LOCKDEP) + rwsem_release(&sd->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&sd->s_active); + if (likely(v != SD_DEACTIVATED_BIAS)) + return; + + /* atomic_dec_return() is a mb(), we'll always see the updated + * sd->u.completion. + */ + complete(sd->u.completion); +} + +/** + * sysfs_deactivate - deactivate sysfs_dirent + * @sd: sysfs_dirent to deactivate + * + * Deny new active references and drain existing ones. + */ +static void sysfs_deactivate(struct sysfs_dirent *sd) +{ + DECLARE_COMPLETION_ONSTACK(wait); + int v; + + BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); + + if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) + return; + + sd->u.completion = (void *)&wait; + + rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); + /* atomic_add_return() is a mb(), put_active() will always see + * the updated sd->u.completion. + */ + v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); + + if (v != SD_DEACTIVATED_BIAS) { + lock_contended(&sd->dep_map, _RET_IP_); + wait_for_completion(&wait); + } + + lock_acquired(&sd->dep_map, _RET_IP_); + rwsem_release(&sd->dep_map, 1, _RET_IP_); +} + +static int sysfs_alloc_ino(unsigned int *pino) +{ + int ino, rc; + + retry: + spin_lock(&sysfs_ino_lock); + rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); + spin_unlock(&sysfs_ino_lock); + + if (rc == -EAGAIN) { + if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) + goto retry; + rc = -ENOMEM; + } + + *pino = ino; + return rc; +} + +static void sysfs_free_ino(unsigned int ino) +{ + spin_lock(&sysfs_ino_lock); + ida_remove(&sysfs_ino_ida, ino); + spin_unlock(&sysfs_ino_lock); +} + +/** + * kernfs_get - get a reference count on a sysfs_dirent + * @sd: the target sysfs_dirent + */ +void kernfs_get(struct sysfs_dirent *sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } +} +EXPORT_SYMBOL_GPL(kernfs_get); + +/** + * kernfs_put - put a reference count on a sysfs_dirent + * @sd: the target sysfs_dirent + * + * Put a reference count of @sd and destroy it if it reached zero. + */ +void kernfs_put(struct sysfs_dirent *sd) +{ + struct sysfs_dirent *parent_sd; + + if (!sd || !atomic_dec_and_test(&sd->s_count)) + return; + repeat: + /* Moving/renaming is always done while holding reference. + * sd->s_parent won't change beneath us. + */ + parent_sd = sd->s_parent; + + WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), + "sysfs: free using entry: %s/%s\n", + parent_sd ? 
parent_sd->s_name : "", sd->s_name); + + if (sysfs_type(sd) == SYSFS_KOBJ_LINK) + kernfs_put(sd->s_symlink.target_sd); + if (sysfs_type(sd) & SYSFS_COPY_NAME) + kfree(sd->s_name); + if (sd->s_iattr && sd->s_iattr->ia_secdata) + security_release_secctx(sd->s_iattr->ia_secdata, + sd->s_iattr->ia_secdata_len); + kfree(sd->s_iattr); + sysfs_free_ino(sd->s_ino); + kmem_cache_free(sysfs_dir_cachep, sd); + + sd = parent_sd; + if (sd && atomic_dec_and_test(&sd->s_count)) + goto repeat; +} +EXPORT_SYMBOL_GPL(kernfs_put); + +static int sysfs_dentry_delete(const struct dentry *dentry) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED)); +} + +static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct sysfs_dirent *sd; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + sd = dentry->d_fsdata; + mutex_lock(&sysfs_mutex); + + /* The sysfs dirent has been deleted */ + if (sd->s_flags & SYSFS_FLAG_REMOVED) + goto out_bad; + + /* The sysfs dirent has been moved? */ + if (dentry->d_parent->d_fsdata != sd->s_parent) + goto out_bad; + + /* The sysfs dirent has been renamed */ + if (strcmp(dentry->d_name.name, sd->s_name) != 0) + goto out_bad; + + /* The sysfs dirent has been moved to a different namespace */ + if (sd->s_parent && (sd->s_parent->s_flags & SYSFS_FLAG_NS) && + sysfs_info(dentry->d_sb)->ns != sd->s_ns) + goto out_bad; + + mutex_unlock(&sysfs_mutex); +out_valid: + return 1; +out_bad: + /* Remove the dentry from the dcache hashes. + * If this is a deleted dentry we use d_drop instead of d_delete + * so sysfs doesn't need to cope with negative dentries. + * + * If this is a dentry that has simply been renamed we + * use d_drop to remove it from the dcache lookup on its + * old parent. If this dentry persists later when a lookup + * is performed at its new name the dentry will be readded + * to the dcache hashes. + */ + mutex_unlock(&sysfs_mutex); + + /* If we have submounts we must allow the vfs caches + * to lie about the state of the filesystem to prevent + * leaks and other nasty things. + */ + if (check_submounts_and_drop(dentry) != 0) + goto out_valid; + + return 0; +} + +static void sysfs_dentry_release(struct dentry *dentry) +{ + kernfs_put(dentry->d_fsdata); +} + +const struct dentry_operations sysfs_dentry_ops = { + .d_revalidate = sysfs_dentry_revalidate, + .d_delete = sysfs_dentry_delete, + .d_release = sysfs_dentry_release, +}; + +struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) +{ + char *dup_name = NULL; + struct sysfs_dirent *sd; + + if (type & SYSFS_COPY_NAME) { + name = dup_name = kstrdup(name, GFP_KERNEL); + if (!name) + return NULL; + } + + sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); + if (!sd) + goto err_out1; + + if (sysfs_alloc_ino(&sd->s_ino)) + goto err_out2; + + atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_active, 0); + + sd->s_name = name; + sd->s_mode = mode; + sd->s_flags = type | SYSFS_FLAG_REMOVED; + + return sd; + + err_out2: + kmem_cache_free(sysfs_dir_cachep, sd); + err_out1: + kfree(dup_name); + return NULL; +} + +/** + * sysfs_addrm_start - prepare for sysfs_dirent add/remove + * @acxt: pointer to sysfs_addrm_cxt to be used + * + * This function is called when the caller is about to add or remove + * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used + * to keep and pass context to other addrm functions. + * + * LOCKING: + * Kernel thread context (may sleep). sysfs_mutex is locked on + * return. 
+ */ +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) + __acquires(sysfs_mutex) +{ + memset(acxt, 0, sizeof(*acxt)); + + mutex_lock(&sysfs_mutex); +} + +/** + * sysfs_add_one - add sysfs_dirent to parent without warning + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * @parent_sd: the parent sysfs_dirent to add @sd to + * + * Get @parent_sd and set @sd->s_parent to it and increment nlink of + * the parent inode if @sd is a directory and link into the children + * list of the parent. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd) +{ + bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS; + struct sysfs_inode_attrs *ps_iattr; + int ret; + + if (has_ns != (bool)sd->s_ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", + parent_sd->s_name, sd->s_name); + return -EINVAL; + } + + if (sysfs_type(parent_sd) != SYSFS_DIR) + return -EINVAL; + + sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); + sd->s_parent = parent_sd; + kernfs_get(parent_sd); + + ret = sysfs_link_sibling(sd); + if (ret) + return ret; + + /* Update timestamps on the parent */ + ps_iattr = parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + /* Mark the entry added into directory tree */ + sd->s_flags &= ~SYSFS_FLAG_REMOVED; + + return 0; +} + +/** + * sysfs_remove_one - remove sysfs_dirent from parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be removed + * + * Mark @sd removed and drop nlink of parent inode if @sd is a + * directory. @sd is unlinked from the children list. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + */ +static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *ps_iattr; + + /* + * Removal can be called multiple times on the same node. Only the + * first invocation is effective and puts the base ref. + */ + if (sd->s_flags & SYSFS_FLAG_REMOVED) + return; + + sysfs_unlink_sibling(sd); + + /* Update timestamps on the parent */ + ps_iattr = sd->s_parent->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + sd->s_flags |= SYSFS_FLAG_REMOVED; + sd->u.removed_list = acxt->removed; + acxt->removed = sd; +} + +/** + * sysfs_addrm_finish - finish up sysfs_dirent add/remove + * @acxt: addrm context to finish up + * + * Finish up sysfs_dirent add/remove. Resources acquired by + * sysfs_addrm_start() are released and removed sysfs_dirents are + * cleaned up. + * + * LOCKING: + * sysfs_mutex is released. 
+ */ +void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) + __releases(sysfs_mutex) +{ + /* release resources acquired by sysfs_addrm_start() */ + mutex_unlock(&sysfs_mutex); + + /* kill removed sysfs_dirents */ + while (acxt->removed) { + struct sysfs_dirent *sd = acxt->removed; + + acxt->removed = sd->u.removed_list; + + sysfs_deactivate(sd); + sysfs_unmap_bin_file(sd); + kernfs_put(sd); + } +} + +/** + * kernfs_find_ns - find sysfs_dirent with the given name + * @parent: sysfs_dirent to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for sysfs_dirent with name @name under @parent. Returns pointer to + * the found sysfs_dirent on success, %NULL on failure. + */ +static struct sysfs_dirent *kernfs_find_ns(struct sysfs_dirent *parent, + const unsigned char *name, + const void *ns) +{ + struct rb_node *node = parent->s_dir.children.rb_node; + bool has_ns = parent->s_flags & SYSFS_FLAG_NS; + unsigned int hash; + + lockdep_assert_held(&sysfs_mutex); + + if (has_ns != (bool)ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", + parent->s_name, name); + return NULL; + } + + hash = sysfs_name_hash(name, ns); + while (node) { + struct sysfs_dirent *sd; + int result; + + sd = to_sysfs_dirent(node); + result = sysfs_name_compare(hash, name, ns, sd); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return sd; + } + return NULL; +} + +/** + * kernfs_find_and_get_ns - find and get sysfs_dirent with the given name + * @parent: sysfs_dirent to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for sysfs_dirent with name @name under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * sysfs_dirent on success, %NULL on failure. + */ +struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, + const char *name, const void *ns) +{ + struct sysfs_dirent *sd; + + mutex_lock(&sysfs_mutex); + sd = kernfs_find_ns(parent, name, ns); + kernfs_get(sd); + mutex_unlock(&sysfs_mutex); + + return sd; +} +EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); + +/** + * kernfs_create_dir_ns - create a directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * @priv: opaque data associated with the new directory + * @ns: optional namespace tag of the directory + * + * Returns the created node on success, ERR_PTR() value on failure. 
+ */ +struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, + const char *name, void *priv, + const void *ns) +{ + umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + int rc; + + /* allocate */ + sd = sysfs_new_dirent(name, mode, SYSFS_DIR); + if (!sd) + return ERR_PTR(-ENOMEM); + + sd->s_ns = ns; + sd->priv = priv; + + /* link in */ + sysfs_addrm_start(&acxt); + rc = sysfs_add_one(&acxt, sd, parent); + sysfs_addrm_finish(&acxt); + + if (!rc) + return sd; + + kernfs_put(sd); + return ERR_PTR(rc); +} + +static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct dentry *ret = NULL; + struct dentry *parent = dentry->d_parent; + struct sysfs_dirent *parent_sd = parent->d_fsdata; + struct sysfs_dirent *sd; + struct inode *inode; + const void *ns = NULL; + + mutex_lock(&sysfs_mutex); + + if (parent_sd->s_flags & SYSFS_FLAG_NS) + ns = sysfs_info(dir->i_sb)->ns; + + sd = kernfs_find_ns(parent_sd, dentry->d_name.name, ns); + + /* no such entry */ + if (!sd) { + ret = ERR_PTR(-ENOENT); + goto out_unlock; + } + kernfs_get(sd); + dentry->d_fsdata = sd; + + /* attach dentry and inode */ + inode = sysfs_get_inode(dir->i_sb, sd); + if (!inode) { + ret = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + /* instantiate and hash dentry */ + ret = d_materialise_unique(dentry, inode); + out_unlock: + mutex_unlock(&sysfs_mutex); + return ret; +} + +const struct inode_operations sysfs_dir_inode_operations = { + .lookup = sysfs_lookup, + .permission = sysfs_permission, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .setxattr = sysfs_setxattr, +}; + +static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) +{ + struct sysfs_dirent *last; + + while (true) { + struct rb_node *rbn; + + last = pos; + + if (sysfs_type(pos) != SYSFS_DIR) + break; + + rbn = rb_first(&pos->s_dir.children); + if (!rbn) + break; + + pos = to_sysfs_dirent(rbn); + } + + return last; +} + +/** + * sysfs_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: sysfs_dirent whose descendants to walk + * + * Find the next descendant to visit for post-order traversal of @root's + * descendants. @root is included in the iteration and the last node to be + * visited. + */ +static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, + struct sysfs_dirent *root) +{ + struct rb_node *rbn; + + lockdep_assert_held(&sysfs_mutex); + + /* if first iteration, visit leftmost descendant which may be root */ + if (!pos) + return sysfs_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + + /* if there's an unvisited sibling, visit its leftmost descendant */ + rbn = rb_next(&pos->s_rb); + if (rbn) + return sysfs_leftmost_descendant(to_sysfs_dirent(rbn)); + + /* no sibling left, visit parent */ + return pos->s_parent; +} + +static void __kernfs_remove(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *sd) +{ + struct sysfs_dirent *pos, *next; + + if (!sd) + return; + + pr_debug("sysfs %s: removing\n", sd->s_name); + + next = NULL; + do { + pos = next; + next = sysfs_next_descendant_post(pos, sd); + if (pos) + sysfs_remove_one(acxt, pos); + } while (next); +} + +/** + * kernfs_remove - remove a sysfs_dirent recursively + * @sd: the sysfs_dirent to remove + * + * Remove @sd along with all its subdirectories and files. 
+ */ +void kernfs_remove(struct sysfs_dirent *sd) +{ + struct sysfs_addrm_cxt acxt; + + sysfs_addrm_start(&acxt); + __kernfs_remove(&acxt, sd); + sysfs_addrm_finish(&acxt); +} + +/** + * kernfs_remove_by_name_ns - find a sysfs_dirent by name and remove it + * @dir_sd: parent of the target + * @name: name of the sysfs_dirent to remove + * @ns: namespace tag of the sysfs_dirent to remove + * + * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove + * it. Returns 0 on success, -ENOENT if such entry doesn't exist. + */ +int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name, + const void *ns) +{ + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + + if (!dir_sd) { + WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", + name); + return -ENOENT; + } + + sysfs_addrm_start(&acxt); + + sd = kernfs_find_ns(dir_sd, name, ns); + if (sd) + __kernfs_remove(&acxt, sd); + + sysfs_addrm_finish(&acxt); + + if (sd) + return 0; + else + return -ENOENT; +} + +/** + * kernfs_rename_ns - move and rename a kernfs_node + * @sd: target node + * @new_parent: new parent to put @sd under + * @new_name: new name + * @new_ns: new namespace tag + */ +int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, + const char *new_name, const void *new_ns) +{ + int error; + + mutex_lock(&sysfs_mutex); + + error = 0; + if ((sd->s_parent == new_parent) && (sd->s_ns == new_ns) && + (strcmp(sd->s_name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (kernfs_find_ns(new_parent, new_name, new_ns)) + goto out; + + /* rename sysfs_dirent */ + if (strcmp(sd->s_name, new_name) != 0) { + error = -ENOMEM; + new_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + + kfree(sd->s_name); + sd->s_name = new_name; + } + + /* + * Move to the appropriate place in the appropriate directories rbtree. + */ + sysfs_unlink_sibling(sd); + kernfs_get(new_parent); + kernfs_put(sd->s_parent); + sd->s_ns = new_ns; + sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); + sd->s_parent = new_parent; + sysfs_link_sibling(sd); + + error = 0; + out: + mutex_unlock(&sysfs_mutex); + return error; +} + +/** + * kernfs_enable_ns - enable namespace under a directory + * @sd: directory of interest, should be empty + * + * This is to be called right after @sd is created to enable namespace + * under it. All children of @sd must have non-NULL namespace tags and + * only the ones which match the super_block's tag will be visible. 
+ */ +void kernfs_enable_ns(struct sysfs_dirent *sd) +{ + WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children)); + sd->s_flags |= SYSFS_FLAG_NS; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct sysfs_dirent *sd) +{ + return (sd->s_mode >> 12) & 15; +} + +static int sysfs_dir_release(struct inode *inode, struct file *filp) +{ + kernfs_put(filp->private_data); + return 0; +} + +static struct sysfs_dirent *sysfs_dir_pos(const void *ns, + struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos) +{ + if (pos) { + int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && + pos->s_parent == parent_sd && + hash == pos->s_hash; + kernfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (hash > 1) && (hash < INT_MAX)) { + struct rb_node *node = parent_sd->s_dir.children.rb_node; + while (node) { + pos = to_sysfs_dirent(node); + + if (hash < pos->s_hash) + node = node->rb_left; + else if (hash > pos->s_hash) + node = node->rb_right; + else + break; + } + } + /* Skip over entries in the wrong namespace */ + while (pos && pos->s_ns != ns) { + struct rb_node *node = rb_next(&pos->s_rb); + if (!node) + pos = NULL; + else + pos = to_sysfs_dirent(node); + } + return pos; +} + +static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) +{ + pos = sysfs_dir_pos(ns, parent_sd, ino, pos); + if (pos) + do { + struct rb_node *node = rb_next(&pos->s_rb); + if (!node) + pos = NULL; + else + pos = to_sysfs_dirent(node); + } while (pos && pos->s_ns != ns); + return pos; +} + +static int sysfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct sysfs_dirent *parent_sd = dentry->d_fsdata; + struct sysfs_dirent *pos = file->private_data; + const void *ns = NULL; + + if (!dir_emit_dots(file, ctx)) + return 0; + mutex_lock(&sysfs_mutex); + + if (parent_sd->s_flags & SYSFS_FLAG_NS) + ns = sysfs_info(dentry->d_sb)->ns; + + for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); + pos; + pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { + const char *name = pos->s_name; + unsigned int type = dt_type(pos); + int len = strlen(name); + ino_t ino = pos->s_ino; + + ctx->pos = pos->s_hash; + file->private_data = pos; + kernfs_get(pos); + + mutex_unlock(&sysfs_mutex); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; + mutex_lock(&sysfs_mutex); + } + mutex_unlock(&sysfs_mutex); + file->private_data = NULL; + ctx->pos = INT_MAX; + return 0; +} + +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +const struct file_operations sysfs_dir_operations = { + .read = generic_read_dir, + .iterate = sysfs_readdir, + .release = sysfs_dir_release, + .llseek = sysfs_dir_llseek, +}; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 933ac8d5d07..31f0dbe1881 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -125,4 +126,20 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int sysfs_inode_init(void); +/* + * dir.c + */ +extern struct mutex sysfs_mutex; +extern const struct dentry_operations 
sysfs_dentry_ops; +extern const struct file_operations sysfs_dir_operations; +extern const struct inode_operations sysfs_dir_inode_operations; + +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); +void sysfs_put_active(struct sysfs_dirent *sd); +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, + struct sysfs_dirent *parent_sd); +void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); +struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); + #endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0d806efcc9a..e5c4e711805 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -13,468 +13,12 @@ #undef DEBUG #include -#include -#include #include -#include -#include -#include -#include #include -#include -#include #include "sysfs.h" -DEFINE_MUTEX(sysfs_mutex); DEFINE_SPINLOCK(sysfs_symlink_target_lock); -#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb) - -static DEFINE_SPINLOCK(sysfs_ino_lock); -static DEFINE_IDA(sysfs_ino_ida); - -/** - * sysfs_name_hash - * @name: Null terminated string to hash - * @ns: Namespace tag to hash - * - * Returns 31 bit hash of ns + name (so it fits in an off_t ) - */ -static unsigned int sysfs_name_hash(const char *name, const void *ns) -{ - unsigned long hash = init_name_hash(); - unsigned int len = strlen(name); - while (len--) - hash = partial_name_hash(*name++, hash); - hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); - hash &= 0x7fffffffU; - /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ - if (hash < 1) - hash += 2; - if (hash >= INT_MAX) - hash = INT_MAX - 1; - return hash; -} - -static int sysfs_name_compare(unsigned int hash, const char *name, - const void *ns, const struct sysfs_dirent *sd) -{ - if (hash != sd->s_hash) - return hash - sd->s_hash; - if (ns != sd->s_ns) - return ns - sd->s_ns; - return strcmp(name, sd->s_name); -} - -static int sysfs_sd_compare(const struct sysfs_dirent *left, - const struct sysfs_dirent *right) -{ - return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns, - right); -} - -/** - * sysfs_link_sibling - link sysfs_dirent into sibling rbtree - * @sd: sysfs_dirent of interest - * - * Link @sd into its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * 0 on susccess -EEXIST on failure. - */ -static int sysfs_link_sibling(struct sysfs_dirent *sd) -{ - struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; - struct rb_node *parent = NULL; - - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs++; - - while (*node) { - struct sysfs_dirent *pos; - int result; - - pos = to_sysfs_dirent(*node); - parent = *node; - result = sysfs_sd_compare(sd, pos); - if (result < 0) - node = &pos->s_rb.rb_left; - else if (result > 0) - node = &pos->s_rb.rb_right; - else - return -EEXIST; - } - /* add new node and rebalance the tree */ - rb_link_node(&sd->s_rb, parent, node); - rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); - return 0; -} - -/** - * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree - * @sd: sysfs_dirent of interest - * - * Unlink @sd from its sibling rbtree which starts from - * sd->s_parent->s_dir.children. 
- * - * Locking: - * mutex_lock(sysfs_mutex) - */ -static void sysfs_unlink_sibling(struct sysfs_dirent *sd) -{ - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs--; - - rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); -} - -/** - * sysfs_get_active - get an active reference to sysfs_dirent - * @sd: sysfs_dirent to get an active reference to - * - * Get an active reference of @sd. This function is noop if @sd - * is NULL. - * - * RETURNS: - * Pointer to @sd on success, NULL on failure. - */ -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) -{ - if (unlikely(!sd)) - return NULL; - - if (!atomic_inc_unless_negative(&sd->s_active)) - return NULL; - - if (sd->s_flags & SYSFS_FLAG_LOCKDEP) - rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); - return sd; -} - -/** - * sysfs_put_active - put an active reference to sysfs_dirent - * @sd: sysfs_dirent to put an active reference to - * - * Put an active reference to @sd. This function is noop if @sd - * is NULL. - */ -void sysfs_put_active(struct sysfs_dirent *sd) -{ - int v; - - if (unlikely(!sd)) - return; - - if (sd->s_flags & SYSFS_FLAG_LOCKDEP) - rwsem_release(&sd->dep_map, 1, _RET_IP_); - v = atomic_dec_return(&sd->s_active); - if (likely(v != SD_DEACTIVATED_BIAS)) - return; - - /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->u.completion. - */ - complete(sd->u.completion); -} - -/** - * sysfs_deactivate - deactivate sysfs_dirent - * @sd: sysfs_dirent to deactivate - * - * Deny new active references and drain existing ones. - */ -static void sysfs_deactivate(struct sysfs_dirent *sd) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int v; - - BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); - - if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) - return; - - sd->u.completion = (void *)&wait; - - rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); - /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->u.completion. - */ - v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); - - if (v != SD_DEACTIVATED_BIAS) { - lock_contended(&sd->dep_map, _RET_IP_); - wait_for_completion(&wait); - } - - lock_acquired(&sd->dep_map, _RET_IP_); - rwsem_release(&sd->dep_map, 1, _RET_IP_); -} - -static int sysfs_alloc_ino(unsigned int *pino) -{ - int ino, rc; - - retry: - spin_lock(&sysfs_ino_lock); - rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); - spin_unlock(&sysfs_ino_lock); - - if (rc == -EAGAIN) { - if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) - goto retry; - rc = -ENOMEM; - } - - *pino = ino; - return rc; -} - -static void sysfs_free_ino(unsigned int ino) -{ - spin_lock(&sysfs_ino_lock); - ida_remove(&sysfs_ino_ida, ino); - spin_unlock(&sysfs_ino_lock); -} - -/** - * kernfs_get - get a reference count on a sysfs_dirent - * @sd: the target sysfs_dirent - */ -void kernfs_get(struct sysfs_dirent *sd) -{ - if (sd) { - WARN_ON(!atomic_read(&sd->s_count)); - atomic_inc(&sd->s_count); - } -} -EXPORT_SYMBOL_GPL(kernfs_get); - -/** - * kernfs_put - put a reference count on a sysfs_dirent - * @sd: the target sysfs_dirent - * - * Put a reference count of @sd and destroy it if it reached zero. - */ -void kernfs_put(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *parent_sd; - - if (!sd || !atomic_dec_and_test(&sd->s_count)) - return; - repeat: - /* Moving/renaming is always done while holding reference. - * sd->s_parent won't change beneath us. - */ - parent_sd = sd->s_parent; - - WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), - "sysfs: free using entry: %s/%s\n", - parent_sd ? 
parent_sd->s_name : "", sd->s_name); - - if (sysfs_type(sd) == SYSFS_KOBJ_LINK) - kernfs_put(sd->s_symlink.target_sd); - if (sysfs_type(sd) & SYSFS_COPY_NAME) - kfree(sd->s_name); - if (sd->s_iattr && sd->s_iattr->ia_secdata) - security_release_secctx(sd->s_iattr->ia_secdata, - sd->s_iattr->ia_secdata_len); - kfree(sd->s_iattr); - sysfs_free_ino(sd->s_ino); - kmem_cache_free(sysfs_dir_cachep, sd); - - sd = parent_sd; - if (sd && atomic_dec_and_test(&sd->s_count)) - goto repeat; -} -EXPORT_SYMBOL_GPL(kernfs_put); - -static int sysfs_dentry_delete(const struct dentry *dentry) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED)); -} - -static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct sysfs_dirent *sd; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - sd = dentry->d_fsdata; - mutex_lock(&sysfs_mutex); - - /* The sysfs dirent has been deleted */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - goto out_bad; - - /* The sysfs dirent has been moved? */ - if (dentry->d_parent->d_fsdata != sd->s_parent) - goto out_bad; - - /* The sysfs dirent has been renamed */ - if (strcmp(dentry->d_name.name, sd->s_name) != 0) - goto out_bad; - - /* The sysfs dirent has been moved to a different namespace */ - if (sd->s_parent && (sd->s_parent->s_flags & SYSFS_FLAG_NS) && - sysfs_info(dentry->d_sb)->ns != sd->s_ns) - goto out_bad; - - mutex_unlock(&sysfs_mutex); -out_valid: - return 1; -out_bad: - /* Remove the dentry from the dcache hashes. - * If this is a deleted dentry we use d_drop instead of d_delete - * so sysfs doesn't need to cope with negative dentries. - * - * If this is a dentry that has simply been renamed we - * use d_drop to remove it from the dcache lookup on its - * old parent. If this dentry persists later when a lookup - * is performed at its new name the dentry will be readded - * to the dcache hashes. - */ - mutex_unlock(&sysfs_mutex); - - /* If we have submounts we must allow the vfs caches - * to lie about the state of the filesystem to prevent - * leaks and other nasty things. - */ - if (check_submounts_and_drop(dentry) != 0) - goto out_valid; - - return 0; -} - -static void sysfs_dentry_release(struct dentry *dentry) -{ - kernfs_put(dentry->d_fsdata); -} - -const struct dentry_operations sysfs_dentry_ops = { - .d_revalidate = sysfs_dentry_revalidate, - .d_delete = sysfs_dentry_delete, - .d_release = sysfs_dentry_release, -}; - -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) -{ - char *dup_name = NULL; - struct sysfs_dirent *sd; - - if (type & SYSFS_COPY_NAME) { - name = dup_name = kstrdup(name, GFP_KERNEL); - if (!name) - return NULL; - } - - sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); - if (!sd) - goto err_out1; - - if (sysfs_alloc_ino(&sd->s_ino)) - goto err_out2; - - atomic_set(&sd->s_count, 1); - atomic_set(&sd->s_active, 0); - - sd->s_name = name; - sd->s_mode = mode; - sd->s_flags = type | SYSFS_FLAG_REMOVED; - - return sd; - - err_out2: - kmem_cache_free(sysfs_dir_cachep, sd); - err_out1: - kfree(dup_name); - return NULL; -} - -/** - * sysfs_addrm_start - prepare for sysfs_dirent add/remove - * @acxt: pointer to sysfs_addrm_cxt to be used - * - * This function is called when the caller is about to add or remove - * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used - * to keep and pass context to other addrm functions. - * - * LOCKING: - * Kernel thread context (may sleep). sysfs_mutex is locked on - * return. 
- */ -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) - __acquires(sysfs_mutex) -{ - memset(acxt, 0, sizeof(*acxt)); - - mutex_lock(&sysfs_mutex); -} - -/** - * sysfs_add_one - add sysfs_dirent to parent without warning - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to - * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children - * list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) -{ - bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS; - struct sysfs_inode_attrs *ps_iattr; - int ret; - - if (has_ns != (bool)sd->s_ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", - parent_sd->s_name, sd->s_name); - return -EINVAL; - } - - if (sysfs_type(parent_sd) != SYSFS_DIR) - return -EINVAL; - - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = parent_sd; - kernfs_get(parent_sd); - - ret = sysfs_link_sibling(sd); - if (ret) - return ret; - - /* Update timestamps on the parent */ - ps_iattr = parent_sd->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - /* Mark the entry added into directory tree */ - sd->s_flags &= ~SYSFS_FLAG_REMOVED; - - return 0; -} - /** * sysfs_pathname - return full path to sysfs dirent * @sd: sysfs_dirent whose path we want @@ -510,181 +54,6 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) kfree(path); } -/** - * sysfs_remove_one - remove sysfs_dirent from parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be removed - * - * Mark @sd removed and drop nlink of parent inode if @sd is a - * directory. @sd is unlinked from the children list. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - */ -static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *ps_iattr; - - /* - * Removal can be called multiple times on the same node. Only the - * first invocation is effective and puts the base ref. - */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - return; - - sysfs_unlink_sibling(sd); - - /* Update timestamps on the parent */ - ps_iattr = sd->s_parent->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->u.removed_list = acxt->removed; - acxt->removed = sd; -} - -/** - * sysfs_addrm_finish - finish up sysfs_dirent add/remove - * @acxt: addrm context to finish up - * - * Finish up sysfs_dirent add/remove. Resources acquired by - * sysfs_addrm_start() are released and removed sysfs_dirents are - * cleaned up. - * - * LOCKING: - * sysfs_mutex is released. 
- */ -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) - __releases(sysfs_mutex) -{ - /* release resources acquired by sysfs_addrm_start() */ - mutex_unlock(&sysfs_mutex); - - /* kill removed sysfs_dirents */ - while (acxt->removed) { - struct sysfs_dirent *sd = acxt->removed; - - acxt->removed = sd->u.removed_list; - - sysfs_deactivate(sd); - sysfs_unmap_bin_file(sd); - kernfs_put(sd); - } -} - -/** - * kernfs_find_ns - find sysfs_dirent with the given name - * @parent: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent. Returns pointer to - * the found sysfs_dirent on success, %NULL on failure. - */ -static struct sysfs_dirent *kernfs_find_ns(struct sysfs_dirent *parent, - const unsigned char *name, - const void *ns) -{ - struct rb_node *node = parent->s_dir.children.rb_node; - bool has_ns = parent->s_flags & SYSFS_FLAG_NS; - unsigned int hash; - - lockdep_assert_held(&sysfs_mutex); - - if (has_ns != (bool)ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", - parent->s_name, name); - return NULL; - } - - hash = sysfs_name_hash(name, ns); - while (node) { - struct sysfs_dirent *sd; - int result; - - sd = to_sysfs_dirent(node); - result = sysfs_name_compare(hash, name, ns, sd); - if (result < 0) - node = node->rb_left; - else if (result > 0) - node = node->rb_right; - else - return sd; - } - return NULL; -} - -/** - * kernfs_find_and_get_ns - find and get sysfs_dirent with the given name - * @parent: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent and get a reference - * if found. This function may sleep and returns pointer to the found - * sysfs_dirent on success, %NULL on failure. - */ -struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, - const char *name, const void *ns) -{ - struct sysfs_dirent *sd; - - mutex_lock(&sysfs_mutex); - sd = kernfs_find_ns(parent, name, ns); - kernfs_get(sd); - mutex_unlock(&sysfs_mutex); - - return sd; -} -EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); - -/** - * kernfs_create_dir_ns - create a directory - * @parent: parent in which to create a new directory - * @name: name of the new directory - * @priv: opaque data associated with the new directory - * @ns: optional namespace tag of the directory - * - * Returns the created node on success, ERR_PTR() value on failure. 
- */ -struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, - const char *name, void *priv, - const void *ns) -{ - umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - /* allocate */ - sd = sysfs_new_dirent(name, mode, SYSFS_DIR); - if (!sd) - return ERR_PTR(-ENOMEM); - - sd->s_ns = ns; - sd->priv = priv; - - /* link in */ - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, parent); - sysfs_addrm_finish(&acxt); - - if (!rc) - return sd; - - kernfs_put(sd); - return ERR_PTR(rc); -} - /** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for @@ -715,177 +84,6 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) return 0; } -static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *ret = NULL; - struct dentry *parent = dentry->d_parent; - struct sysfs_dirent *parent_sd = parent->d_fsdata; - struct sysfs_dirent *sd; - struct inode *inode; - const void *ns = NULL; - - mutex_lock(&sysfs_mutex); - - if (parent_sd->s_flags & SYSFS_FLAG_NS) - ns = sysfs_info(dir->i_sb)->ns; - - sd = kernfs_find_ns(parent_sd, dentry->d_name.name, ns); - - /* no such entry */ - if (!sd) { - ret = ERR_PTR(-ENOENT); - goto out_unlock; - } - kernfs_get(sd); - dentry->d_fsdata = sd; - - /* attach dentry and inode */ - inode = sysfs_get_inode(dir->i_sb, sd); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; - } - - /* instantiate and hash dentry */ - ret = d_materialise_unique(dentry, inode); - out_unlock: - mutex_unlock(&sysfs_mutex); - return ret; -} - -const struct inode_operations sysfs_dir_inode_operations = { - .lookup = sysfs_lookup, - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) -{ - struct sysfs_dirent *last; - - while (true) { - struct rb_node *rbn; - - last = pos; - - if (sysfs_type(pos) != SYSFS_DIR) - break; - - rbn = rb_first(&pos->s_dir.children); - if (!rbn) - break; - - pos = to_sysfs_dirent(rbn); - } - - return last; -} - -/** - * sysfs_next_descendant_post - find the next descendant for post-order walk - * @pos: the current position (%NULL to initiate traversal) - * @root: sysfs_dirent whose descendants to walk - * - * Find the next descendant to visit for post-order traversal of @root's - * descendants. @root is included in the iteration and the last node to be - * visited. 
- */ -static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, - struct sysfs_dirent *root) -{ - struct rb_node *rbn; - - lockdep_assert_held(&sysfs_mutex); - - /* if first iteration, visit leftmost descendant which may be root */ - if (!pos) - return sysfs_leftmost_descendant(root); - - /* if we visited @root, we're done */ - if (pos == root) - return NULL; - - /* if there's an unvisited sibling, visit its leftmost descendant */ - rbn = rb_next(&pos->s_rb); - if (rbn) - return sysfs_leftmost_descendant(to_sysfs_dirent(rbn)); - - /* no sibling left, visit parent */ - return pos->s_parent; -} - -static void __kernfs_remove(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) -{ - struct sysfs_dirent *pos, *next; - - if (!sd) - return; - - pr_debug("sysfs %s: removing\n", sd->s_name); - - next = NULL; - do { - pos = next; - next = sysfs_next_descendant_post(pos, sd); - if (pos) - sysfs_remove_one(acxt, pos); - } while (next); -} - -/** - * kernfs_remove - remove a sysfs_dirent recursively - * @sd: the sysfs_dirent to remove - * - * Remove @sd along with all its subdirectories and files. - */ -void kernfs_remove(struct sysfs_dirent *sd) -{ - struct sysfs_addrm_cxt acxt; - - sysfs_addrm_start(&acxt); - __kernfs_remove(&acxt, sd); - sysfs_addrm_finish(&acxt); -} - -/** - * kernfs_remove_by_name_ns - find a sysfs_dirent by name and remove it - * @dir_sd: parent of the target - * @name: name of the sysfs_dirent to remove - * @ns: namespace tag of the sysfs_dirent to remove - * - * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove - * it. Returns 0 on success, -ENOENT if such entry doesn't exist. - */ -int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name, - const void *ns) -{ - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - - if (!dir_sd) { - WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", - name); - return -ENOENT; - } - - sysfs_addrm_start(&acxt); - - sd = kernfs_find_ns(dir_sd, name, ns); - if (sd) - __kernfs_remove(&acxt, sd); - - sysfs_addrm_finish(&acxt); - - if (sd) - return 0; - else - return -ENOENT; -} - /** * sysfs_remove_dir - remove an object's directory. * @kobj: object. @@ -920,57 +118,6 @@ void sysfs_remove_dir(struct kobject *kobj) } } -/** - * kernfs_rename_ns - move and rename a kernfs_node - * @sd: target node - * @new_parent: new parent to put @sd under - * @new_name: new name - * @new_ns: new namespace tag - */ -int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, - const char *new_name, const void *new_ns) -{ - int error; - - mutex_lock(&sysfs_mutex); - - error = 0; - if ((sd->s_parent == new_parent) && (sd->s_ns == new_ns) && - (strcmp(sd->s_name, new_name) == 0)) - goto out; /* nothing to rename */ - - error = -EEXIST; - if (kernfs_find_ns(new_parent, new_name, new_ns)) - goto out; - - /* rename sysfs_dirent */ - if (strcmp(sd->s_name, new_name) != 0) { - error = -ENOMEM; - new_name = kstrdup(new_name, GFP_KERNEL); - if (!new_name) - goto out; - - kfree(sd->s_name); - sd->s_name = new_name; - } - - /* - * Move to the appropriate place in the appropriate directories rbtree. 
- */ - sysfs_unlink_sibling(sd); - kernfs_get(new_parent); - kernfs_put(sd->s_parent); - sd->s_ns = new_ns; - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = new_parent; - sysfs_link_sibling(sd); - - error = 0; - out: - mutex_unlock(&sysfs_mutex); - return error; -} - int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { @@ -991,136 +138,3 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, return kernfs_rename_ns(sd, new_parent_sd, sd->s_name, new_ns); } - -/** - * kernfs_enable_ns - enable namespace under a directory - * @sd: directory of interest, should be empty - * - * This is to be called right after @sd is created to enable namespace - * under it. All children of @sd must have non-NULL namespace tags and - * only the ones which match the super_block's tag will be visible. - */ -void kernfs_enable_ns(struct sysfs_dirent *sd) -{ - WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); - WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children)); - sd->s_flags |= SYSFS_FLAG_NS; -} - -/* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct sysfs_dirent *sd) -{ - return (sd->s_mode >> 12) & 15; -} - -static int sysfs_dir_release(struct inode *inode, struct file *filp) -{ - kernfs_put(filp->private_data); - return 0; -} - -static struct sysfs_dirent *sysfs_dir_pos(const void *ns, - struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos) -{ - if (pos) { - int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && - pos->s_parent == parent_sd && - hash == pos->s_hash; - kernfs_put(pos); - if (!valid) - pos = NULL; - } - if (!pos && (hash > 1) && (hash < INT_MAX)) { - struct rb_node *node = parent_sd->s_dir.children.rb_node; - while (node) { - pos = to_sysfs_dirent(node); - - if (hash < pos->s_hash) - node = node->rb_left; - else if (hash > pos->s_hash) - node = node->rb_right; - else - break; - } - } - /* Skip over entries in the wrong namespace */ - while (pos && pos->s_ns != ns) { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } - return pos; -} - -static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, - struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) -{ - pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - do { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } while (pos && pos->s_ns != ns); - return pos; -} - -static int sysfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct dentry *dentry = file->f_path.dentry; - struct sysfs_dirent *parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos = file->private_data; - const void *ns = NULL; - - if (!dir_emit_dots(file, ctx)) - return 0; - mutex_lock(&sysfs_mutex); - - if (parent_sd->s_flags & SYSFS_FLAG_NS) - ns = sysfs_info(dentry->d_sb)->ns; - - for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); - pos; - pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { - const char *name = pos->s_name; - unsigned int type = dt_type(pos); - int len = strlen(name); - ino_t ino = pos->s_ino; - - ctx->pos = pos->s_hash; - file->private_data = pos; - kernfs_get(pos); - - mutex_unlock(&sysfs_mutex); - if (!dir_emit(ctx, name, len, ino, type)) - return 0; - mutex_lock(&sysfs_mutex); - } - mutex_unlock(&sysfs_mutex); - file->private_data = NULL; - ctx->pos = INT_MAX; - return 0; -} - -static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int 
whence) -{ - struct inode *inode = file_inode(file); - loff_t ret; - - mutex_lock(&inode->i_mutex); - ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -const struct file_operations sysfs_dir_operations = { - .read = generic_read_dir, - .iterate = sysfs_readdir, - .release = sysfs_dir_release, - .llseek = sysfs_dir_llseek, -}; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 93c1910783f..972b4a4a5f9 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -34,22 +34,9 @@ extern struct kmem_cache *sysfs_dir_cachep; /* * dir.c */ -extern struct mutex sysfs_mutex; extern spinlock_t sysfs_symlink_target_lock; -extern const struct dentry_operations sysfs_dentry_ops; -extern const struct file_operations sysfs_dir_operations; -extern const struct inode_operations sysfs_dir_inode_operations; - -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); -void sysfs_put_active(struct sysfs_dirent *sd); -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name); -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); - -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); /* * file.c -- cgit v1.2.3-70-g09d2 From 414985ae23c031efbd6d16d484dea8b5de28b8f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:34 -0500 Subject: sysfs, kernfs: move file core code to fs/kernfs/file.c Move core file code to fs/kernfs/file.c. fs/sysfs/file.c now contains sysfs kernfs_ops callbacks, sysfs wrappers around kernfs interfaces, and sysfs_schedule_callback(). The respective declarations in fs/sysfs/sysfs.h are moved to fs/kernfs/kernfs-internal.h. This is pure relocation. v2: Refreshed on top of the v2 of "sysfs, kernfs: prepare read path for kernfs". v3: Refreshed on top of the v3 of "sysfs, kernfs: prepare read path for kernfs". Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 805 ++++++++++++++++++++++++++++++++++++++++++++ fs/kernfs/kernfs-internal.h | 7 + fs/sysfs/file.c | 802 +------------------------------------------ fs/sysfs/sysfs.h | 4 - 4 files changed, 813 insertions(+), 805 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 90b1e88dad4..fa172e86047 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -7,3 +7,808 @@ * * This file is released under the GPLv2. */ + +#include +#include +#include +#include +#include +#include +#include + +#include "kernfs-internal.h" + +/* + * There's one sysfs_open_file for each open file and one sysfs_open_dirent + * for each sysfs_dirent with one or more open files. + * + * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is + * protected by sysfs_open_dirent_lock. + * + * filp->private_data points to seq_file whose ->private points to + * sysfs_open_file. sysfs_open_files are chained at + * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex. + */ +static DEFINE_SPINLOCK(sysfs_open_dirent_lock); +static DEFINE_MUTEX(sysfs_open_file_mutex); + +struct sysfs_open_dirent { + atomic_t refcnt; + atomic_t event; + wait_queue_head_t poll; + struct list_head files; /* goes through sysfs_open_file.list */ +}; + +static struct sysfs_open_file *sysfs_of(struct file *file) +{ + return ((struct seq_file *)file->private_data)->private; +} + +/* + * Determine the kernfs_ops for the given sysfs_dirent. 
This function must + * be called while holding an active reference. + */ +static const struct kernfs_ops *kernfs_ops(struct sysfs_dirent *sd) +{ + if (sd->s_flags & SYSFS_FLAG_LOCKDEP) + lockdep_assert_held(sd); + return sd->s_attr.ops; +} + +static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) +{ + struct sysfs_open_file *of = sf->private; + const struct kernfs_ops *ops; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!sysfs_get_active(of->sd)) + return ERR_PTR(-ENODEV); + + ops = kernfs_ops(of->sd); + if (ops->seq_start) { + return ops->seq_start(sf, ppos); + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; + } +} + +static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) +{ + struct sysfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->sd); + + if (ops->seq_next) { + return ops->seq_next(sf, v, ppos); + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } +} + +static void kernfs_seq_stop(struct seq_file *sf, void *v) +{ + struct sysfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->sd); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + + sysfs_put_active(of->sd); + mutex_unlock(&of->mutex); +} + +static int kernfs_seq_show(struct seq_file *sf, void *v) +{ + struct sysfs_open_file *of = sf->private; + + of->event = atomic_read(&of->sd->s_attr.open->event); + + return of->sd->s_attr.ops->seq_show(sf, v); +} + +static const struct seq_operations kernfs_seq_ops = { + .start = kernfs_seq_start, + .next = kernfs_seq_next, + .stop = kernfs_seq_stop, + .show = kernfs_seq_show, +}; + +/* + * As reading a bin file can have side-effects, the exact offset and bytes + * specified in read(2) call should be passed to the read callback making + * it difficult to use seq_file. Implement simplistic custom buffering for + * bin files. + */ +static ssize_t kernfs_file_direct_read(struct sysfs_open_file *of, + char __user *user_buf, size_t count, + loff_t *ppos) +{ + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. 
+ */ + mutex_lock(&of->mutex); + if (!sysfs_get_active(of->sd)) { + len = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; + } + + ops = kernfs_ops(of->sd); + if (ops->read) + len = ops->read(of, buf, len, *ppos); + else + len = -EINVAL; + + sysfs_put_active(of->sd); + mutex_unlock(&of->mutex); + + if (len < 0) + goto out_free; + + if (copy_to_user(user_buf, buf, len)) { + len = -EFAULT; + goto out_free; + } + + *ppos += len; + + out_free: + kfree(buf); + return len; +} + +/** + * kernfs_file_read - kernfs vfs read callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + */ +static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct sysfs_open_file *of = sysfs_of(file); + + if (of->sd->s_flags & SYSFS_FLAG_HAS_SEQ_SHOW) + return seq_read(file, user_buf, count, ppos); + else + return kernfs_file_direct_read(of, user_buf, count, ppos); +} + +/** + * kernfs_file_write - kernfs vfs write callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Copy data in from userland and pass it to the matching kernfs write + * operation. + * + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on + * the first write. Hint: if you're writing a value, first read the file, + * modify only the the value you're changing, then write entire buffer + * back. + */ +static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct sysfs_open_file *of = sysfs_of(file); + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. 
+ */ + mutex_lock(&of->mutex); + if (!sysfs_get_active(of->sd)) { + mutex_unlock(&of->mutex); + len = -ENODEV; + goto out_free; + } + + ops = kernfs_ops(of->sd); + if (ops->write) + len = ops->write(of, buf, len, *ppos); + else + len = -EINVAL; + + sysfs_put_active(of->sd); + mutex_unlock(&of->mutex); + + if (len > 0) + *ppos += len; +out_free: + kfree(buf); + return len; +} + +static void kernfs_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + + if (!of->vm_ops) + return; + + if (!sysfs_get_active(of->sd)) + return; + + if (of->vm_ops->open) + of->vm_ops->open(vma); + + sysfs_put_active(of->sd); +} + +static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active(of->sd)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (of->vm_ops->fault) + ret = of->vm_ops->fault(vma, vmf); + + sysfs_put_active(of->sd); + return ret; +} + +static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active(of->sd)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (of->vm_ops->page_mkwrite) + ret = of->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); + + sysfs_put_active(of->sd); + return ret; +} + +static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return -EINVAL; + + if (!sysfs_get_active(of->sd)) + return -EINVAL; + + ret = -EINVAL; + if (of->vm_ops->access) + ret = of->vm_ops->access(vma, addr, buf, len, write); + + sysfs_put_active(of->sd); + return ret; +} + +#ifdef CONFIG_NUMA +static int kernfs_vma_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!sysfs_get_active(of->sd)) + return -EINVAL; + + ret = 0; + if (of->vm_ops->set_policy) + ret = of->vm_ops->set_policy(vma, new); + + sysfs_put_active(of->sd); + return ret; +} + +static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + struct mempolicy *pol; + + if (!of->vm_ops) + return vma->vm_policy; + + if (!sysfs_get_active(of->sd)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (of->vm_ops->get_policy) + pol = of->vm_ops->get_policy(vma, addr); + + sysfs_put_active(of->sd); + return pol; +} + +static int kernfs_vma_migrate(struct vm_area_struct *vma, + const nodemask_t *from, const nodemask_t *to, + unsigned long flags) +{ + struct file *file = vma->vm_file; + struct sysfs_open_file *of = sysfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!sysfs_get_active(of->sd)) + return 0; + + ret = 0; + if (of->vm_ops->migrate) + ret = of->vm_ops->migrate(vma, from, to, flags); + + sysfs_put_active(of->sd); + return ret; +} +#endif + +static const struct vm_operations_struct kernfs_vm_ops = { + .open = kernfs_vma_open, + .fault = kernfs_vma_fault, + .page_mkwrite = kernfs_vma_page_mkwrite, + .access = kernfs_vma_access, 
+#ifdef CONFIG_NUMA + .set_policy = kernfs_vma_set_policy, + .get_policy = kernfs_vma_get_policy, + .migrate = kernfs_vma_migrate, +#endif +}; + +static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sysfs_open_file *of = sysfs_of(file); + const struct kernfs_ops *ops; + int rc; + + mutex_lock(&of->mutex); + + rc = -ENODEV; + if (!sysfs_get_active(of->sd)) + goto out_unlock; + + ops = kernfs_ops(of->sd); + if (ops->mmap) + rc = ops->mmap(of, vma); + if (rc) + goto out_put; + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (of->mmapped && of->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. + */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + of->mmapped = 1; + of->vm_ops = vma->vm_ops; + vma->vm_ops = &kernfs_vm_ops; +out_put: + sysfs_put_active(of->sd); +out_unlock: + mutex_unlock(&of->mutex); + + return rc; +} + +/** + * sysfs_get_open_dirent - get or create sysfs_open_dirent + * @sd: target sysfs_dirent + * @of: sysfs_open_file for this instance of open + * + * If @sd->s_attr.open exists, increment its reference count; + * otherwise, create one. @of is chained to the files list. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int sysfs_get_open_dirent(struct sysfs_dirent *sd, + struct sysfs_open_file *of) +{ + struct sysfs_open_dirent *od, *new_od = NULL; + + retry: + mutex_lock(&sysfs_open_file_mutex); + spin_lock_irq(&sysfs_open_dirent_lock); + + if (!sd->s_attr.open && new_od) { + sd->s_attr.open = new_od; + new_od = NULL; + } + + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->refcnt); + list_add_tail(&of->list, &od->files); + } + + spin_unlock_irq(&sysfs_open_dirent_lock); + mutex_unlock(&sysfs_open_file_mutex); + + if (od) { + kfree(new_od); + return 0; + } + + /* not there, initialize a new one and retry */ + new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); + if (!new_od) + return -ENOMEM; + + atomic_set(&new_od->refcnt, 0); + atomic_set(&new_od->event, 1); + init_waitqueue_head(&new_od->poll); + INIT_LIST_HEAD(&new_od->files); + goto retry; +} + +/** + * sysfs_put_open_dirent - put sysfs_open_dirent + * @sd: target sysfs_dirent + * @of: associated sysfs_open_file + * + * Put @sd->s_attr.open and unlink @of from the files list. If + * reference count reaches zero, disassociate and free it. + * + * LOCKING: + * None. 
+ */ +static void sysfs_put_open_dirent(struct sysfs_dirent *sd, + struct sysfs_open_file *of) +{ + struct sysfs_open_dirent *od = sd->s_attr.open; + unsigned long flags; + + mutex_lock(&sysfs_open_file_mutex); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); + + if (of) + list_del(&of->list); + + if (atomic_dec_and_test(&od->refcnt)) + sd->s_attr.open = NULL; + else + od = NULL; + + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); + mutex_unlock(&sysfs_open_file_mutex); + + kfree(od); +} + +static int kernfs_file_open(struct inode *inode, struct file *file) +{ + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + const struct kernfs_ops *ops; + struct sysfs_open_file *of; + bool has_read, has_write, has_mmap; + int error = -EACCES; + + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + ops = kernfs_ops(attr_sd); + + has_read = ops->seq_show || ops->read || ops->mmap; + has_write = ops->write || ops->mmap; + has_mmap = ops->mmap; + + /* check perms and supported operations */ + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; + + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + + /* allocate a sysfs_open_file for the file */ + error = -ENOMEM; + of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); + if (!of) + goto err_out; + + /* + * The following is done to give a different lockdep key to + * @of->mutex for files which implement mmap. This is a rather + * crude way to avoid false positive lockdep warning around + * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and + * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under + * which mm->mmap_sem nests, while holding @of->mutex. As each + * open file has a separate mutex, it's okay as long as those don't + * happen on the same file. At this point, we can't easily give + * each file a separate locking class. Let's differentiate on + * whether the file has mmap or not for now. + */ + if (has_mmap) + mutex_init(&of->mutex); + else + mutex_init(&of->mutex); + + of->sd = attr_sd; + of->file = file; + + /* + * Always instantiate seq_file even if read access doesn't use + * seq_file or is not requested. This unifies private data access + * and readable regular files are the vast majority anyway. 
+ */ + if (ops->seq_show) + error = seq_open(file, &kernfs_seq_ops); + else + error = seq_open(file, NULL); + if (error) + goto err_free; + + ((struct seq_file *)file->private_data)->private = of; + + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ + if (file->f_mode & FMODE_WRITE) + file->f_mode |= FMODE_PWRITE; + + /* make sure we have open dirent struct */ + error = sysfs_get_open_dirent(attr_sd, of); + if (error) + goto err_close; + + /* open succeeded, put active references */ + sysfs_put_active(attr_sd); + return 0; + +err_close: + seq_release(inode, file); +err_free: + kfree(of); +err_out: + sysfs_put_active(attr_sd); + return error; +} + +static int kernfs_file_release(struct inode *inode, struct file *filp) +{ + struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; + struct sysfs_open_file *of = sysfs_of(filp); + + sysfs_put_open_dirent(sd, of); + seq_release(inode, filp); + kfree(of); + + return 0; +} + +void sysfs_unmap_bin_file(struct sysfs_dirent *sd) +{ + struct sysfs_open_dirent *od; + struct sysfs_open_file *of; + + if (!(sd->s_flags & SYSFS_FLAG_HAS_MMAP)) + return; + + spin_lock_irq(&sysfs_open_dirent_lock); + od = sd->s_attr.open; + if (od) + atomic_inc(&od->refcnt); + spin_unlock_irq(&sysfs_open_dirent_lock); + if (!od) + return; + + mutex_lock(&sysfs_open_file_mutex); + list_for_each_entry(of, &od->files, list) { + struct inode *inode = file_inode(of->file); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + mutex_unlock(&sysfs_open_file_mutex); + + sysfs_put_open_dirent(sd, NULL); +} + +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_open_file *of = sysfs_of(filp); + struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; + struct sysfs_open_dirent *od = attr_sd->s_attr.open; + + /* need parent for the kobj, grab both */ + if (!sysfs_get_active(attr_sd)) + goto trigger; + + poll_wait(filp, &od->poll, wait); + + sysfs_put_active(attr_sd); + + if (of->event != atomic_read(&od->event)) + goto trigger; + + return DEFAULT_POLLMASK; + + trigger: + return DEFAULT_POLLMASK|POLLERR|POLLPRI; +} + +/** + * kernfs_notify - notify a kernfs file + * @sd: file to notify + * + * Notify @sd such that poll(2) on @sd wakes up. 
+ */ +void kernfs_notify(struct sysfs_dirent *sd) +{ + struct sysfs_open_dirent *od; + unsigned long flags; + + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); + + if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->event); + wake_up_interruptible(&od->poll); + } + } + + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); +} +EXPORT_SYMBOL_GPL(kernfs_notify); + +const struct file_operations kernfs_file_operations = { + .read = kernfs_file_read, + .write = kernfs_file_write, + .llseek = generic_file_llseek, + .mmap = kernfs_file_mmap, + .open = kernfs_file_open, + .release = kernfs_file_release, + .poll = kernfs_file_poll, +}; + +/** + * kernfs_create_file_ns_key - create a file + * @parent: directory to create the file in + * @name: name of the file + * @mode: mode of the file + * @size: size of the file + * @ops: kernfs operations for the file + * @priv: private data for the file + * @ns: optional namespace tag of the file + * @key: lockdep key for the file's active_ref, %NULL to disable lockdep + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct sysfs_dirent *kernfs_create_file_ns_key(struct sysfs_dirent *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + struct lock_class_key *key) +{ + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + int rc; + + sd = sysfs_new_dirent(name, (mode & S_IALLUGO) | S_IFREG, + SYSFS_KOBJ_ATTR); + if (!sd) + return ERR_PTR(-ENOMEM); + + sd->s_attr.ops = ops; + sd->s_attr.size = size; + sd->s_ns = ns; + sd->priv = priv; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (key) { + lockdep_init_map(&sd->dep_map, "s_active", key, 0); + sd->s_flags |= SYSFS_FLAG_LOCKDEP; + } +#endif + + /* + * sd->s_attr.ops is accesible only while holding active ref. We + * need to know whether some ops are implemented outside active + * ref. Cache their existence in flags. + */ + if (ops->seq_show) + sd->s_flags |= SYSFS_FLAG_HAS_SEQ_SHOW; + if (ops->mmap) + sd->s_flags |= SYSFS_FLAG_HAS_MMAP; + + sysfs_addrm_start(&acxt); + rc = sysfs_add_one(&acxt, sd, parent); + sysfs_addrm_finish(&acxt); + + if (rc) { + kernfs_put(sd); + return ERR_PTR(rc); + } + return sd; +} diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 31f0dbe1881..38e3a163e5a 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -142,4 +142,11 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); +/* + * file.c + */ +extern const struct file_operations kernfs_file_operations; + +void sysfs_unmap_bin_file(struct sysfs_dirent *sd); + #endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 7f0a79fa2ed..ac77d2be3c3 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -14,54 +14,12 @@ #include #include #include -#include -#include -#include #include #include -#include -#include #include -#include #include "sysfs.h" - -/* - * There's one sysfs_open_file for each open file and one sysfs_open_dirent - * for each sysfs_dirent with one or more open files. - * - * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is - * protected by sysfs_open_dirent_lock. - * - * filp->private_data points to seq_file whose ->private points to - * sysfs_open_file. 
sysfs_open_files are chained at - * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex. - */ -static DEFINE_SPINLOCK(sysfs_open_dirent_lock); -static DEFINE_MUTEX(sysfs_open_file_mutex); - -struct sysfs_open_dirent { - atomic_t refcnt; - atomic_t event; - wait_queue_head_t poll; - struct list_head files; /* goes through sysfs_open_file.list */ -}; - -static struct sysfs_open_file *sysfs_of(struct file *file) -{ - return ((struct seq_file *)file->private_data)->private; -} - -/* - * Determine the kernfs_ops for the given sysfs_dirent. This function must - * be called while holding an active reference. - */ -static const struct kernfs_ops *kernfs_ops(struct sysfs_dirent *sd) -{ - if (sd->s_flags & SYSFS_FLAG_LOCKDEP) - lockdep_assert_held(sd); - return sd->s_attr.ops; -} +#include "../kernfs/kernfs-internal.h" /* * Determine ktype->sysfs_ops for the given sysfs_dirent. This function @@ -143,147 +101,6 @@ static ssize_t sysfs_kf_bin_read(struct sysfs_open_file *of, char *buf, return battr->read(of->file, kobj, battr, buf, pos, count); } -static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) -{ - struct sysfs_open_file *of = sf->private; - const struct kernfs_ops *ops; - - /* - * @of->mutex nests outside active ref and is just to ensure that - * the ops aren't called concurrently for the same open file. - */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) - return ERR_PTR(-ENODEV); - - ops = kernfs_ops(of->sd); - if (ops->seq_start) { - return ops->seq_start(sf, ppos); - } else { - /* - * The same behavior and code as single_open(). Returns - * !NULL if pos is at the beginning; otherwise, NULL. - */ - return NULL + !*ppos; - } -} - -static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) -{ - struct sysfs_open_file *of = sf->private; - const struct kernfs_ops *ops = kernfs_ops(of->sd); - - if (ops->seq_next) { - return ops->seq_next(sf, v, ppos); - } else { - /* - * The same behavior and code as single_open(), always - * terminate after the initial read. - */ - ++*ppos; - return NULL; - } -} - -static void kernfs_seq_stop(struct seq_file *sf, void *v) -{ - struct sysfs_open_file *of = sf->private; - const struct kernfs_ops *ops = kernfs_ops(of->sd); - - if (ops->seq_stop) - ops->seq_stop(sf, v); - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); -} - -static int kernfs_seq_show(struct seq_file *sf, void *v) -{ - struct sysfs_open_file *of = sf->private; - - of->event = atomic_read(&of->sd->s_attr.open->event); - - return of->sd->s_attr.ops->seq_show(sf, v); -} - -static const struct seq_operations kernfs_seq_ops = { - .start = kernfs_seq_start, - .next = kernfs_seq_next, - .stop = kernfs_seq_stop, - .show = kernfs_seq_show, -}; - -/* - * As reading a bin file can have side-effects, the exact offset and bytes - * specified in read(2) call should be passed to the read callback making - * it difficult to use seq_file. Implement simplistic custom buffering for - * bin files. - */ -static ssize_t kernfs_file_direct_read(struct sysfs_open_file *of, - char __user *user_buf, size_t count, - loff_t *ppos) -{ - ssize_t len = min_t(size_t, count, PAGE_SIZE); - const struct kernfs_ops *ops; - char *buf; - - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - /* - * @of->mutex nests outside active ref and is just to ensure that - * the ops aren't called concurrently for the same open file. 
- */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - len = -ENODEV; - mutex_unlock(&of->mutex); - goto out_free; - } - - ops = kernfs_ops(of->sd); - if (ops->read) - len = ops->read(of, buf, len, *ppos); - else - len = -EINVAL; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (len < 0) - goto out_free; - - if (copy_to_user(user_buf, buf, len)) { - len = -EFAULT; - goto out_free; - } - - *ppos += len; - - out_free: - kfree(buf); - return len; -} - -/** - * kernfs_file_read - kernfs vfs read callback - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - */ -static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct sysfs_open_file *of = sysfs_of(file); - - if (of->sd->s_flags & SYSFS_FLAG_HAS_SEQ_SHOW) - return seq_read(file, user_buf, count, ppos); - else - return kernfs_file_direct_read(of, user_buf, count, ppos); -} - /* kernfs write callback for regular sysfs files */ static ssize_t sysfs_kf_write(struct sysfs_open_file *of, char *buf, size_t count, loff_t pos) @@ -319,67 +136,6 @@ static ssize_t sysfs_kf_bin_write(struct sysfs_open_file *of, char *buf, return battr->write(of->file, kobj, battr, buf, pos, count); } -/** - * kernfs_file_write - kernfs vfs write callback - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - * - * Copy data in from userland and pass it to the matching kernfs write - * operation. - * - * There is no easy way for us to know if userspace is only doing a partial - * write, so we don't support them. We expect the entire buffer to come on - * the first write. Hint: if you're writing a value, first read the file, - * modify only the the value you're changing, then write entire buffer - * back. - */ -static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct sysfs_open_file *of = sysfs_of(file); - ssize_t len = min_t(size_t, count, PAGE_SIZE); - const struct kernfs_ops *ops; - char *buf; - - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_free; - } - buf[len] = '\0'; /* guarantee string termination */ - - /* - * @of->mutex nests outside active ref and is just to ensure that - * the ops aren't called concurrently for the same open file. 
- */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - len = -ENODEV; - goto out_free; - } - - ops = kernfs_ops(of->sd); - if (ops->write) - len = ops->write(of, buf, len, *ppos); - else - len = -EINVAL; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (len > 0) - *ppos += len; -out_free: - kfree(buf); - return len; -} - static int sysfs_kf_bin_mmap(struct sysfs_open_file *of, struct vm_area_struct *vma) { @@ -392,490 +148,6 @@ static int sysfs_kf_bin_mmap(struct sysfs_open_file *of, return battr->mmap(of->file, kobj, battr, vma); } -static void kernfs_vma_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - - if (!of->vm_ops) - return; - - if (!sysfs_get_active(of->sd)) - return; - - if (of->vm_ops->open) - of->vm_ops->open(vma); - - sysfs_put_active(of->sd); -} - -static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(of->sd)) - return VM_FAULT_SIGBUS; - - ret = VM_FAULT_SIGBUS; - if (of->vm_ops->fault) - ret = of->vm_ops->fault(vma, vmf); - - sysfs_put_active(of->sd); - return ret; -} - -static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(of->sd)) - return VM_FAULT_SIGBUS; - - ret = 0; - if (of->vm_ops->page_mkwrite) - ret = of->vm_ops->page_mkwrite(vma, vmf); - else - file_update_time(file); - - sysfs_put_active(of->sd); - return ret; -} - -static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return -EINVAL; - - if (!sysfs_get_active(of->sd)) - return -EINVAL; - - ret = -EINVAL; - if (of->vm_ops->access) - ret = of->vm_ops->access(vma, addr, buf, len, write); - - sysfs_put_active(of->sd); - return ret; -} - -#ifdef CONFIG_NUMA -static int kernfs_vma_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!sysfs_get_active(of->sd)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - sysfs_put_active(of->sd); - return ret; -} - -static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!sysfs_get_active(of->sd)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr); - - sysfs_put_active(of->sd); - return pol; -} - -static int kernfs_vma_migrate(struct vm_area_struct *vma, - const nodemask_t *from, const nodemask_t *to, - unsigned long flags) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!sysfs_get_active(of->sd)) - return 0; - - ret = 0; - if (of->vm_ops->migrate) - ret = of->vm_ops->migrate(vma, from, to, flags); - - sysfs_put_active(of->sd); - 
return ret; -} -#endif - -static const struct vm_operations_struct kernfs_vm_ops = { - .open = kernfs_vma_open, - .fault = kernfs_vma_fault, - .page_mkwrite = kernfs_vma_page_mkwrite, - .access = kernfs_vma_access, -#ifdef CONFIG_NUMA - .set_policy = kernfs_vma_set_policy, - .get_policy = kernfs_vma_get_policy, - .migrate = kernfs_vma_migrate, -#endif -}; - -static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct sysfs_open_file *of = sysfs_of(file); - const struct kernfs_ops *ops; - int rc; - - mutex_lock(&of->mutex); - - rc = -ENODEV; - if (!sysfs_get_active(of->sd)) - goto out_unlock; - - ops = kernfs_ops(of->sd); - if (ops->mmap) - rc = ops->mmap(of, vma); - if (rc) - goto out_put; - - /* - * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() - * to satisfy versions of X which crash if the mmap fails: that - * substitutes a new vm_file, and we don't then want bin_vm_ops. - */ - if (vma->vm_file != file) - goto out_put; - - rc = -EINVAL; - if (of->mmapped && of->vm_ops != vma->vm_ops) - goto out_put; - - /* - * It is not possible to successfully wrap close. - * So error if someone is trying to use close. - */ - rc = -EINVAL; - if (vma->vm_ops && vma->vm_ops->close) - goto out_put; - - rc = 0; - of->mmapped = 1; - of->vm_ops = vma->vm_ops; - vma->vm_ops = &kernfs_vm_ops; -out_put: - sysfs_put_active(of->sd); -out_unlock: - mutex_unlock(&of->mutex); - - return rc; -} - -/** - * sysfs_get_open_dirent - get or create sysfs_open_dirent - * @sd: target sysfs_dirent - * @of: sysfs_open_file for this instance of open - * - * If @sd->s_attr.open exists, increment its reference count; - * otherwise, create one. @of is chained to the files list. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int sysfs_get_open_dirent(struct sysfs_dirent *sd, - struct sysfs_open_file *of) -{ - struct sysfs_open_dirent *od, *new_od = NULL; - - retry: - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irq(&sysfs_open_dirent_lock); - - if (!sd->s_attr.open && new_od) { - sd->s_attr.open = new_od; - new_od = NULL; - } - - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->refcnt); - list_add_tail(&of->list, &od->files); - } - - spin_unlock_irq(&sysfs_open_dirent_lock); - mutex_unlock(&sysfs_open_file_mutex); - - if (od) { - kfree(new_od); - return 0; - } - - /* not there, initialize a new one and retry */ - new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); - if (!new_od) - return -ENOMEM; - - atomic_set(&new_od->refcnt, 0); - atomic_set(&new_od->event, 1); - init_waitqueue_head(&new_od->poll); - INIT_LIST_HEAD(&new_od->files); - goto retry; -} - -/** - * sysfs_put_open_dirent - put sysfs_open_dirent - * @sd: target sysfs_dirent - * @of: associated sysfs_open_file - * - * Put @sd->s_attr.open and unlink @of from the files list. If - * reference count reaches zero, disassociate and free it. - * - * LOCKING: - * None. 
- */ -static void sysfs_put_open_dirent(struct sysfs_dirent *sd, - struct sysfs_open_file *of) -{ - struct sysfs_open_dirent *od = sd->s_attr.open; - unsigned long flags; - - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - - if (of) - list_del(&of->list); - - if (atomic_dec_and_test(&od->refcnt)) - sd->s_attr.open = NULL; - else - od = NULL; - - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); - mutex_unlock(&sysfs_open_file_mutex); - - kfree(od); -} - -static int kernfs_file_open(struct inode *inode, struct file *file) -{ - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - const struct kernfs_ops *ops; - struct sysfs_open_file *of; - bool has_read, has_write, has_mmap; - int error = -EACCES; - - if (!sysfs_get_active(attr_sd)) - return -ENODEV; - - ops = kernfs_ops(attr_sd); - - has_read = ops->seq_show || ops->read || ops->mmap; - has_write = ops->write || ops->mmap; - has_mmap = ops->mmap; - - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; - - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; - - /* allocate a sysfs_open_file for the file */ - error = -ENOMEM; - of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); - if (!of) - goto err_out; - - /* - * The following is done to give a different lockdep key to - * @of->mutex for files which implement mmap. This is a rather - * crude way to avoid false positive lockdep warning around - * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and - * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under - * which mm->mmap_sem nests, while holding @of->mutex. As each - * open file has a separate mutex, it's okay as long as those don't - * happen on the same file. At this point, we can't easily give - * each file a separate locking class. Let's differentiate on - * whether the file has mmap or not for now. - */ - if (has_mmap) - mutex_init(&of->mutex); - else - mutex_init(&of->mutex); - - of->sd = attr_sd; - of->file = file; - - /* - * Always instantiate seq_file even if read access doesn't use - * seq_file or is not requested. This unifies private data access - * and readable regular files are the vast majority anyway. 
- */ - if (ops->seq_show) - error = seq_open(file, &kernfs_seq_ops); - else - error = seq_open(file, NULL); - if (error) - goto err_free; - - ((struct seq_file *)file->private_data)->private = of; - - /* seq_file clears PWRITE unconditionally, restore it if WRITE */ - if (file->f_mode & FMODE_WRITE) - file->f_mode |= FMODE_PWRITE; - - /* make sure we have open dirent struct */ - error = sysfs_get_open_dirent(attr_sd, of); - if (error) - goto err_close; - - /* open succeeded, put active references */ - sysfs_put_active(attr_sd); - return 0; - -err_close: - seq_release(inode, file); -err_free: - kfree(of); -err_out: - sysfs_put_active(attr_sd); - return error; -} - -static int kernfs_file_release(struct inode *inode, struct file *filp) -{ - struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_file *of = sysfs_of(filp); - - sysfs_put_open_dirent(sd, of); - seq_release(inode, filp); - kfree(of); - - return 0; -} - -void sysfs_unmap_bin_file(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - struct sysfs_open_file *of; - - if (!(sd->s_flags & SYSFS_FLAG_HAS_MMAP)) - return; - - spin_lock_irq(&sysfs_open_dirent_lock); - od = sd->s_attr.open; - if (od) - atomic_inc(&od->refcnt); - spin_unlock_irq(&sysfs_open_dirent_lock); - if (!od) - return; - - mutex_lock(&sysfs_open_file_mutex); - list_for_each_entry(of, &od->files, list) { - struct inode *inode = file_inode(of->file); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); - } - mutex_unlock(&sysfs_open_file_mutex); - - sysfs_put_open_dirent(sd, NULL); -} - -/* Sysfs attribute files are pollable. The idea is that you read - * the content and then you use 'poll' or 'select' to wait for - * the content to change. When the content changes (assuming the - * manager for the kobject supports notification), poll will - * return POLLERR|POLLPRI, and select will return the fd whether - * it is waiting for read, write, or exceptions. - * Once poll/select indicates that the value has changed, you - * need to close and re-open the file, or seek to 0 and read again. - * Reminder: this only works for attributes which actively support - * it, and it is not possible to test an attribute from userspace - * to see if it supports poll (Neither 'poll' nor 'select' return - * an appropriate error code). When in doubt, set a suitable timeout value. - */ -static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) -{ - struct sysfs_open_file *of = sysfs_of(filp); - struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_dirent *od = attr_sd->s_attr.open; - - /* need parent for the kobj, grab both */ - if (!sysfs_get_active(attr_sd)) - goto trigger; - - poll_wait(filp, &od->poll, wait); - - sysfs_put_active(attr_sd); - - if (of->event != atomic_read(&od->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|POLLERR|POLLPRI; -} - -/** - * kernfs_notify - notify a kernfs file - * @sd: file to notify - * - * Notify @sd such that poll(2) on @sd wakes up. 
- */ -void kernfs_notify(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - unsigned long flags; - - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - - if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); - } - } - - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); -} -EXPORT_SYMBOL_GPL(kernfs_notify); - void sysfs_notify(struct kobject *k, const char *dir, const char *attr) { struct sysfs_dirent *sd = k->sd, *tmp; @@ -898,16 +170,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) } EXPORT_SYMBOL_GPL(sysfs_notify); -const struct file_operations kernfs_file_operations = { - .read = kernfs_file_read, - .write = kernfs_file_write, - .llseek = generic_file_llseek, - .mmap = kernfs_file_mmap, - .open = kernfs_file_open, - .release = kernfs_file_release, - .poll = kernfs_file_poll, -}; - static const struct kernfs_ops sysfs_file_kfops_empty = { }; @@ -996,68 +258,6 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, return 0; } -/** - * kernfs_create_file_ns_key - create a file - * @parent: directory to create the file in - * @name: name of the file - * @mode: mode of the file - * @size: size of the file - * @ops: kernfs operations for the file - * @priv: private data for the file - * @ns: optional namespace tag of the file - * @key: lockdep key for the file's active_ref, %NULL to disable lockdep - * - * Returns the created node on success, ERR_PTR() value on error. - */ -struct sysfs_dirent *kernfs_create_file_ns_key(struct sysfs_dirent *parent, - const char *name, - umode_t mode, loff_t size, - const struct kernfs_ops *ops, - void *priv, const void *ns, - struct lock_class_key *key) -{ - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - sd = sysfs_new_dirent(name, (mode & S_IALLUGO) | S_IFREG, - SYSFS_KOBJ_ATTR); - if (!sd) - return ERR_PTR(-ENOMEM); - - sd->s_attr.ops = ops; - sd->s_attr.size = size; - sd->s_ns = ns; - sd->priv = priv; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - if (key) { - lockdep_init_map(&sd->dep_map, "s_active", key, 0); - sd->s_flags |= SYSFS_FLAG_LOCKDEP; - } -#endif - - /* - * sd->s_attr.ops is accesible only while holding active ref. We - * need to know whether some ops are implemented outside active - * ref. Cache their existence in flags. 
- */ - if (ops->seq_show) - sd->s_flags |= SYSFS_FLAG_HAS_SEQ_SHOW; - if (ops->mmap) - sd->s_flags |= SYSFS_FLAG_HAS_MMAP; - - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, parent); - sysfs_addrm_finish(&acxt); - - if (rc) { - kernfs_put(sd); - return ERR_PTR(rc); - } - return sd; -} - int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, bool is_bin) { diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 972b4a4a5f9..4b8b60d834c 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -41,15 +41,11 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name); /* * file.c */ -extern const struct file_operations kernfs_file_operations; - int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, bool is_bin); - int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, const struct attribute *attr, bool is_bin, umode_t amode, const void *ns); -void sysfs_unmap_bin_file(struct sysfs_dirent *sd); /* * symlink.c -- cgit v1.2.3-70-g09d2 From 2072f1afddfe9fa00c1c0c79f8986707324ec65b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:35 -0500 Subject: sysfs, kernfs: move symlink core code to fs/kernfs/symlink.c Move core symlink code to fs/kernfs/symlink.c. fs/sysfs/symlink.c now only contains sysfs wrappers around kernfs interfaces. The respective declarations in fs/sysfs/sysfs.h are moved to fs/kernfs/kernfs-internal.h. This is pure relocation. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/kernfs-internal.h | 5 ++ fs/kernfs/symlink.c | 139 ++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/symlink.c | 137 ------------------------------------------- fs/sysfs/sysfs.h | 1 - 4 files changed, 144 insertions(+), 138 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 38e3a163e5a..62ae35f997f 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -149,4 +149,9 @@ extern const struct file_operations kernfs_file_operations; void sysfs_unmap_bin_file(struct sysfs_dirent *sd); +/* + * symlink.c + */ +extern const struct inode_operations sysfs_symlink_inode_operations; + #endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 2578715baf0..af3570bb430 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -7,3 +7,142 @@ * * This file is released under the GPLv2. */ + +#include +#include +#include + +#include "kernfs-internal.h" + +/** + * kernfs_create_link - create a symlink + * @parent: directory to create the symlink in + * @name: name of the symlink + * @target: target node for the symlink to point to + * + * Returns the created node on success, ERR_PTR() value on error. 
+ */ +struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, + const char *name, + struct sysfs_dirent *target) +{ + struct sysfs_dirent *sd; + struct sysfs_addrm_cxt acxt; + int error; + + sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + if (!sd) + return ERR_PTR(-ENOMEM); + + if (parent->s_flags & SYSFS_FLAG_NS) + sd->s_ns = target->s_ns; + sd->s_symlink.target_sd = target; + kernfs_get(target); /* ref owned by symlink */ + + sysfs_addrm_start(&acxt); + error = sysfs_add_one(&acxt, sd, parent); + sysfs_addrm_finish(&acxt); + + if (!error) + return sd; + + kernfs_put(sd); + return ERR_PTR(error); +} + +static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, + struct sysfs_dirent *target_sd, char *path) +{ + struct sysfs_dirent *base, *sd; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent_sd; + while (base->s_parent) { + sd = target_sd->s_parent; + while (sd->s_parent && base != sd) + sd = sd->s_parent; + + if (base == sd) + break; + + strcpy(s, "../"); + s += 3; + base = base->s_parent; + } + + /* determine end of target string for reverse fillup */ + sd = target_sd; + while (sd->s_parent && sd != base) { + len += strlen(sd->s_name) + 1; + sd = sd->s_parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + sd = target_sd; + while (sd->s_parent && sd != base) { + int slen = strlen(sd->s_name); + + len -= slen; + strncpy(s + len, sd->s_name, slen); + if (len) + s[--len] = '/'; + + sd = sd->s_parent; + } + + return 0; +} + +static int sysfs_getlink(struct dentry *dentry, char *path) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_dirent *parent_sd = sd->s_parent; + struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; + int error; + + mutex_lock(&sysfs_mutex); + error = sysfs_get_target_path(parent_sd, target_sd, path); + mutex_unlock(&sysfs_mutex); + + return error; +} + +static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + if (page) { + error = sysfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } + nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); + return NULL; +} + +static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); +} + +const struct inode_operations sysfs_symlink_inode_operations = { + .setxattr = sysfs_setxattr, + .readlink = generic_readlink, + .follow_link = sysfs_follow_link, + .put_link = sysfs_put_link, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .permission = sysfs_permission, +}; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index b137aa3a486..6797c9c2e43 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -11,53 +11,13 @@ */ #include -#include -#include #include #include -#include #include #include #include "sysfs.h" -/** - * kernfs_create_link - create a symlink - * @parent: directory to create the symlink in - * @name: name of the symlink - * @target: target node for the symlink to point to - * - * Returns the created node on success, ERR_PTR() value on error. 
- */ -struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, - const char *name, - struct sysfs_dirent *target) -{ - struct sysfs_dirent *sd; - struct sysfs_addrm_cxt acxt; - int error; - - sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!sd) - return ERR_PTR(-ENOMEM); - - if (parent->s_flags & SYSFS_FLAG_NS) - sd->s_ns = target->s_ns; - sd->s_symlink.target_sd = target; - kernfs_get(target); /* ref owned by symlink */ - - sysfs_addrm_start(&acxt); - error = sysfs_add_one(&acxt, sd, parent); - sysfs_addrm_finish(&acxt); - - if (!error) - return sd; - - kernfs_put(sd); - return ERR_PTR(error); -} - - static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, struct kobject *target, const char *name, int warn) @@ -235,100 +195,3 @@ out: return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); - -static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, - struct sysfs_dirent *target_sd, char *path) -{ - struct sysfs_dirent *base, *sd; - char *s = path; - int len = 0; - - /* go up to the root, stop at the base */ - base = parent_sd; - while (base->s_parent) { - sd = target_sd->s_parent; - while (sd->s_parent && base != sd) - sd = sd->s_parent; - - if (base == sd) - break; - - strcpy(s, "../"); - s += 3; - base = base->s_parent; - } - - /* determine end of target string for reverse fillup */ - sd = target_sd; - while (sd->s_parent && sd != base) { - len += strlen(sd->s_name) + 1; - sd = sd->s_parent; - } - - /* check limits */ - if (len < 2) - return -EINVAL; - len--; - if ((s - path) + len > PATH_MAX) - return -ENAMETOOLONG; - - /* reverse fillup of target string from target to base */ - sd = target_sd; - while (sd->s_parent && sd != base) { - int slen = strlen(sd->s_name); - - len -= slen; - strncpy(s + len, sd->s_name, slen); - if (len) - s[--len] = '/'; - - sd = sd->s_parent; - } - - return 0; -} - -static int sysfs_getlink(struct dentry *dentry, char *path) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; - int error; - - mutex_lock(&sysfs_mutex); - error = sysfs_get_target_path(parent_sd, target_sd, path); - mutex_unlock(&sysfs_mutex); - - return error; -} - -static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - int error = -ENOMEM; - unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) { - error = sysfs_getlink(dentry, (char *) page); - if (error < 0) - free_page((unsigned long)page); - } - nd_set_link(nd, error ? 
ERR_PTR(error) : (char *)page); - return NULL; -} - -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *page = nd_get_link(nd); - if (!IS_ERR(page)) - free_page((unsigned long)page); -} - -const struct inode_operations sysfs_symlink_inode_operations = { - .setxattr = sysfs_setxattr, - .readlink = generic_readlink, - .follow_link = sysfs_follow_link, - .put_link = sysfs_put_link, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .permission = sysfs_permission, -}; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 4b8b60d834c..6d0dcead2d6 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -50,7 +50,6 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, /* * symlink.c */ -extern const struct inode_operations sysfs_symlink_inode_operations; int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, const char *name); -- cgit v1.2.3-70-g09d2 From ccc532dc12af9dd95dc1cd13e9a6d5f8f08c0842 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:36 -0500 Subject: sysfs, kernfs: drop unused params from sysfs_fill_super() sysfs_fill_super() takes three params - @sb, @data and @silent - but uses only @sb. Drop the latter two. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 852d11519f9..21070c286e3 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -41,7 +41,7 @@ struct sysfs_dirent sysfs_root = { .s_ino = 1, }; -static int sysfs_fill_super(struct super_block *sb, void *data, int silent) +static int sysfs_fill_super(struct super_block *sb) { struct inode *inode; struct dentry *root; @@ -123,7 +123,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { - error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + error = sysfs_fill_super(sb); if (error) { deactivate_locked_super(sb); return ERR_PTR(error); -- cgit v1.2.3-70-g09d2 From 51a35e9fd0f229d2f84455ee7e85a5d30fa35594 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:37 -0500 Subject: sysfs, kernfs: make sysfs_super_info->ns const Add const qualifier to sysfs_super_info->ns so that it's consistent with other namespace tag usages in sysfs. Because kobject doesn't use const qualifier for namespace tags, this ends up requiring an explicit cast to drop const qualifier in free_sysfs_super_info(). Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 2 +- fs/sysfs/sysfs.h | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 21070c286e3..fcbe5e80aee 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -92,7 +92,7 @@ static int sysfs_set_super(struct super_block *sb, void *data) static void free_sysfs_super_info(struct sysfs_super_info *info) { - kobj_ns_drop(KOBJ_NS_TYPE_NET, info->ns); + kobj_ns_drop(KOBJ_NS_TYPE_NET, (void *)info->ns); kfree(info); } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 6d0dcead2d6..ce97907e289 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -18,14 +18,14 @@ * mount.c */ -/* - * Each sb is associated with one namespace tag, currently the network - * namespace of the task which mounted this sysfs instance. If multiple - * tags become necessary, make the following an array and compare - * sysfs_dirent tag against every entry. 
- */ struct sysfs_super_info { - void *ns; + /* + * Each sb is associated with one namespace tag, currently the network + * namespace of the task which mounted this sysfs instance. If multiple + * tags become necessary, make the following an array and compare + * sysfs_dirent tag against every entry. + */ + const void *ns; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; -- cgit v1.2.3-70-g09d2 From 9e30cc9595303b27b48be49b7bcd4d0679e34253 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:38 -0500 Subject: sysfs, kernfs: no need to kern_mount() sysfs from sysfs_init() It has been a very long time since sysfs depended on vfs to keep track of internal state, and whether sysfs is mounted or not makes no difference to sysfs's internal operation. In addition to init and filesystem type registration, sysfs_init() invokes kern_mount() to create an in-kernel mount of sysfs. This internal mounting doesn't serve any purpose anymore. Remove it. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index fcbe5e80aee..0c80f037901 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -24,7 +24,6 @@ #include "sysfs.h" -static struct vfsmount *sysfs_mnt; struct kmem_cache *sysfs_dir_cachep; static const struct super_operations sysfs_ops = { @@ -153,34 +152,26 @@ static struct file_system_type sysfs_fs_type = { int __init sysfs_init(void) { - int err = -ENOMEM; + int err; sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", sizeof(struct sysfs_dirent), 0, 0, NULL); if (!sysfs_dir_cachep) - goto out; + return -ENOMEM; err = sysfs_inode_init(); if (err) goto out_err; err = register_filesystem(&sysfs_fs_type); - if (!err) { - sysfs_mnt = kern_mount(&sysfs_fs_type); - if (IS_ERR(sysfs_mnt)) { - printk(KERN_ERR "sysfs: could not mount!\n"); - err = PTR_ERR(sysfs_mnt); - sysfs_mnt = NULL; - unregister_filesystem(&sysfs_fs_type); - goto out_err; - } - } else + if (err) goto out_err; -out: - return err; + + return 0; + out_err: kmem_cache_destroy(sysfs_dir_cachep); sysfs_dir_cachep = NULL; - goto out; + return err; } -- cgit v1.2.3-70-g09d2 From 061447a496b915f1dc8f8c645c6825f856d2bbac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:39 -0500 Subject: sysfs, kernfs: introduce sysfs_root_sd Currently, it's assumed that there's a single kernfs hierarchy in the system anchored at sysfs_root which is defined as a global struct. To allow other users of kernfs, this will be made dynamic. Introduce a new global variable sysfs_root_sd which points to &sysfs_root and convert all &sysfs_root users. This patch doesn't introduce any behavior difference.
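As an illustration of the conversion this patch applies (the helper name below is invented for this sketch and is not part of the patch), code that used to take the address of the static sysfs_root now resolves the root through the new pointer, which a later patch in the series points at a dynamically allocated root:

/* Minimal sketch, mirroring the fs/sysfs/dir.c hunk below. */
static struct sysfs_dirent *example_parent_sd(struct kobject *kobj)
{
        if (kobj->parent)
                return kobj->parent->sd;
        return sysfs_root_sd;
}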
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 ++-- fs/sysfs/mount.c | 8 +++++--- fs/sysfs/symlink.c | 6 +++--- fs/sysfs/sysfs.h | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e5c4e711805..2fea501889e 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -68,7 +68,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) if (kobj->parent) parent_sd = kobj->parent->sd; else - parent_sd = &sysfs_root; + parent_sd = sysfs_root_sd; if (!parent_sd) return -ENOENT; @@ -134,7 +134,7 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, BUG_ON(!sd->s_parent); new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? - new_parent_kobj->sd : &sysfs_root; + new_parent_kobj->sd : sysfs_root_sd; return kernfs_rename_ns(sd, new_parent_sd, sd->s_name, new_ns); } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 0c80f037901..7cbd1fce282 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -32,7 +32,7 @@ static const struct super_operations sysfs_ops = { .evict_inode = sysfs_evict_inode, }; -struct sysfs_dirent sysfs_root = { +static struct sysfs_dirent sysfs_root = { .s_name = "", .s_count = ATOMIC_INIT(1), .s_flags = SYSFS_DIR, @@ -40,6 +40,8 @@ struct sysfs_dirent sysfs_root = { .s_ino = 1, }; +struct sysfs_dirent *sysfs_root_sd = &sysfs_root; + static int sysfs_fill_super(struct super_block *sb) { struct inode *inode; @@ -53,7 +55,7 @@ static int sysfs_fill_super(struct super_block *sb) /* get root inode, initialize and unlock it */ mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(sb, &sysfs_root); + inode = sysfs_get_inode(sb, sysfs_root_sd); mutex_unlock(&sysfs_mutex); if (!inode) { pr_debug("sysfs: could not get root inode\n"); @@ -66,7 +68,7 @@ static int sysfs_fill_super(struct super_block *sb) pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } - root->d_fsdata = &sysfs_root; + root->d_fsdata = sysfs_root_sd; sb->s_root = root; sb->s_d_op = &sysfs_dentry_ops; return 0; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 6797c9c2e43..62f0e014ec4 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -70,7 +70,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, struct sysfs_dirent *parent_sd = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent_sd = sysfs_root_sd; else parent_sd = kobj->sd; @@ -144,7 +144,7 @@ void sysfs_remove_link(struct kobject *kobj, const char *name) struct sysfs_dirent *parent_sd = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent_sd = sysfs_root_sd; else parent_sd = kobj->sd; @@ -170,7 +170,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, int result; if (!kobj) - parent_sd = &sysfs_root; + parent_sd = sysfs_root_sd; else parent_sd = kobj->sd; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ce97907e289..2b217cef90b 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -28,7 +28,7 @@ struct sysfs_super_info { const void *ns; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) -extern struct sysfs_dirent sysfs_root; +extern struct sysfs_dirent *sysfs_root_sd; extern struct kmem_cache *sysfs_dir_cachep; /* -- cgit v1.2.3-70-g09d2 From ba7443bc656e5236c316b2acacc8b551f872910f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:40 -0500 Subject: sysfs, kernfs: implement kernfs_create/destroy_root() There currently is single kernfs hierarchy in the whole system which is used for sysfs. 
kernfs needs to support multiple hierarchies to allow other users. This patch introduces struct kernfs_root which serves as the root of each kernfs hierarchy and implements kernfs_create/destroy_root(). * Each kernfs_root is associated with a root sd (sysfs_dirent). The root is freed when the root sd is released and kernfs_destroy_root() simply invokes kernfs_remove() on the root sd. sysfs_remove_one() is updated to handle release of the root sd. Note that ps_iattr update in sysfs_remove_one() is trivially updated for readability. * Root sd's are now dynamically allocated using sysfs_new_dirent(). Update sysfs_alloc_ino() so that it gives out ino from 1 so that the root sd still gets ino 1. * While kernfs_root currently only points to the root sd, it'll soon grow fields which are specific to each hierarchy. As determining a given sd's root will be necessary, sd->s_dir.root is added. This backlink fits better as a separate field in sd; however, sd->s_dir is inside a union with space to spare, so use it to save space and provide the kernfs_root() accessor to determine the root sd. * As hierarchies may be destroyed now, each mount needs to hold onto the hierarchy it's attached to. Update sysfs_fill_super() and sysfs_kill_sb() so that they get and put the kernfs_root respectively. * sysfs_root is replaced with kernfs_root which is dynamically created by invoking kernfs_create_root() from sysfs_init(). This patch doesn't introduce any visible behavior changes. v2: kernfs_create_root() forgot to set @sd->priv. Fixed. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 71 +++++++++++++++++++++++++++++++++++++++------ fs/kernfs/kernfs-internal.h | 20 +++++++++++++ fs/sysfs/mount.c | 29 +++++++++++------- include/linux/kernfs.h | 13 +++++++ 4 files changed, 113 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a4ca4de3cb2..246740a741e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -211,7 +211,7 @@ static int sysfs_alloc_ino(unsigned int *pino) retry: spin_lock(&sysfs_ino_lock); - rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); + rc = ida_get_new_above(&sysfs_ino_ida, 1, &ino); spin_unlock(&sysfs_ino_lock); if (rc == -EAGAIN) { @@ -253,9 +253,11 @@ EXPORT_SYMBOL_GPL(kernfs_get); void kernfs_put(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd; + struct kernfs_root *root; if (!sd || !atomic_dec_and_test(&sd->s_count)) return; + root = kernfs_root(sd); repeat: /* Moving/renaming is always done while holding reference. * sd->s_parent won't change beneath us.
@@ -278,8 +280,13 @@ void kernfs_put(struct sysfs_dirent *sd) kmem_cache_free(sysfs_dir_cachep, sd); sd = parent_sd; - if (sd && atomic_dec_and_test(&sd->s_count)) - goto repeat; + if (sd) { + if (atomic_dec_and_test(&sd->s_count)) + goto repeat; + } else { + /* just released the root sd, free @root too */ + kfree(root); + } } EXPORT_SYMBOL_GPL(kernfs_put); @@ -493,13 +500,15 @@ static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, if (sd->s_flags & SYSFS_FLAG_REMOVED) return; - sysfs_unlink_sibling(sd); + if (sd->s_parent) { + sysfs_unlink_sibling(sd); - /* Update timestamps on the parent */ - ps_iattr = sd->s_parent->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + /* Update timestamps on the parent */ + ps_iattr = sd->s_parent->s_iattr; + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } } sd->s_flags |= SYSFS_FLAG_REMOVED; @@ -603,6 +612,49 @@ struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); +/** + * kernfs_create_root - create a new kernfs hierarchy + * @priv: opaque data associated with the new directory + * + * Returns the root of the new hierarchy on success, ERR_PTR() value on + * failure. + */ +struct kernfs_root *kernfs_create_root(void *priv) +{ + struct kernfs_root *root; + struct sysfs_dirent *sd; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + sd = sysfs_new_dirent("", S_IFDIR | S_IRUGO | S_IXUGO, SYSFS_DIR); + if (!sd) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + + sd->s_flags &= ~SYSFS_FLAG_REMOVED; + sd->priv = priv; + sd->s_dir.root = root; + + root->sd = sd; + + return root; +} + +/** + * kernfs_destroy_root - destroy a kernfs hierarchy + * @root: root of the hierarchy to destroy + * + * Destroy the hierarchy anchored at @root by removing all existing + * directories and destroying @root. + */ +void kernfs_destroy_root(struct kernfs_root *root) +{ + kernfs_remove(root->sd); /* will also free @root */ +} + /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory @@ -626,6 +678,7 @@ struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, if (!sd) return ERR_PTR(-ENOMEM); + sd->s_dir.root = parent->s_dir.root; sd->s_ns = ns; sd->priv = priv; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 62ae35f997f..7dfe0627835 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -25,6 +25,12 @@ struct sysfs_elem_dir { unsigned long subdirs; /* children rbtree starts here and goes through sd->s_rb */ struct rb_root children; + + /* + * The kernfs hierarchy this directory belongs to. This fits + * better directly in sysfs_dirent but is here to save space. + */ + struct kernfs_root *root; }; struct sysfs_elem_symlink { @@ -104,6 +110,20 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) return sd->s_flags & SYSFS_TYPE_MASK; } +/** + * kernfs_root - find out the kernfs_root a sysfs_dirent belongs to + * @sd: sysfs_dirent of interest + * + * Return the kernfs_root @sd belongs to. + */ +static inline struct kernfs_root *kernfs_root(struct sysfs_dirent *sd) +{ + /* if parent exists, it's always a dir; otherwise, @sd is a dir */ + if (sd->s_parent) + sd = sd->s_parent; + return sd->s_dir.root; +} + /* * Context structure to be used while adding/removing nodes. 
*/ diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 7cbd1fce282..0b5661b462f 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -32,15 +32,8 @@ static const struct super_operations sysfs_ops = { .evict_inode = sysfs_evict_inode, }; -static struct sysfs_dirent sysfs_root = { - .s_name = "", - .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR, - .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, - .s_ino = 1, -}; - -struct sysfs_dirent *sysfs_root_sd = &sysfs_root; +static struct kernfs_root *sysfs_root; +struct sysfs_dirent *sysfs_root_sd; static int sysfs_fill_super(struct super_block *sb) { @@ -68,6 +61,7 @@ static int sysfs_fill_super(struct super_block *sb) pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } + kernfs_get(sysfs_root_sd); root->d_fsdata = sysfs_root_sd; sb->s_root = root; sb->s_d_op = &sysfs_dentry_ops; @@ -138,11 +132,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, static void sysfs_kill_sb(struct super_block *sb) { struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances + struct sysfs_dirent *root_sd = sb->s_root->d_fsdata; + + /* + * Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing sysfs_super_info. */ kill_anon_super(sb); free_sysfs_super_info(info); + kernfs_put(root_sd); } static struct file_system_type sysfs_fs_type = { @@ -166,12 +164,21 @@ int __init sysfs_init(void) if (err) goto out_err; + sysfs_root = kernfs_create_root(NULL); + if (IS_ERR(sysfs_root)) { + err = PTR_ERR(sysfs_root); + goto out_err; + } + sysfs_root_sd = sysfs_root->sd; + err = register_filesystem(&sysfs_fs_type); if (err) - goto out_err; + goto out_destroy_root; return 0; +out_destroy_root: + kernfs_destroy_root(sysfs_root); out_err: kmem_cache_destroy(sysfs_dir_cachep); sysfs_dir_cachep = NULL; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index fd8f574ef2f..f75548b8ed7 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -20,6 +20,11 @@ struct vm_area_struct; struct sysfs_dirent; +struct kernfs_root { + /* published fields */ + struct sysfs_dirent *sd; +}; + struct sysfs_open_file { /* published fields */ struct sysfs_dirent *sd; @@ -76,6 +81,9 @@ struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, void kernfs_get(struct sysfs_dirent *sd); void kernfs_put(struct sysfs_dirent *sd); +struct kernfs_root *kernfs_create_root(void *priv); +void kernfs_destroy_root(struct kernfs_root *root); + struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, const void *ns); @@ -107,6 +115,11 @@ kernfs_find_and_get_ns(struct sysfs_dirent *parent, const char *name, static inline void kernfs_get(struct sysfs_dirent *sd) { } static inline void kernfs_put(struct sysfs_dirent *sd) { } +static inline struct kernfs_root *kernfs_create_root(void *priv) +{ return ERR_PTR(-ENOSYS); } + +static inline void kernfs_destroy_root(struct kernfs_root *root) { } + static inline struct sysfs_dirent * kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, const void *ns) -- cgit v1.2.3-70-g09d2 From bc755553df9ab33f389c1a0a8bd0b4f4646e80ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:41 -0500 Subject: sysfs, kernfs: make inode number ida per kernfs_root kernfs is being updated to allow multiple sysfs_dirent hierarchies so that it can also be used by other users. 
Currently, inode number is allocated using a global ida, sysfs_ino_ida; however, inos for different hierarchies should be handled separately. This patch makes ino allocation per kernfs_root. sysfs_ino_ida is replaced by kernfs_root->ino_ida and sysfs_new_dirent() is updated to take @root and allocate ino from it. ida_simple_get/remove() are used instead of sysfs_ino_lock and sysfs_alloc/free_ino(). Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 47 +++++++++++++-------------------------------- fs/kernfs/file.c | 4 ++-- fs/kernfs/kernfs-internal.h | 3 ++- fs/kernfs/symlink.c | 3 ++- include/linux/kernfs.h | 4 ++++ 5 files changed, 23 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 246740a741e..eaffa83719d 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -21,9 +21,6 @@ DEFINE_MUTEX(sysfs_mutex); #define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb) -static DEFINE_SPINLOCK(sysfs_ino_lock); -static DEFINE_IDA(sysfs_ino_ida); - /** * sysfs_name_hash * @name: Null terminated string to hash @@ -205,32 +202,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) rwsem_release(&sd->dep_map, 1, _RET_IP_); } -static int sysfs_alloc_ino(unsigned int *pino) -{ - int ino, rc; - - retry: - spin_lock(&sysfs_ino_lock); - rc = ida_get_new_above(&sysfs_ino_ida, 1, &ino); - spin_unlock(&sysfs_ino_lock); - - if (rc == -EAGAIN) { - if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) - goto retry; - rc = -ENOMEM; - } - - *pino = ino; - return rc; -} - -static void sysfs_free_ino(unsigned int ino) -{ - spin_lock(&sysfs_ino_lock); - ida_remove(&sysfs_ino_ida, ino); - spin_unlock(&sysfs_ino_lock); -} - /** * kernfs_get - get a reference count on a sysfs_dirent * @sd: the target sysfs_dirent @@ -276,7 +247,7 @@ void kernfs_put(struct sysfs_dirent *sd) security_release_secctx(sd->s_iattr->ia_secdata, sd->s_iattr->ia_secdata_len); kfree(sd->s_iattr); - sysfs_free_ino(sd->s_ino); + ida_simple_remove(&root->ino_ida, sd->s_ino); kmem_cache_free(sysfs_dir_cachep, sd); sd = parent_sd; @@ -285,6 +256,7 @@ void kernfs_put(struct sysfs_dirent *sd) goto repeat; } else { /* just released the root sd, free @root too */ + ida_destroy(&root->ino_ida); kfree(root); } } @@ -360,10 +332,12 @@ const struct dentry_operations sysfs_dentry_ops = { .d_release = sysfs_dentry_release, }; -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) +struct sysfs_dirent *sysfs_new_dirent(struct kernfs_root *root, + const char *name, umode_t mode, int type) { char *dup_name = NULL; struct sysfs_dirent *sd; + int ret; if (type & SYSFS_COPY_NAME) { name = dup_name = kstrdup(name, GFP_KERNEL); @@ -375,8 +349,10 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) if (!sd) goto err_out1; - if (sysfs_alloc_ino(&sd->s_ino)) + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + if (ret < 0) goto err_out2; + sd->s_ino = ret; atomic_set(&sd->s_count, 1); atomic_set(&sd->s_active, 0); @@ -628,8 +604,11 @@ struct kernfs_root *kernfs_create_root(void *priv) if (!root) return ERR_PTR(-ENOMEM); - sd = sysfs_new_dirent("", S_IFDIR | S_IRUGO | S_IXUGO, SYSFS_DIR); + ida_init(&root->ino_ida); + + sd = sysfs_new_dirent(root, "", S_IFDIR | S_IRUGO | S_IXUGO, SYSFS_DIR); if (!sd) { + ida_destroy(&root->ino_ida); kfree(root); return ERR_PTR(-ENOMEM); } @@ -674,7 +653,7 @@ struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, int rc; /* allocate */ - sd = sysfs_new_dirent(name, mode, 
SYSFS_DIR); + sd = sysfs_new_dirent(kernfs_root(parent), name, mode, SYSFS_DIR); if (!sd) return ERR_PTR(-ENOMEM); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index fa172e86047..990c97fa704 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -775,8 +775,8 @@ struct sysfs_dirent *kernfs_create_file_ns_key(struct sysfs_dirent *parent, struct sysfs_dirent *sd; int rc; - sd = sysfs_new_dirent(name, (mode & S_IALLUGO) | S_IFREG, - SYSFS_KOBJ_ATTR); + sd = sysfs_new_dirent(kernfs_root(parent), name, + (mode & S_IALLUGO) | S_IFREG, SYSFS_KOBJ_ATTR); if (!sd) return ERR_PTR(-ENOMEM); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 7dfe0627835..466943d576f 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -160,7 +160,8 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); +struct sysfs_dirent *sysfs_new_dirent(struct kernfs_root *root, + const char *name, umode_t mode, int type); /* * file.c diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index af3570bb430..004c1646559 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -30,7 +30,8 @@ struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, struct sysfs_addrm_cxt acxt; int error; - sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + sd = sysfs_new_dirent(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, + SYSFS_KOBJ_LINK); if (!sd) return ERR_PTR(-ENOMEM); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index f75548b8ed7..fad8b986800 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -11,6 +11,7 @@ #include #include #include +#include #include struct file; @@ -23,6 +24,9 @@ struct sysfs_dirent; struct kernfs_root { /* published fields */ struct sysfs_dirent *sd; + + /* private fields, do not use outside kernfs proper */ + struct ida ino_ida; }; struct sysfs_open_file { -- cgit v1.2.3-70-g09d2 From df394fb56c64244b30b442e9e02de1a2d9c5a98b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:42 -0500 Subject: sysfs, kernfs: make super_blocks bind to different kernfs_roots kernfs is being updated to allow multiple sysfs_dirent hierarchies so that it can also be used by other users. Currently, sysfs super_blocks are always attached to one kernfs_root - sysfs_root - and distinguished only by their namespace tags. This patch adds sysfs_super_info->root and update sysfs_fill/test_super() so that super_blocks are identified by the combination of both the associated kernfs_root and namespace tag. This allows mounting different kernfs hierarchies. 
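In outline, super_block identity now becomes the (kernfs_root, namespace tag) pair. The condensed sketch below is equivalent to the sysfs_test_super() change in the diff that follows; the helper name is invented for this note:

/*
 * Two mounts share a super_block only when they refer to the same
 * kernfs hierarchy and carry the same namespace tag.
 */
static bool example_same_super(struct sysfs_super_info *a,
                               struct sysfs_super_info *b)
{
        return a->root == b->root && a->ns == b->ns;
}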
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 10 ++++++---- fs/sysfs/sysfs.h | 6 ++++++ 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 0b5661b462f..f143b203a7e 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -37,6 +37,7 @@ struct sysfs_dirent *sysfs_root_sd; static int sysfs_fill_super(struct super_block *sb) { + struct sysfs_super_info *info = sysfs_info(sb); struct inode *inode; struct dentry *root; @@ -48,7 +49,7 @@ static int sysfs_fill_super(struct super_block *sb) /* get root inode, initialize and unlock it */ mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(sb, sysfs_root_sd); + inode = sysfs_get_inode(sb, info->root->sd); mutex_unlock(&sysfs_mutex); if (!inode) { pr_debug("sysfs: could not get root inode\n"); @@ -61,8 +62,8 @@ static int sysfs_fill_super(struct super_block *sb) pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } - kernfs_get(sysfs_root_sd); - root->d_fsdata = sysfs_root_sd; + kernfs_get(info->root->sd); + root->d_fsdata = info->root->sd; sb->s_root = root; sb->s_d_op = &sysfs_dentry_ops; return 0; @@ -73,7 +74,7 @@ static int sysfs_test_super(struct super_block *sb, void *data) struct sysfs_super_info *sb_info = sysfs_info(sb); struct sysfs_super_info *info = data; - return sb_info->ns == info->ns; + return sb_info->root == info->root && sb_info->ns == info->ns; } static int sysfs_set_super(struct super_block *sb, void *data) @@ -110,6 +111,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, if (!info) return ERR_PTR(-ENOMEM); + info->root = sysfs_root; info->ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 2b217cef90b..93b4b68458a 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -19,6 +19,12 @@ */ struct sysfs_super_info { + /* + * The root associated with this super_block. Each super_block is + * identified by the root and ns it's associated with. + */ + struct kernfs_root *root; + /* * Each sb is associated with one namespace tag, currently the network * namespace of the task which mounted this sysfs instance. If multiple -- cgit v1.2.3-70-g09d2 From 4b93dc9b1c684d0587fe44d36bbfdf45bd3bea9d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:43 -0500 Subject: sysfs, kernfs: prepare mount path for kernfs We're in the process of separating out core sysfs functionality into kernfs which will deal with sysfs_dirents directly. This patch rearranges mount path so that the kernfs and sysfs parts are separate. * As sysfs_super_info won't be visible outside kernfs proper, kernfs_super_ns() is added to allow kernfs users to access a super_block's namespace tag. * Generic mount operation is separated out into kernfs_mount_ns(). sysfs_mount() now just performs sysfs-specific permission check, acquires namespace tag, and invokes kernfs_mount_ns(). * Generic superblock release is separated out into kernfs_kill_sb() which can be used directly as file_system_type->kill_sb(). As sysfs needs to put the namespace tag, sysfs_kill_sb() wraps kernfs_kill_sb() with ns tag put. * sysfs_dir_cachep init and sysfs_inode_init() are separated out into kernfs_init(). kernfs_init() uses only small amount of memory and trying to handle and propagate kernfs_init() failure doesn't make much sense. Use SLAB_PANIC for sysfs_dir_cachep and make sysfs_inode_init() panic on failure. 
After this change, kernfs_init() should be called before sysfs_init(), fs/namespace.c::mnt_init() modified accordingly. Signed-off-by: Tejun Heo Cc: linux-fsdevel@vger.kernel.org Cc: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/inode.c | 5 ++- fs/kernfs/kernfs-internal.h | 2 +- fs/namespace.c | 2 + fs/sysfs/mount.c | 104 ++++++++++++++++++++++++++++++-------------- include/linux/kernfs.h | 28 ++++++++++++ 5 files changed, 106 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 9d4fab49ea2..b4cae6fd717 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -37,9 +37,10 @@ static const struct inode_operations sysfs_inode_operations = { .setxattr = sysfs_setxattr, }; -int __init sysfs_inode_init(void) +void __init sysfs_inode_init(void) { - return bdi_init(&sysfs_backing_dev_info); + if (bdi_init(&sysfs_backing_dev_info)) + panic("failed to init sysfs_backing_dev_info"); } static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 466943d576f..0d949885389 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -144,7 +144,7 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -int sysfs_inode_init(void); +void sysfs_inode_init(void); /* * dir.c diff --git a/fs/namespace.c b/fs/namespace.c index ac2ce8a766e..a511ea003f8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2790,6 +2790,8 @@ void __init mnt_init(void) for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mountpoint_hashtable[u]); + kernfs_init(); + err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f143b203a7e..5384732700b 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -86,18 +86,24 @@ static int sysfs_set_super(struct super_block *sb, void *data) return error; } -static void free_sysfs_super_info(struct sysfs_super_info *info) +/** + * kernfs_super_ns - determine the namespace tag of a kernfs super_block + * @sb: super_block of interest + * + * Return the namespace tag associated with kernfs super_block @sb. 
+ */ +const void *kernfs_super_ns(struct super_block *sb) { - kobj_ns_drop(KOBJ_NS_TYPE_NET, (void *)info->ns); - kfree(info); + struct sysfs_super_info *info = sysfs_info(sb); + + return info->ns; } static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct sysfs_super_info *info; - struct super_block *sb; - int error; + struct dentry *root; + void *ns; if (!(flags & MS_KERNMOUNT)) { if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) @@ -107,16 +113,44 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, return ERR_PTR(-EPERM); } + ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns); + if (IS_ERR(root)) + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + return root; +} + +/** + * kernfs_mount_ns - kernfs mount helper + * @fs_type: file_system_type of the fs being mounted + * @flags: mount flags specified for the mount + * @root: kernfs_root of the hierarchy being mounted + * @ns: optional namespace tag of the mount + * + * This is to be called from each kernfs user's file_system_type->mount() + * implementation, which should pass through the specified @fs_type and + * @flags, and specify the hierarchy and namespace tag to mount via @root + * and @ns, respectively. + * + * The return value can be passed to the vfs layer verbatim. + */ +struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, const void *ns) +{ + struct super_block *sb; + struct sysfs_super_info *info; + int error; + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); - info->root = sysfs_root; - info->ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + info->root = root; + info->ns = ns; sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); if (IS_ERR(sb) || sb->s_fs_info != info) - free_sysfs_super_info(info); + kfree(info); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { @@ -132,6 +166,20 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } static void sysfs_kill_sb(struct super_block *sb) +{ + kernfs_kill_sb(sb); + kobj_ns_drop(KOBJ_NS_TYPE_NET, (void *)kernfs_super_ns(sb)); +} + +/** + * kernfs_kill_sb - kill_sb for kernfs + * @sb: super_block being killed + * + * This can be used directly for file_system_type->kill_sb(). If a kernfs + * user needs extra cleanup, it can implement its own kill_sb() and call + * this function at the end. + */ +void kernfs_kill_sb(struct super_block *sb) { struct sysfs_super_info *info = sysfs_info(sb); struct sysfs_dirent *root_sd = sb->s_root->d_fsdata; @@ -141,7 +189,7 @@ static void sysfs_kill_sb(struct super_block *sb) * so we can't find it, before freeing sysfs_super_info. 
*/ kill_anon_super(sb); - free_sysfs_super_info(info); + kfree(info); kernfs_put(root_sd); } @@ -152,37 +200,29 @@ static struct file_system_type sysfs_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -int __init sysfs_init(void) +void __init kernfs_init(void) { - int err; - sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", sizeof(struct sysfs_dirent), - 0, 0, NULL); - if (!sysfs_dir_cachep) - return -ENOMEM; + 0, SLAB_PANIC, NULL); + sysfs_inode_init(); +} - err = sysfs_inode_init(); - if (err) - goto out_err; +int __init sysfs_init(void) +{ + int err; sysfs_root = kernfs_create_root(NULL); - if (IS_ERR(sysfs_root)) { - err = PTR_ERR(sysfs_root); - goto out_err; - } + if (IS_ERR(sysfs_root)) + return PTR_ERR(sysfs_root); + sysfs_root_sd = sysfs_root->sd; err = register_filesystem(&sysfs_fs_type); - if (err) - goto out_destroy_root; + if (err) { + kernfs_destroy_root(sysfs_root); + return err; + } return 0; - -out_destroy_root: - kernfs_destroy_root(sysfs_root); -out_err: - kmem_cache_destroy(sysfs_dir_cachep); - sysfs_dir_cachep = NULL; - return err; } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index fad8b986800..75fcbe5c9d6 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -18,6 +18,8 @@ struct file; struct iattr; struct seq_file; struct vm_area_struct; +struct super_block; +struct file_system_type; struct sysfs_dirent; @@ -109,6 +111,13 @@ void kernfs_enable_ns(struct sysfs_dirent *sd); int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr); void kernfs_notify(struct sysfs_dirent *sd); +const void *kernfs_super_ns(struct super_block *sb); +struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, const void *ns); +void kernfs_kill_sb(struct super_block *sb); + +void kernfs_init(void); + #else /* CONFIG_SYSFS */ static inline struct sysfs_dirent * @@ -160,6 +169,18 @@ static inline int kernfs_setattr(struct sysfs_dirent *sd, static inline void kernfs_notify(struct sysfs_dirent *sd) { } +static inline const void *kernfs_super_ns(struct super_block *sb) +{ return NULL; } + +static inline struct dentry * +kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, const void *ns) +{ return ERR_PTR(-ENOSYS); } + +static inline void kernfs_kill_sb(struct super_block *sb) { } + +static inline void kernfs_init(void) { } + #endif /* CONFIG_SYSFS */ static inline struct sysfs_dirent * @@ -201,4 +222,11 @@ static inline int kernfs_remove_by_name(struct sysfs_dirent *parent, return kernfs_remove_by_name_ns(parent, name, NULL); } +static inline struct dentry * +kernfs_mount(struct file_system_type *fs_type, int flags, + struct kernfs_root *root) +{ + return kernfs_mount_ns(fs_type, flags, root, NULL); +} + #endif /* __LINUX_KERNFS_H */ -- cgit v1.2.3-70-g09d2 From fa736a951e456b996a76826ba78ff974414c3b55 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:44 -0500 Subject: sysfs, kernfs: move mount core code to fs/kernfs/mount.c Move core mount code to fs/kernfs/mount.c. The respective declarations in fs/sysfs/sysfs.h are moved to fs/kernfs/kernfs-internal.h. This is pure relocation. 
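For context, once the mount helpers live in kernfs, a second kernfs user only needs a thin wrapper around them. The sketch below is hypothetical (the "examplefs" names are not part of this series) and assumes a root previously obtained from kernfs_create_root():

static struct kernfs_root *example_root;        /* from kernfs_create_root() */

static struct dentry *example_mount(struct file_system_type *fs_type,
                                    int flags, const char *dev_name,
                                    void *data)
{
        /* pass fs_type and flags through; no namespace tag used here */
        return kernfs_mount_ns(fs_type, flags, example_root, NULL);
}

static struct file_system_type example_fs_type = {
        .name    = "examplefs",
        .mount   = example_mount,
        .kill_sb = kernfs_kill_sb,      /* usable directly, per its comment */
};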
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/kernfs-internal.h | 22 +++++++ fs/kernfs/mount.c | 156 ++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/mount.c | 152 ------------------------------------------ fs/sysfs/sysfs.h | 18 ----- 4 files changed, 178 insertions(+), 170 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 0d949885389..ced0d6dadc7 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -133,6 +133,28 @@ struct sysfs_addrm_cxt { #include "../sysfs/sysfs.h" +/* + * mount.c + */ +struct sysfs_super_info { + /* + * The root associated with this super_block. Each super_block is + * identified by the root and ns it's associated with. + */ + struct kernfs_root *root; + + /* + * Each sb is associated with one namespace tag, currently the network + * namespace of the task which mounted this sysfs instance. If multiple + * tags become necessary, make the following an array and compare + * sysfs_dirent tag against every entry. + */ + const void *ns; +}; +#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) + +extern struct kmem_cache *sysfs_dir_cachep; + /* * inode.c */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 872e262e516..84c83e24bf2 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -7,3 +7,159 @@ * * This file is released under the GPLv2. */ + +#include +#include +#include +#include +#include +#include + +#include "kernfs-internal.h" + +struct kmem_cache *sysfs_dir_cachep; + +static const struct super_operations sysfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = sysfs_evict_inode, +}; + +static int sysfs_fill_super(struct super_block *sb) +{ + struct sysfs_super_info *info = sysfs_info(sb); + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = SYSFS_MAGIC; + sb->s_op = &sysfs_ops; + sb->s_time_gran = 1; + + /* get root inode, initialize and unlock it */ + mutex_lock(&sysfs_mutex); + inode = sysfs_get_inode(sb, info->root->sd); + mutex_unlock(&sysfs_mutex); + if (!inode) { + pr_debug("sysfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_make_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n", __func__); + return -ENOMEM; + } + kernfs_get(info->root->sd); + root->d_fsdata = info->root->sd; + sb->s_root = root; + sb->s_d_op = &sysfs_dentry_ops; + return 0; +} + +static int sysfs_test_super(struct super_block *sb, void *data) +{ + struct sysfs_super_info *sb_info = sysfs_info(sb); + struct sysfs_super_info *info = data; + + return sb_info->root == info->root && sb_info->ns == info->ns; +} + +static int sysfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + +/** + * kernfs_super_ns - determine the namespace tag of a kernfs super_block + * @sb: super_block of interest + * + * Return the namespace tag associated with kernfs super_block @sb. 
+ */ +const void *kernfs_super_ns(struct super_block *sb) +{ + struct sysfs_super_info *info = sysfs_info(sb); + + return info->ns; +} + +/** + * kernfs_mount_ns - kernfs mount helper + * @fs_type: file_system_type of the fs being mounted + * @flags: mount flags specified for the mount + * @root: kernfs_root of the hierarchy being mounted + * @ns: optional namespace tag of the mount + * + * This is to be called from each kernfs user's file_system_type->mount() + * implementation, which should pass through the specified @fs_type and + * @flags, and specify the hierarchy and namespace tag to mount via @root + * and @ns, respectively. + * + * The return value can be passed to the vfs layer verbatim. + */ +struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, const void *ns) +{ + struct super_block *sb; + struct sysfs_super_info *info; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->root = root; + info->ns = ns; + + sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + kfree(info); + if (IS_ERR(sb)) + return ERR_CAST(sb); + if (!sb->s_root) { + error = sysfs_fill_super(sb); + if (error) { + deactivate_locked_super(sb); + return ERR_PTR(error); + } + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +/** + * kernfs_kill_sb - kill_sb for kernfs + * @sb: super_block being killed + * + * This can be used directly for file_system_type->kill_sb(). If a kernfs + * user needs extra cleanup, it can implement its own kill_sb() and call + * this function at the end. + */ +void kernfs_kill_sb(struct super_block *sb) +{ + struct sysfs_super_info *info = sysfs_info(sb); + struct sysfs_dirent *root_sd = sb->s_root->d_fsdata; + + /* + * Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing sysfs_super_info. 
+ */ + kill_anon_super(sb); + kfree(info); + kernfs_put(root_sd); +} + +void __init kernfs_init(void) +{ + sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", + sizeof(struct sysfs_dirent), + 0, SLAB_PANIC, NULL); + sysfs_inode_init(); +} diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 5384732700b..e7e3aa8e7b7 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -14,91 +14,14 @@ #include #include -#include #include -#include -#include -#include #include #include "sysfs.h" - -struct kmem_cache *sysfs_dir_cachep; - -static const struct super_operations sysfs_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .evict_inode = sysfs_evict_inode, -}; - static struct kernfs_root *sysfs_root; struct sysfs_dirent *sysfs_root_sd; -static int sysfs_fill_super(struct super_block *sb) -{ - struct sysfs_super_info *info = sysfs_info(sb); - struct inode *inode; - struct dentry *root; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; - sb->s_op = &sysfs_ops; - sb->s_time_gran = 1; - - /* get root inode, initialize and unlock it */ - mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(sb, info->root->sd); - mutex_unlock(&sysfs_mutex); - if (!inode) { - pr_debug("sysfs: could not get root inode\n"); - return -ENOMEM; - } - - /* instantiate and link root dentry */ - root = d_make_root(inode); - if (!root) { - pr_debug("%s: could not get root dentry!\n", __func__); - return -ENOMEM; - } - kernfs_get(info->root->sd); - root->d_fsdata = info->root->sd; - sb->s_root = root; - sb->s_d_op = &sysfs_dentry_ops; - return 0; -} - -static int sysfs_test_super(struct super_block *sb, void *data) -{ - struct sysfs_super_info *sb_info = sysfs_info(sb); - struct sysfs_super_info *info = data; - - return sb_info->root == info->root && sb_info->ns == info->ns; -} - -static int sysfs_set_super(struct super_block *sb, void *data) -{ - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; -} - -/** - * kernfs_super_ns - determine the namespace tag of a kernfs super_block - * @sb: super_block of interest - * - * Return the namespace tag associated with kernfs super_block @sb. - */ -const void *kernfs_super_ns(struct super_block *sb) -{ - struct sysfs_super_info *info = sysfs_info(sb); - - return info->ns; -} - static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -120,79 +43,12 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, return root; } -/** - * kernfs_mount_ns - kernfs mount helper - * @fs_type: file_system_type of the fs being mounted - * @flags: mount flags specified for the mount - * @root: kernfs_root of the hierarchy being mounted - * @ns: optional namespace tag of the mount - * - * This is to be called from each kernfs user's file_system_type->mount() - * implementation, which should pass through the specified @fs_type and - * @flags, and specify the hierarchy and namespace tag to mount via @root - * and @ns, respectively. - * - * The return value can be passed to the vfs layer verbatim. 
- */ -struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, const void *ns) -{ - struct super_block *sb; - struct sysfs_super_info *info; - int error; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return ERR_PTR(-ENOMEM); - - info->root = root; - info->ns = ns; - - sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); - if (IS_ERR(sb)) - return ERR_CAST(sb); - if (!sb->s_root) { - error = sysfs_fill_super(sb); - if (error) { - deactivate_locked_super(sb); - return ERR_PTR(error); - } - sb->s_flags |= MS_ACTIVE; - } - - return dget(sb->s_root); -} - static void sysfs_kill_sb(struct super_block *sb) { kernfs_kill_sb(sb); kobj_ns_drop(KOBJ_NS_TYPE_NET, (void *)kernfs_super_ns(sb)); } -/** - * kernfs_kill_sb - kill_sb for kernfs - * @sb: super_block being killed - * - * This can be used directly for file_system_type->kill_sb(). If a kernfs - * user needs extra cleanup, it can implement its own kill_sb() and call - * this function at the end. - */ -void kernfs_kill_sb(struct super_block *sb) -{ - struct sysfs_super_info *info = sysfs_info(sb); - struct sysfs_dirent *root_sd = sb->s_root->d_fsdata; - - /* - * Remove the superblock from fs_supers/s_instances - * so we can't find it, before freeing sysfs_super_info. - */ - kill_anon_super(sb); - kfree(info); - kernfs_put(root_sd); -} - static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, @@ -200,14 +56,6 @@ static struct file_system_type sysfs_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -void __init kernfs_init(void) -{ - sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", - sizeof(struct sysfs_dirent), - 0, SLAB_PANIC, NULL); - sysfs_inode_init(); -} - int __init sysfs_init(void) { int err; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 93b4b68458a..6a82311a50f 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -17,25 +17,7 @@ /* * mount.c */ - -struct sysfs_super_info { - /* - * The root associated with this super_block. Each super_block is - * identified by the root and ns it's associated with. - */ - struct kernfs_root *root; - - /* - * Each sb is associated with one namespace tag, currently the network - * namespace of the task which mounted this sysfs instance. If multiple - * tags become necessary, make the following an array and compare - * sysfs_dirent tag against every entry. - */ - const void *ns; -}; -#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent *sysfs_root_sd; -extern struct kmem_cache *sysfs_dir_cachep; /* * dir.c -- cgit v1.2.3-70-g09d2 From cf9e5a73aaff0204801dd19cb4bd91d32f32026a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 17:18:32 -0500 Subject: sysfs, kernfs: make sysfs_dirent definition public sysfs_dirent includes some information which should be available to kernfs users - the type, flags, name and parent pointer. This patch moves sysfs_dirent definition from kernfs/kernfs-internal.h to include/linux/kernfs.h so that kernfs users can access them. The type part of flags is exported as enum kernfs_node_type, the flags kernfs_node_flag, sysfs_type() and kernfs_enable_ns() are moved to include/linux/kernfs.h and the former is updated to return the enum type. sysfs_dirent->s_parent and ->s_name are marked explicitly as public. This patch doesn't introduce any functional changes. v2: Flags exported too and kernfs_enable_ns() definition moved. 
v3: While moving kernfs_enable_ns() to include/linux/kernfs.h, v1 and v2 put the definition outside CONFIG_SYSFS replacing the dummy implementation with the actual implementation too. Unfortunately, this can lead to oops when !CONFIG_SYSFS because kernfs_enable_ns() may be called on a NULL @sd and now tries to dereference @sd instead of not doing anything. This issue was reported by Yuanhan Liu. Signed-off-by: Tejun Heo Reported-by: Yuanhan Liu Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 15 ------ fs/kernfs/kernfs-internal.h | 85 +------------------------------ include/linux/kernfs.h | 118 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 115 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index eaffa83719d..7c5b5179368 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -895,21 +895,6 @@ int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, return error; } -/** - * kernfs_enable_ns - enable namespace under a directory - * @sd: directory of interest, should be empty - * - * This is to be called right after @sd is created to enable namespace - * under it. All children of @sd must have non-NULL namespace tags and - * only the ones which match the super_block's tag will be visible. - */ -void kernfs_enable_ns(struct sysfs_dirent *sd) -{ - WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); - WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children)); - sd->s_flags |= SYSFS_FLAG_NS; -} - /* Relationship between s_mode and the DT_xxx types */ static inline unsigned char dt_type(struct sysfs_dirent *sd) { diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index ced0d6dadc7..f33a7844e8f 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -13,102 +13,19 @@ #include #include -#include #include #include -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - unsigned long subdirs; - /* children rbtree starts here and goes through sd->s_rb */ - struct rb_root children; - - /* - * The kernfs hierarchy this directory belongs to. This fits - * better directly in sysfs_dirent but is here to save space. - */ - struct kernfs_root *root; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - const struct kernfs_ops *ops; - struct sysfs_open_dirent *open; - loff_t size; -}; - struct sysfs_inode_attrs { struct iattr ia_iattr; void *ia_secdata; u32 ia_secdata_len; }; -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. - * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. 
- */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - struct sysfs_dirent *s_parent; - const char *s_name; - - struct rb_node s_rb; - - union { - struct completion *completion; - struct sysfs_dirent *removed_list; - } u; - - const void *s_ns; /* namespace tag */ - unsigned int s_hash; /* ns + name hash */ - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - }; - - void *priv; - - unsigned short s_flags; - umode_t s_mode; - unsigned int s_ino; - struct sysfs_inode_attrs *s_iattr; -}; - #define SD_DEACTIVATED_BIAS INT_MIN -#define SYSFS_TYPE_MASK 0x000f -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_LINK 0x0004 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) -#define SYSFS_ACTIVE_REF SYSFS_KOBJ_ATTR - -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_REMOVED 0x0010 -#define SYSFS_FLAG_NS 0x0020 -#define SYSFS_FLAG_HAS_SEQ_SHOW 0x0040 -#define SYSFS_FLAG_HAS_MMAP 0x0080 -#define SYSFS_FLAG_LOCKDEP 0x0100 - -static inline unsigned int sysfs_type(struct sysfs_dirent *sd) -{ - return sd->s_flags & SYSFS_TYPE_MASK; -} +/* SYSFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a sysfs_dirent belongs to diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 75fcbe5c9d6..faaf4f29e33 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include struct file; struct iattr; @@ -21,7 +24,92 @@ struct vm_area_struct; struct super_block; struct file_system_type; -struct sysfs_dirent; +struct sysfs_open_dirent; +struct sysfs_inode_attrs; + +enum kernfs_node_type { + SYSFS_DIR = 0x0001, + SYSFS_KOBJ_ATTR = 0x0002, + SYSFS_KOBJ_LINK = 0x0004, +}; + +#define SYSFS_TYPE_MASK 0x000f +#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) +#define SYSFS_ACTIVE_REF SYSFS_KOBJ_ATTR +#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK + +enum kernfs_node_flag { + SYSFS_FLAG_REMOVED = 0x0010, + SYSFS_FLAG_NS = 0x0020, + SYSFS_FLAG_HAS_SEQ_SHOW = 0x0040, + SYSFS_FLAG_HAS_MMAP = 0x0080, + SYSFS_FLAG_LOCKDEP = 0x0100, +}; + +/* type-specific structures for sysfs_dirent->s_* union members */ +struct sysfs_elem_dir { + unsigned long subdirs; + /* children rbtree starts here and goes through sd->s_rb */ + struct rb_root children; + + /* + * The kernfs hierarchy this directory belongs to. This fits + * better directly in sysfs_dirent but is here to save space. + */ + struct kernfs_root *root; +}; + +struct sysfs_elem_symlink { + struct sysfs_dirent *target_sd; +}; + +struct sysfs_elem_attr { + const struct kernfs_ops *ops; + struct sysfs_open_dirent *open; + loff_t size; +}; + +/* + * sysfs_dirent - the building block of sysfs hierarchy. Each and every + * sysfs node is represented by single sysfs_dirent. Most fields are + * private to kernfs and shouldn't be accessed directly by kernfs users. + * + * As long as s_count reference is held, the sysfs_dirent itself is + * accessible. Dereferencing s_elem or any other outer entity + * requires s_active reference. 
+ */ +struct sysfs_dirent { + atomic_t s_count; + atomic_t s_active; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif + /* the following two fields are published */ + struct sysfs_dirent *s_parent; + const char *s_name; + + struct rb_node s_rb; + + union { + struct completion *completion; + struct sysfs_dirent *removed_list; + } u; + + const void *s_ns; /* namespace tag */ + unsigned int s_hash; /* ns + name hash */ + union { + struct sysfs_elem_dir s_dir; + struct sysfs_elem_symlink s_symlink; + struct sysfs_elem_attr s_attr; + }; + + void *priv; + + unsigned short s_flags; + umode_t s_mode; + unsigned int s_ino; + struct sysfs_inode_attrs *s_iattr; +}; struct kernfs_root { /* published fields */ @@ -82,6 +170,26 @@ struct kernfs_ops { #ifdef CONFIG_SYSFS +static inline enum kernfs_node_type sysfs_type(struct sysfs_dirent *sd) +{ + return sd->s_flags & SYSFS_TYPE_MASK; +} + +/** + * kernfs_enable_ns - enable namespace under a directory + * @sd: directory of interest, should be empty + * + * This is to be called right after @sd is created to enable namespace + * under it. All children of @sd must have non-NULL namespace tags and + * only the ones which match the super_block's tag will be visible. + */ +static inline void kernfs_enable_ns(struct sysfs_dirent *sd) +{ + WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children)); + sd->s_flags |= SYSFS_FLAG_NS; +} + struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, const char *name, const void *ns); void kernfs_get(struct sysfs_dirent *sd); @@ -107,7 +215,6 @@ int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, const char *name, const void *ns); int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, const char *new_name, const void *new_ns); -void kernfs_enable_ns(struct sysfs_dirent *sd); int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr); void kernfs_notify(struct sysfs_dirent *sd); @@ -120,6 +227,11 @@ void kernfs_init(void); #else /* CONFIG_SYSFS */ +static inline enum kernfs_node_type sysfs_type(struct sysfs_dirent *sd) +{ return 0; } /* whatever */ + +static inline void kernfs_enable_ns(struct sysfs_dirent *sd) { } + static inline struct sysfs_dirent * kernfs_find_and_get_ns(struct sysfs_dirent *parent, const char *name, const void *ns) @@ -161,8 +273,6 @@ static inline int kernfs_rename_ns(struct sysfs_dirent *sd, const char *new_name, const void *new_ns) { return -ENOSYS; } -static inline void kernfs_enable_ns(struct sysfs_dirent *sd) { } - static inline int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) { return -ENOSYS; } -- cgit v1.2.3-70-g09d2 From ac9bba031001704a2339713cc12148857eccc5e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Nov 2013 17:19:09 -0500 Subject: sysfs, kernfs: implement kernfs_ns_enabled() fs/sysfs/symlink.c::sysfs_delete_link() tests @sd->s_flags for SYSFS_FLAG_NS. Let's add kernfs_ns_enabled() so that sysfs doesn't have to test sysfs_dirent flag directly. This makes things tidier for kernfs proper too. This is purely cosmetic. v2: To avoid possible NULL deref, use noop dummy implementation which always returns false when !CONFIG_SYSFS. 
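As a side illustration of the stub pattern described above, here is a minimal userspace sketch (not kernel code): the same flag-testing helper is a real static inline when the feature is configured in, and a no-op dummy that never dereferences its argument when it is not, so a NULL pointer from a caller cannot oops. The names CONFIG_FEATURE, struct node and feature_ns_enabled are invented for the example.

#include <stdbool.h>
#include <stdio.h>

struct node {
    unsigned short flags;
};

#define NODE_FLAG_NS 0x0020

#ifdef CONFIG_FEATURE                      /* stands in for CONFIG_SYSFS */
static inline bool feature_ns_enabled(struct node *n)
{
    return n->flags & NODE_FLAG_NS;        /* real test: dereferences n */
}
#else
/* Dummy stub: never touches n, so passing NULL is harmless. */
static inline bool feature_ns_enabled(struct node *n)
{
    (void)n;
    return false;
}
#endif

int main(void)
{
    struct node *n = NULL;                 /* what callers may legitimately pass */
#ifdef CONFIG_FEATURE
    struct node real = { .flags = NODE_FLAG_NS };
    n = &real;
#endif
    printf("ns enabled: %d\n", feature_ns_enabled(n));
    return 0;
}

Built without CONFIG_FEATURE this prints 0 and never touches the NULL pointer, which is exactly the property the v1/v2 versions of this change lost.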
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 10 +++++----- fs/kernfs/symlink.c | 2 +- fs/sysfs/symlink.c | 2 +- include/linux/kernfs.h | 14 ++++++++++++++ 4 files changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7c5b5179368..f51e0625e66 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -291,7 +291,7 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; /* The sysfs dirent has been moved to a different namespace */ - if (sd->s_parent && (sd->s_parent->s_flags & SYSFS_FLAG_NS) && + if (sd->s_parent && kernfs_ns_enabled(sd->s_parent) && sysfs_info(dentry->d_sb)->ns != sd->s_ns) goto out_bad; @@ -414,7 +414,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, struct sysfs_dirent *parent_sd) { - bool has_ns = parent_sd->s_flags & SYSFS_FLAG_NS; + bool has_ns = kernfs_ns_enabled(parent_sd); struct sysfs_inode_attrs *ps_iattr; int ret; @@ -535,7 +535,7 @@ static struct sysfs_dirent *kernfs_find_ns(struct sysfs_dirent *parent, const void *ns) { struct rb_node *node = parent->s_dir.children.rb_node; - bool has_ns = parent->s_flags & SYSFS_FLAG_NS; + bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&sysfs_mutex); @@ -685,7 +685,7 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, mutex_lock(&sysfs_mutex); - if (parent_sd->s_flags & SYSFS_FLAG_NS) + if (kernfs_ns_enabled(parent_sd)) ns = sysfs_info(dir->i_sb)->ns; sd = kernfs_find_ns(parent_sd, dentry->d_name.name, ns); @@ -968,7 +968,7 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) return 0; mutex_lock(&sysfs_mutex); - if (parent_sd->s_flags & SYSFS_FLAG_NS) + if (kernfs_ns_enabled(parent_sd)) ns = sysfs_info(dentry->d_sb)->ns; for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 004c1646559..12569a73883 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -35,7 +35,7 @@ struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, if (!sd) return ERR_PTR(-ENOMEM); - if (parent->s_flags & SYSFS_FLAG_NS) + if (kernfs_ns_enabled(parent)) sd->s_ns = target->s_ns; sd->s_symlink.target_sd = target; kernfs_get(target); /* ref owned by symlink */ diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 62f0e014ec4..1b8c9ed8511 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -128,7 +128,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (targ->sd && (kobj->sd->s_flags & SYSFS_FLAG_NS)) + if (targ->sd && kernfs_ns_enabled(kobj->sd)) ns = targ->sd->s_ns; spin_unlock(&sysfs_symlink_target_lock); kernfs_remove_by_name_ns(kobj->sd, name, ns); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index faaf4f29e33..d6554130841 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -190,6 +190,17 @@ static inline void kernfs_enable_ns(struct sysfs_dirent *sd) sd->s_flags |= SYSFS_FLAG_NS; } +/** + * kernfs_ns_enabled - test whether namespace is enabled + * @sd: the node to test + * + * Test whether namespace filtering is enabled for the children of @ns. 
+ */ +static inline bool kernfs_ns_enabled(struct sysfs_dirent *sd) +{ + return sd->s_flags & SYSFS_FLAG_NS; +} + struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, const char *name, const void *ns); void kernfs_get(struct sysfs_dirent *sd); @@ -232,6 +243,9 @@ static inline enum kernfs_node_type sysfs_type(struct sysfs_dirent *sd) static inline void kernfs_enable_ns(struct sysfs_dirent *sd) { } +static inline bool kernfs_ns_enabled(struct sysfs_dirent *sd) +{ return false; } + static inline struct sysfs_dirent * kernfs_find_and_get_ns(struct sysfs_dirent *parent, const char *name, const void *ns) -- cgit v1.2.3-70-g09d2 From bfc5c17337145955b31c22b96a6e07def048471b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:47 -0500 Subject: sysfs, kernfs: remove cross inclusions of internal headers fs/kernfs/kernfs-internal.h needed to include fs/sysfs/sysfs.h because part of kernfs core implementation was living in sysfs. fs/sysfs/sysfs.h needed to include fs/kernfs/kernfs-internal.h because include/linux/kernfs.h didn't expose enough interface. The separation is complete and neither is true anymore. Remove the cross inclusion and make sysfs a proper user of kernfs. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/kernfs-internal.h | 2 -- fs/sysfs/sysfs.h | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index f33a7844e8f..d1ff591c5cf 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -48,8 +48,6 @@ struct sysfs_addrm_cxt { struct sysfs_dirent *removed; }; -#include "../sysfs/sysfs.h" - /* * mount.c */ diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 6a82311a50f..c8e395b4933 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,7 +11,6 @@ #ifndef __SYSFS_INTERNAL_H #define __SYSFS_INTERNAL_H -#include "../kernfs/kernfs-internal.h" #include /* -- cgit v1.2.3-70-g09d2 From bc1e79acc13d70c5bb1b2a47bf0a580e6ae81fb6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Dec 2013 13:24:08 -0800 Subject: block: fixup for generic bio chaining btrfs bits got lost in the rebase Signed-off-by: Kent Overstreet Cc: Chris Mason Signed-off-by: Jens Axboe --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/volumes.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5a10c61adaf..e71039ea66c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1692,7 +1692,7 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kfree(end_io_wq); - bio_endio(bio, error); + bio_endio_nodec(bio, error); } static int cleaner_kthread(void *arg) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2130de0ddc..37972d5db73 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5297,6 +5297,8 @@ static void btrfs_end_bio(struct bio *bio, int err) if (!is_orig_bio) { bio_put(bio); bio = bbio->orig_bio; + } else { + atomic_inc(&bio->bi_remaining); } bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; -- cgit v1.2.3-70-g09d2 From ca6673b02e26356bcb3b86e074eaa59cfa51114b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 3 Dec 2013 17:32:53 -0600 Subject: block: Replace __this_cpu_ptr with raw_cpu_ptr __this_cpu_ptr is being phased out. 
Cc: Jens Axboe Signed-off-by: Christoph Lameter Signed-off-by: Jens Axboe --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 1c04ec66974..651dba10b9c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1312,7 +1312,7 @@ static void bh_lru_install(struct buffer_head *bh) } while (out < BH_LRU_SIZE) bhs[out++] = NULL; - memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); + memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); } bh_lru_unlock(); -- cgit v1.2.3-70-g09d2 From 301d4c9a286bc7dc4fb3cda21131be91a582fa79 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 25 Nov 2013 23:30:26 +0100 Subject: jbd: Revise KERN_EMERG error messages Some of KERN_EMERG printk messages do not really deserve this log level and the one in log_wait_commit() is even rather useless (the journal has been previously aborted and *that* is where we should have been complaining). So make some messages just KERN_ERR and remove the useless message. Signed-off-by: Jan Kara --- fs/jbd/journal.c | 8 +++----- fs/jbd/transaction.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 2d04f9afafd..06fe11e0abf 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid) #ifdef CONFIG_JBD_DEBUG spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } @@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) out_unlock: spin_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -2136,7 +2134,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); #endif jbd_remove_debugfs_entry(); journal_destroy_caches(); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index aa603e017d2..1695ba8334a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -675,7 +675,7 @@ repeat: jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -898,7 +898,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; -- cgit v1.2.3-70-g09d2 From 9a8049affd55a2c857a89faece27b878416fbf91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 17:40:01 -0500 Subject: kernfs: update sysfs_init_inode_attrs() sysfs_init_inode_attrs() is a bit clumsy to use requiring the caller to check whether @sd->s_iattr is already set or not. Rename it to sysfs_inode_attrs(), update it to check whether @sd->s_iattr is already initialized before trying to initialize it and return @sd->s_iattr. This simplifies the callers. While at it, * Rename struct sysfs_inode_attrs pointer variables to "attrs". As kernfs no longer deals with "struct attribute", this isn't confusing and makes it easier to distinguish from struct iattr pointers. 
* A new field will be added to sysfs_inode_attrs. Reindent in preparation. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/inode.c | 60 ++++++++++++++++++++------------------------- fs/kernfs/kernfs-internal.h | 6 ++--- 2 files changed, 29 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index b4cae6fd717..a1f83825afc 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -43,15 +43,17 @@ void __init sysfs_inode_init(void) panic("failed to init sysfs_backing_dev_info"); } -static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +static struct sysfs_inode_attrs *sysfs_inode_attrs(struct sysfs_dirent *sd) { - struct sysfs_inode_attrs *attrs; struct iattr *iattrs; - attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); - if (!attrs) + if (sd->s_iattr) + return sd->s_iattr; + + sd->s_iattr = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!sd->s_iattr) return NULL; - iattrs = &attrs->ia_iattr; + iattrs = &sd->s_iattr->ia_iattr; /* assign default attributes */ iattrs->ia_mode = sd->s_mode; @@ -59,26 +61,20 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) iattrs->ia_gid = GLOBAL_ROOT_GID; iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; - return attrs; + return sd->s_iattr; } static int __kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) { - struct sysfs_inode_attrs *sd_attrs; + struct sysfs_inode_attrs *attrs; struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; - sd_attrs = sd->s_iattr; + attrs = sysfs_inode_attrs(sd); + if (!attrs) + return -ENOMEM; - if (!sd_attrs) { - /* setting attributes for the first time, allocate now */ - sd_attrs = sysfs_init_inode_attrs(sd); - if (!sd_attrs) - return -ENOMEM; - sd->s_iattr = sd_attrs; - } - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; + iattrs = &attrs->ia_iattr; if (ia_valid & ATTR_UID) iattrs->ia_uid = iattr->ia_uid; @@ -143,22 +139,19 @@ out: static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len) { - struct sysfs_inode_attrs *iattrs; + struct sysfs_inode_attrs *attrs; void *old_secdata; size_t old_secdata_len; - if (!sd->s_iattr) { - sd->s_iattr = sysfs_init_inode_attrs(sd); - if (!sd->s_iattr) - return -ENOMEM; - } + attrs = sysfs_inode_attrs(sd); + if (!attrs) + return -ENOMEM; - iattrs = sd->s_iattr; - old_secdata = iattrs->ia_secdata; - old_secdata_len = iattrs->ia_secdata_len; + old_secdata = attrs->ia_secdata; + old_secdata_len = attrs->ia_secdata_len; - iattrs->ia_secdata = *secdata; - iattrs->ia_secdata_len = *secdata_len; + attrs->ia_secdata = *secdata; + attrs->ia_secdata_len = *secdata_len; *secdata = old_secdata; *secdata_len = old_secdata_len; @@ -216,17 +209,16 @@ static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) { - struct sysfs_inode_attrs *iattrs = sd->s_iattr; + struct sysfs_inode_attrs *attrs = sd->s_iattr; inode->i_mode = sd->s_mode; - if (iattrs) { + if (attrs) { /* sysfs_dirent has non-default attributes * get them from persistent copy in sysfs_dirent */ - set_inode_attr(inode, &iattrs->ia_iattr); - security_inode_notifysecctx(inode, - iattrs->ia_secdata, - iattrs->ia_secdata_len); + set_inode_attr(inode, &attrs->ia_iattr); + security_inode_notifysecctx(inode, attrs->ia_secdata, + 
attrs->ia_secdata_len); } if (sysfs_type(sd) == SYSFS_DIR) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index d1ff591c5cf..f25b3548bcc 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -18,9 +18,9 @@ #include struct sysfs_inode_attrs { - struct iattr ia_iattr; - void *ia_secdata; - u32 ia_secdata_len; + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; }; #define SD_DEACTIVATED_BIAS INT_MIN -- cgit v1.2.3-70-g09d2 From 2322392b020badfe49730f1529b9c1a15248c387 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 17:40:02 -0500 Subject: kernfs: implement "trusted.*" xattr support kernfs inherited "security.*" xattr support from sysfs. This patch extends xattr support to "trusted.*" using simple_xattr_*(). As trusted xattrs are restricted to CAP_SYS_ADMIN, simple_xattr_*() which uses kernel memory for storage shouldn't be problematic. Note that the existing "security.*" support doesn't implement get/remove/list and the this patch only implements those ops for "trusted.*". We probably want to extend those ops to include support for "security.*". This patch will allow using kernfs from cgroup which requires "trusted.*" xattr support. Signed-off-by: Tejun Heo Cc: David P. Quigley Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 12 ++++++--- fs/kernfs/inode.c | 63 +++++++++++++++++++++++++++++++++++++++------ fs/kernfs/kernfs-internal.h | 7 +++++ fs/kernfs/symlink.c | 3 +++ 4 files changed, 74 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index f51e0625e66..a441e3be805 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -243,9 +243,12 @@ void kernfs_put(struct sysfs_dirent *sd) kernfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) kfree(sd->s_name); - if (sd->s_iattr && sd->s_iattr->ia_secdata) - security_release_secctx(sd->s_iattr->ia_secdata, - sd->s_iattr->ia_secdata_len); + if (sd->s_iattr) { + if (sd->s_iattr->ia_secdata) + security_release_secctx(sd->s_iattr->ia_secdata, + sd->s_iattr->ia_secdata_len); + simple_xattrs_free(&sd->s_iattr->xattrs); + } kfree(sd->s_iattr); ida_simple_remove(&root->ino_ida, sd->s_ino); kmem_cache_free(sysfs_dir_cachep, sd); @@ -718,6 +721,9 @@ const struct inode_operations sysfs_dir_inode_operations = { .setattr = sysfs_setattr, .getattr = sysfs_getattr, .setxattr = sysfs_setxattr, + .removexattr = sysfs_removexattr, + .getxattr = sysfs_getxattr, + .listxattr = sysfs_listxattr, }; static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index a1f83825afc..18ad431e8c2 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -35,6 +35,9 @@ static const struct inode_operations sysfs_inode_operations = { .setattr = sysfs_setattr, .getattr = sysfs_getattr, .setxattr = sysfs_setxattr, + .removexattr = sysfs_removexattr, + .getxattr = sysfs_getxattr, + .listxattr = sysfs_listxattr, }; void __init sysfs_inode_init(void) @@ -61,6 +64,8 @@ static struct sysfs_inode_attrs *sysfs_inode_attrs(struct sysfs_dirent *sd) iattrs->ia_gid = GLOBAL_ROOT_GID; iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + simple_xattrs_init(&sd->s_iattr->xattrs); + return sd->s_iattr; } @@ -162,23 +167,25 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *attrs; void *secdata; int error; u32 secdata_len = 0; - if 
(!sd) - return -EINVAL; + attrs = sysfs_inode_attrs(sd); + if (!attrs) + return -ENOMEM; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(dentry->d_inode, suffix, value, size, flags); if (error) - goto out; + return error; error = security_inode_getsecctx(dentry->d_inode, &secdata, &secdata_len); if (error) - goto out; + return error; mutex_lock(&sysfs_mutex); error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); @@ -186,10 +193,50 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (secdata) security_release_secctx(secdata, secdata_len); - } else - return -EINVAL; -out: - return error; + return error; + } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + return simple_xattr_set(&attrs->xattrs, name, value, size, + flags); + } + + return -EINVAL; +} + +int sysfs_removexattr(struct dentry *dentry, const char *name) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *attrs; + + attrs = sysfs_inode_attrs(sd); + if (!attrs) + return -ENOMEM; + + return simple_xattr_remove(&attrs->xattrs, name); +} + +ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *attrs; + + attrs = sysfs_inode_attrs(sd); + if (!attrs) + return -ENOMEM; + + return simple_xattr_get(&attrs->xattrs, name, buf, size); +} + +ssize_t sysfs_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *attrs; + + attrs = sysfs_inode_attrs(sd); + if (!attrs) + return -ENOMEM; + + return simple_xattr_list(&attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index f25b3548bcc..910e485b733 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -21,6 +22,8 @@ struct sysfs_inode_attrs { struct iattr ia_iattr; void *ia_secdata; u32 ia_secdata_len; + + struct simple_xattrs xattrs; }; #define SD_DEACTIVATED_BIAS INT_MIN @@ -81,6 +84,10 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); +int sysfs_removexattr(struct dentry *dentry, const char *name); +ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t sysfs_listxattr(struct dentry *dentry, char *buf, size_t size); void sysfs_inode_init(void); /* diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 12569a73883..adf28755b0e 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -140,6 +140,9 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, const struct inode_operations sysfs_symlink_inode_operations = { .setxattr = sysfs_setxattr, + .removexattr = sysfs_removexattr, + .getxattr = sysfs_getxattr, + .listxattr = sysfs_listxattr, .readlink = generic_readlink, .follow_link = sysfs_follow_link, .put_link = sysfs_put_link, -- cgit v1.2.3-70-g09d2 From 3fefdeee92686995ff03e847cbd7bf5ebcd85ff8 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 13 Nov 2013 14:53:45 -0600 Subject: xfs: simplify xfs_setsize_buftarg callchain; remove unused arg The "verbose" argument to 
xfs_setsize_buftarg_flags() has been unused since: ffe37436 xfs: stop using the page cache to back the buffer cache Remove it, and fold the function into xfs_setsize_buftarg() now that there's no need for different types of callers. Fix inconsistent comment spacing while we're at it. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c7f0b77dcb0..ce01c1a17cc 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1602,12 +1602,11 @@ xfs_free_buftarg( kmem_free(btp); } -STATIC int -xfs_setsize_buftarg_flags( +int +xfs_setsize_buftarg( xfs_buftarg_t *btp, unsigned int blocksize, - unsigned int sectorsize, - int verbose) + unsigned int sectorsize) { btp->bt_bsize = blocksize; btp->bt_sshift = ffs(sectorsize) - 1; @@ -1628,26 +1627,17 @@ xfs_setsize_buftarg_flags( } /* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used at this early stage. Play safe. + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. */ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, struct block_device *bdev) { - return xfs_setsize_buftarg_flags(btp, - PAGE_SIZE, bdev_logical_block_size(bdev), 0); -} - -int -xfs_setsize_buftarg( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize) -{ - return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); + return xfs_setsize_buftarg(btp, PAGE_SIZE, + bdev_logical_block_size(bdev)); } xfs_buftarg_t * -- cgit v1.2.3-70-g09d2 From f23007784570278ca5963c35d5b3847d710ed695 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Nov 2013 09:38:48 -0800 Subject: xfs: remove unused FI_ flags Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Ben Myers --- fs/xfs/xfs_vnode.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index 3e8e797c6d1..e8a77383c0d 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -34,15 +34,6 @@ struct attrlist_cursor_kern; { IO_ISDIRECT, "DIRECT" }, \ { IO_INVIS, "INVIS"} -/* - * Flush/Invalidate options for vop_toss/flush/flushinval_pages. - */ -#define FI_NONE 0 /* none */ -#define FI_REMAPF 1 /* Do a remapf prior to the operation */ -#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. - Prevent VM access to the pages until - the operation completes. */ - /* * Some useful predicates. */ -- cgit v1.2.3-70-g09d2 From 071c529eb672648ee8ca3f90944bcbcc730b4c06 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 31 Oct 2013 21:00:10 +0300 Subject: xfs: underflow bug in xfs_attrlist_by_handle() If we allocate less than sizeof(struct attrlist) then we end up corrupting memory or doing a ZERO_PTR_SIZE dereference. This can only be triggered with CAP_SYS_ADMIN. 
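For context, the check being added below is the classic lower-bound validation of a user-supplied buffer length: the allocation must be at least as large as the fixed header the code writes into it, as well as not absurdly large. A rough, self-contained userspace sketch of the idiom (struct reply_header, LIST_MAX and handle_request are invented names, not the XFS code):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct reply_header {          /* stands in for struct attrlist */
    uint32_t count;
    uint32_t used;
};

#define LIST_MAX 65536         /* stands in for XATTR_LIST_MAX */

static int handle_request(uint32_t buflen)
{
    /* Both bounds matter: too small corrupts memory past the allocation
     * (or trips a zero-size allocation), too large lets users pin memory. */
    if (buflen < sizeof(struct reply_header) || buflen > LIST_MAX)
        return -EINVAL;

    struct reply_header *buf = malloc(buflen);
    if (!buf)
        return -ENOMEM;
    memset(buf, 0, sizeof(*buf));   /* safe: buflen >= sizeof(*buf) */
    /* ... fill entries after the header, up to buflen ... */
    free(buf);
    return 0;
}

int main(void)
{
    printf("buflen=4   -> %d\n", handle_request(4));    /* rejected, -EINVAL */
    printf("buflen=256 -> %d\n", handle_request(256));  /* accepted */
    return 0;
}

The original code only had the upper bound, which is why a tiny buflen slipped through to the allocator.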
Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_ioctl.c | 3 ++- fs/xfs/xfs_ioctl32.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4d613401a5e..33ad9a77791 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -442,7 +442,8 @@ xfs_attrlist_by_handle( return -XFS_ERROR(EPERM); if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index e8fb1231db8..a7992f8de9d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -356,7 +356,8 @@ xfs_compat_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* -- cgit v1.2.3-70-g09d2 From 10f73d27c8e977fb6fbd6058517069be830c6c9a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Nov 2013 03:45:36 -0800 Subject: xfs: fix the comment explaining xfs_trans_dqlockedjoin Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Ben Myers --- fs/xfs/xfs_trans_dquot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index cd2a10e15d3..41172861e85 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -295,8 +295,8 @@ xfs_trans_mod_dquot( /* * Given an array of dqtrx structures, lock all the dquots associated and join * them to the transaction, provided they have been modified. We know that the - * highest number of dquots of one type - usr, grp OR prj - involved in a - * transaction is 2 so we don't need to make this very generic. + * highest number of dquots of one type - usr, grp and prj - involved in a + * transaction is 3 so we don't need to make this very generic. */ STATIC void xfs_trans_dqlockedjoin( -- cgit v1.2.3-70-g09d2 From f9fd0135610084abef6867d984e9951c3099950d Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 20 Nov 2013 16:08:53 +0800 Subject: xfs: don't perform discard if the given range length is less than block size For discard operation, we should return EINVAL if the given range length is less than a block size, otherwise it will go through the file system to discard data blocks as the end range might be evaluated to -1, e.g, # fstrim -v -o 0 -l 100 /xfs7 /xfs7: 9811378176 bytes were trimmed This issue can be triggered via xfstests/generic/288. Also, it seems to get the request queue pointer via bdev_get_queue() instead of the hard code pointer dereference is not a bad thing. 
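For reference, the interface being tightened here is the FITRIM ioctl. The following hedged userspace sketch shows roughly what fstrim(8) does; the mount-point path is only an example, and running it needs root plus a filesystem and device that support discard. On a kernel with this fix, a len smaller than the block size should come back as EINVAL instead of trimming far more than asked.

#include <fcntl.h>
#include <linux/fs.h>      /* struct fstrim_range, FITRIM */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    struct fstrim_range range = {
        .start  = 0,
        .len    = 100,     /* deliberately smaller than any block size */
        .minlen = 0,
    };

    int fd = open("/mnt/xfs", O_RDONLY);   /* example mount point */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    if (ioctl(fd, FITRIM, &range) < 0)
        perror("FITRIM");  /* expect EINVAL on a patched kernel */
    else
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);

    close(fd);
    return 0;
}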
Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_discard.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8367d6dc18c..4f11ef01113 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -157,7 +157,7 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; xfs_daddr_t start, end, minlen; @@ -180,7 +180,8 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) return -XFS_ERROR(EINVAL); start = BTOBB(range.start); -- cgit v1.2.3-70-g09d2 From b7d961b35b3ab69609aeea93f870269cb6e7ba4d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Nov 2013 15:41:06 +1100 Subject: xfs: growfs overruns AGFL buffer on V4 filesystems This loop in xfs_growfs_data_private() is incorrect for V4 superblocks filesystems: for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); For V4 filesystems, we don't have a agfl header structure, and so XFS_AGFL_SIZE() returns an entire sector's worth of entries, which we then index from an offset into the sector. Hence: buffer overrun. This problem was introduced in 3.10 by commit 77c95bba ("xfs: add CRC checks to the AGFL") which changed the AGFL structure but failed to update the growfs code to handle the different structures. Fix it by using the correct offset into the buffer for both V4 and V5 filesystems. Cc: Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a6e54b3319b..02fb943cbf2 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -220,6 +220,8 @@ xfs_growfs_data_private( */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* * AG freespace header block */ @@ -279,8 +281,10 @@ xfs_growfs_data_private( agfl->agfl_seqno = cpu_to_be32(agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) - agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); xfs_buf_relse(bp); -- cgit v1.2.3-70-g09d2 From dff6efc326a4d5f305797d4a6bba14f374fdd633 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2013 07:17:07 -0800 Subject: fs: fix iversion handling Currently notify_change directly updates i_version for size updates, which not only is counter to how all other fields are updated through struct iattr, but also breaks XFS, which need inode updates to happen under its own lock, and synchronized to the structure that gets written to the log. Remove the update in the common code, and it to btrfs and ext4, XFS already does a proper updaste internally and currently gets a double update with the existing code. IMHO this is 3.13 and -stable material and should go in through the XFS tree. 
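As a toy model of the locking argument (invented types, not the VFS or XFS code): the version counter has to be bumped together with the size change, under the filesystem's own lock that orders what goes to the log, which is why the generic notify_change() site is the wrong place for it.

#include <pthread.h>
#include <stdio.h>

struct toy_inode {
    pthread_mutex_t ilock;        /* plays the role of the fs-internal lock */
    long long size;
    unsigned long long version;
};

static void fs_setsize(struct toy_inode *ip, long long newsize)
{
    pthread_mutex_lock(&ip->ilock);
    if (newsize != ip->size) {
        ip->size = newsize;
        ip->version++;            /* version bump and size change stay together */
    }
    pthread_mutex_unlock(&ip->ilock);
}

int main(void)
{
    struct toy_inode ip = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
    fs_setsize(&ip, 4096);
    fs_setsize(&ip, 4096);        /* no change, no version bump */
    printf("size=%lld version=%llu\n", ip.size, ip.version);
    return 0;
}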
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Dilger Acked-by: Jan Kara Reviewed-by: Dave Chinner Signed-off-by: Chris Mason Signed-off-by: Ben Myers --- fs/attr.c | 5 ----- fs/btrfs/inode.c | 8 ++++++-- fs/ext4/inode.c | 4 ++++ 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 267968d9467..5d4e59d56e8 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -202,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } - if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) { - if (attr->ia_size != inode->i_size) - inode_inc_iversion(inode); - } - if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d03..471a4f7f404 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4354,8 +4354,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + } if (newsize > oldsize) { truncate_pagecache(inode, newsize); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 07576347411..7f0e15ebacd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4598,6 +4598,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); + if (S_ISREG(inode->i_mode) && (attr->ia_size < inode->i_size)) { if (ext4_should_order_data(inode)) { -- cgit v1.2.3-70-g09d2 From 2a84108fe275f95fbe838b1c92b7c45258dcae5c Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Wed, 2 Oct 2013 07:51:12 -0500 Subject: xfs: free the list of recovery items on error Recovery builds a list of items on the transaction's r_itemq head. Normally these items are committed and freed. But in the event of a recovery error, these allocations are leaked. If the error occurs during item reordering, then reconstruct the r_itemq list before deleting the list to avoid leaking the entries that were on one of the temporary lists. Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_log_recover.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b6b669df40f..07ab52ca8ab 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1651,6 +1651,7 @@ xlog_recover_reorder_trans( int pass) { xlog_recover_item_t *item, *n; + int error = 0; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); @@ -1692,9 +1693,17 @@ xlog_recover_reorder_trans( "%s: unrecognized type of log operation", __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. 
+ */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } } +out: ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); @@ -1704,7 +1713,7 @@ xlog_recover_reorder_trans( list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) list_splice_tail(&cancel_list, &trans->r_itemq); - return 0; + return error; } /* @@ -3608,8 +3617,10 @@ xlog_recover_process_data( error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } dp += be32_to_cpu(ohead->oh_len); num_logops--; -- cgit v1.2.3-70-g09d2 From ef701600fd26cace9d513ee174688a2b83832126 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Sat, 5 Oct 2013 21:48:25 -0500 Subject: xfs: fix memory leak in xfs_dir2_node_removename Fix the leak of kernel memory in xfs_dir2_node_removename() when xfs_dir2_leafn_remove() returns an error code. Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_node.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 56369d4509d..48c7d18f68c 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -2067,12 +2067,12 @@ xfs_dir2_node_lookup( */ int /* error */ xfs_dir2_node_removename( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { - xfs_da_state_blk_t *blk; /* leaf block */ + struct xfs_da_state_blk *blk; /* leaf block */ int error; /* error return value */ int rval; /* operation return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_da_state *state; /* btree cursor */ trace_xfs_dir2_node_removename(args); @@ -2084,19 +2084,18 @@ xfs_dir2_node_removename( state->mp = args->dp->i_mount; state->blocksize = state->mp->m_dirblksize; state->node_ents = state->mp->m_dir_node_ents; - /* - * Look up the entry we're deleting, set up the cursor. - */ + + /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); if (error) - rval = error; - /* - * Didn't find it, upper layer screwed up. - */ + goto out_free; + + /* Didn't find it, upper layer screwed up. */ if (rval != EEXIST) { - xfs_da_state_free(state); - return rval; + error = rval; + goto out_free; } + blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(state->extravalid); @@ -2107,7 +2106,7 @@ xfs_dir2_node_removename( error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); if (error) - return error; + goto out_free; /* * Fix the hash values up the btree. */ @@ -2122,6 +2121,7 @@ xfs_dir2_node_removename( */ if (!error) error = xfs_dir2_node_to_leaf(state); +out_free: xfs_da_state_free(state); return error; } -- cgit v1.2.3-70-g09d2 From c7848f69ec4a8c03732cde5c949bd2aa711a9f4b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 4 Dec 2013 17:39:23 -0500 Subject: NFSv4: OPEN must handle the NFS4ERR_IO return code correctly decode_op_hdr() cannot distinguish between an XDR decoding error and the perfectly valid errorcode NFS4ERR_IO. This is normally not a problem, but for the particular case of OPEN, we need to be able to increment the NFSv4 open sequence id when the server returns a valid response. 
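A small self-contained sketch of the decode split used below (names and byte layout are invented; the real code parses XDR): the boolean reports whether the reply was well formed, and the out-parameter carries the NFS status, so the caller can advance the open sequence id for every well-formed reply, including ones that carry a valid error such as NFS4ERR_IO.

#include <stdbool.h>
#include <stdio.h>

static bool decode_header(const unsigned char *buf, unsigned len,
                          unsigned expected_op, int *status)
{
    if (len < 2) {                 /* truncated reply: not even parseable */
        *status = -5;              /* -EIO */
        return false;
    }
    if (buf[0] != expected_op) {   /* server answered a different operation */
        *status = -121;            /* -EREMOTEIO */
        return false;
    }
    *status = buf[1] ? -(int)buf[1] : 0;   /* well formed, possibly an error */
    return true;
}

int main(void)
{
    const unsigned char ok[]   = { 18, 0 };   /* op 18, NFS_OK        */
    const unsigned char eio[]  = { 18, 5 };   /* op 18, server error  */
    const unsigned char junk[] = { 99, 0 };   /* wrong opnum          */
    int status;

    /* Only the first two replies (true) should advance the seqid,
     * even though the second one carries an error status. */
    printf("%d %d\n", decode_header(ok,   2, 18, &status), status);
    printf("%d %d\n", decode_header(eio,  2, 18, &status), status);
    printf("%d %d\n", decode_header(junk, 2, 18, &status), status);
    return 0;
}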
Reported-by: J Bruce Fields Link: http://lkml.kernel.org/r/20131204210356.GA19452@fieldses.org Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4xdr.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5be2868c02f..8c21d69a9dc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3097,7 +3097,8 @@ out_overflow: return -EIO; } -static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, + int *nfs_retval) { __be32 *p; uint32_t opnum; @@ -3107,19 +3108,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) if (unlikely(!p)) goto out_overflow; opnum = be32_to_cpup(p++); - if (opnum != expected) { - dprintk("nfs: Server returned operation" - " %d but we issued a request for %d\n", - opnum, expected); - return -EIO; - } + if (unlikely(opnum != expected)) + goto out_bad_operation; nfserr = be32_to_cpup(p); - if (nfserr != NFS_OK) - return nfs4_stat_to_errno(nfserr); - return 0; + if (nfserr == NFS_OK) + *nfs_retval = 0; + else + *nfs_retval = nfs4_stat_to_errno(nfserr); + return true; +out_bad_operation: + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + *nfs_retval = -EREMOTEIO; + return false; out_overflow: print_overflow_msg(__func__, xdr); - return -EIO; + *nfs_retval = -EIO; + return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + int retval; + + __decode_op_hdr(xdr, expected, &retval); + return retval; } /* Dummy routine */ @@ -5001,11 +5015,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) uint32_t savewords, bmlen, i; int status; - status = decode_op_hdr(xdr, OP_OPEN); - if (status != -EIO) - nfs_increment_open_seqid(status, res->seqid); - if (!status) - status = decode_stateid(xdr, &res->stateid); + if (!__decode_op_hdr(xdr, OP_OPEN, &status)) + return status; + nfs_increment_open_seqid(status, res->seqid); + if (status) + return status; + status = decode_stateid(xdr, &res->stateid); if (unlikely(status)) return status; -- cgit v1.2.3-70-g09d2 From 6aa23d76a7b549521a03b63b6d5b7880ea87eab7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 14 Nov 2013 07:25:19 -0500 Subject: nfs: check if gssd is running before attempting to use krb5i auth in SETCLIENTID call Currently, the client will attempt to use krb5i in the SETCLIENTID call even if rpc.gssd isn't running. When that fails, it'll then fall back to RPC_AUTH_UNIX. This introduced a delay when mounting if rpc.gssd isn't running, and causes warning messages to pop up in the ring buffer. Check to see if rpc.gssd is running before even attempting to use krb5i auth, and just silently skip trying to do so if it isn't. In the event that the admin is actually trying to mount with krb5*, it will still fail at a later stage of the mount attempt. 
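The shape of the change, as a hedged userspace sketch (the probe path below is only a stand-in for the kernel's gssd_running() check of the rpc_pipefs gssd pipe; flavor names are illustrative): try the stronger flavor only when its helper daemon can answer, otherwise fall back silently instead of timing out and warning.

#include <stdio.h>
#include <unistd.h>

enum auth_flavor { AUTH_UNIX_FLAVOR, AUTH_KRB5I_FLAVOR };

/* Stand-in for gssd_running(): the kernel asks rpc_pipefs whether gssd has
 * opened its pipe; here we merely probe a conventional path. */
static int gssd_seems_to_run(void)
{
    return access("/var/lib/nfs/rpc_pipefs/gssd", F_OK) == 0;
}

static enum auth_flavor pick_flavor(void)
{
    if (gssd_seems_to_run())
        return AUTH_KRB5I_FLAVOR;
    return AUTH_UNIX_FLAVOR;     /* silent fallback, no delay, no warning */
}

int main(void)
{
    printf("using %s\n",
           pick_flavor() == AUTH_KRB5I_FLAVOR ? "krb5i" : "auth_unix");
    return 0;
}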
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index b4a160a405c..c1b7a80b670 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" #include "callback.h" #include "delegation.h" @@ -370,7 +371,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + + error = -EINVAL; + if (gssd_running(clp->cl_net)) + error = nfs_create_rpc_client(clp, timeparms, + RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) -- cgit v1.2.3-70-g09d2 From c61a9e39f637373929a110ad2a5922a2e8b00f4c Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 22 Nov 2013 14:04:00 +0800 Subject: xfs: make quota metadata truncation behavior consistent to user space In xfs_qm_scall_trunc_qfiles(), we ignore the error if failed to remove the users quota metadata and proceed to remove groups and projects if they are being there. However, in user space, the remove operation will break and return if failed to remove any kind of quota. Also for v5 super block, we can enabled both group and project quota at the same time, in this case the current error handling will cover the group error with projects but they might failed due to different reasons. It seems we'd better the error handling consistent to the user space and don't trying to remove another kind of quota metadata if the previous operation is failed. Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_qm_syscalls.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 437c9198031..3daf5ea1eb8 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -278,7 +278,7 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error = 0, error2 = 0; + int error; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", @@ -286,14 +286,20 @@ xfs_qm_scall_trunc_qfiles( return XFS_ERROR(EINVAL); } - if (flags & XFS_DQ_USER) + if (flags & XFS_DQ_USER) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); - if (flags & XFS_DQ_GROUP) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_GROUP) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } if (flags & XFS_DQ_PROJ) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); - return error ? error : error2; + return error; } /* -- cgit v1.2.3-70-g09d2 From afbd123db4e72d5fe44db235976af64a22b32976 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Sat, 23 Nov 2013 00:15:43 +0800 Subject: xfs: integrate xfs_quota_priv header file to xfs_qm The xfs_quota_priv header file is only included by xfs_qm header and there is no much users for its contents, hence we can move those stuff to xfs_qm header file and kill it. This patch also remove an unused macro DQFLAGTO_TYPESTR. 
Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.h | 18 +++++++++++++++++- fs/xfs/xfs_quota_priv.h | 42 ------------------------------------------ 2 files changed, 17 insertions(+), 43 deletions(-) delete mode 100644 fs/xfs/xfs_quota_priv.h (limited to 'fs') diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index a788b66a5cb..797fd463627 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -20,12 +20,28 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" -#include "xfs_quota_priv.h" struct xfs_inode; extern struct kmem_zone *xfs_qm_dqtrxzone; +/* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + /* * This defines the unit of allocation of dquots. * Currently, it is just one file system block, and a 4K blk contains 30 diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h deleted file mode 100644 index 6d86219d93d..00000000000 --- a/fs/xfs/xfs_quota_priv.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QUOTA_PRIV_H__ -#define __XFS_QUOTA_PRIV_H__ - -/* - * Number of bmaps that we ask from bmapi when doing a quotacheck. - * We make this restriction to keep the memory usage to a minimum. - */ -#define XFS_DQITER_MAP_SIZE 10 - -#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) - -#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ - (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ - (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) - -#endif /* __XFS_QUOTA_PRIV_H__ */ -- cgit v1.2.3-70-g09d2 From 37eb9706ebf5b99d14c6086cdeef2c2f73f9c9fb Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 26 Nov 2013 21:38:54 +0800 Subject: xfs: fix false assertion at xfs_qm_vop_create_dqattach After the previous fix, there still has another ASSERT failure if turning off any type of quota while fsstress is running at the same time. Backtrace in this case: [ 50.867897] XFS: Assertion failed: XFS_IS_GQUOTA_ON(mp), file: fs/xfs/xfs_qm.c, line: 2118 [ 50.867924] ------------[ cut here ]------------ ... 
[ 50.867957] Kernel BUG at ffffffffa0b55a32 [verbose debug info unavailable] [ 50.867999] invalid opcode: 0000 [#1] SMP [ 50.869407] Call Trace: [ 50.869446] [] xfs_qm_vop_create_dqattach+0x19a/0x2d0 [xfs] [ 50.869512] [] xfs_create+0x5c5/0x6a0 [xfs] [ 50.869564] [] xfs_vn_mknod+0xac/0x1d0 [xfs] [ 50.869615] [] xfs_vn_mkdir+0x16/0x20 [xfs] [ 50.869655] [] vfs_mkdir+0x95/0x130 [ 50.869689] [] SyS_mkdirat+0xaa/0xe0 [ 50.869723] [] SyS_mkdir+0x19/0x20 [ 50.869757] [] system_call_fastpath+0x1a/0x1f [ 50.869793] Code: 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 [ 50.870003] RIP [] assfail+0x22/0x30 [xfs] [ 50.870050] RSP [ 50.879251] ---[ end trace c93a2b342341c65b ]--- We're hitting the ASSERT(XFS_IS_*QUOTA_ON(mp)) in xfs_qm_vop_create_dqattach(), however the assertion itself is not right IMHO. While performing quota off, we firstly clear the XFS_*QUOTA_ACTIVE bit(s) from struct xfs_mount without taking any special locks, see xfs_qm_scall_quotaoff(). Hence there is no guarantee that the desired quota is still active. Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 14a4996cfec..588e4909c58 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -2082,24 +2082,21 @@ xfs_qm_vop_create_dqattach( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if (udqp) { + if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (gdqp) { + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(XFS_IS_GQUOTA_ON(mp)); ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (pdqp) { + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(XFS_IS_PQUOTA_ON(mp)); ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); -- cgit v1.2.3-70-g09d2 From 0c3d88dfcedf92b28d759182ecb33f2808dc3e59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Nov 2013 05:10:40 -0800 Subject: xfs: tiny xfs_setattr_mode cleanup Remove the pointless tp argument, and properly align the local variable declarations. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Ben Myers --- fs/xfs/xfs_iops.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 27e0e544e96..e7f4e4f4eab 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -459,14 +459,12 @@ xfs_vn_getattr( static void xfs_setattr_mode( - struct xfs_trans *tp, struct xfs_inode *ip, struct iattr *iattr) { - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; - ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ip->i_d.di_mode &= S_IFMT; @@ -633,7 +631,7 @@ xfs_setattr_nonsize( * Change file access modes. */ if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); + xfs_setattr_mode(ip, iattr); /* * Change file access or modified times. @@ -871,7 +869,7 @@ xfs_setattr_size( * Change file access modes. 
*/ if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); + xfs_setattr_mode(ip, iattr); if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; -- cgit v1.2.3-70-g09d2 From c91c46c12768daac8486dff0f74bc52c2ec974cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Nov 2013 05:10:52 -0800 Subject: xfs: add xfs_setattr_time Split out a xfs_setattr_time helper to share code between truncate and regular setattr similar to xfs_setattr_mode. I might also have another caller growing for this in the near future. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Ben Myers --- fs/xfs/xfs_iops.c | 66 +++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e7f4e4f4eab..5762282895a 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -474,6 +474,32 @@ xfs_setattr_mode( inode->i_mode |= mode & ~S_IFMT; } +static void +xfs_setattr_time( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (iattr->ia_valid & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + } +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -627,30 +653,10 @@ xfs_setattr_nonsize( } } - /* - * Change file access modes. - */ if (mask & ATTR_MODE) xfs_setattr_mode(ip, iattr); - - /* - * Change file access or modified times. - */ - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - } - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -865,22 +871,10 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - /* - * Change file access modes. - */ if (mask & ATTR_MODE) xfs_setattr_mode(ip, iattr); - - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -- cgit v1.2.3-70-g09d2 From 21d71662f895462abaa3054d504af55a306f42ba Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 7 Dec 2013 14:07:36 +0800 Subject: sysfs, kernfs: remove duplicated include from file.c Remove duplicated include. 
Signed-off-by: Wei Yongjun Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 990c97fa704..4a5863b79de 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "kernfs-internal.h" -- cgit v1.2.3-70-g09d2 From 5a01dd54f4a7fb513062070c5acef20d13cad980 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 26 Nov 2013 21:38:34 +0800 Subject: xfs: fix assertion failure at xfs_setattr_nonsize For CRC enabled v5 super block, change a file's ownership can simply trigger an ASSERT failure at xfs_setattr_nonsize() if both group and project quota are enabled, i.e, [ 305.337609] XFS: Assertion failed: !XFS_IS_PQUOTA_ON(mp), file: fs/xfs/xfs_iops.c, line: 621 [ 305.339250] Kernel BUG at ffffffffa0a7fa32 [verbose debug info unavailable] [ 305.383939] Call Trace: [ 305.385536] [] xfs_setattr_nonsize+0x69a/0x720 [xfs] [ 305.387142] [] xfs_vn_setattr+0x29/0x70 [xfs] [ 305.388727] [] notify_change+0x1a8/0x350 [ 305.390298] [] chown_common+0xfd/0x110 [ 305.391868] [] SyS_fchownat+0xaf/0x110 [ 305.393440] [] SyS_lchown+0x20/0x30 [ 305.394995] [] system_call_fastpath+0x1a/0x1f [ 305.399870] RIP [] assfail+0x22/0x30 [xfs] This fix adjust the assertion to check if the super block support both quota inodes or not. Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_iops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 5762282895a..0ce1d759156 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -642,7 +642,8 @@ xfs_setattr_nonsize( } if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, -- cgit v1.2.3-70-g09d2 From df8052e7dae00bde6f21b40b6e3e1099770f3afc Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 26 Nov 2013 21:38:49 +0800 Subject: xfs: fix infinite loop by detaching the group/project hints from user dquot xfs_quota(8) will hang up if trying to turn group/project quota off before the user quota is off, this could be 100% reproduced by: # mount -ouquota,gquota /dev/sda7 /xfs # mkdir /xfs/test # xfs_quota -xc 'off -g' /xfs <-- hangs up # echo w > /proc/sysrq-trigger # dmesg SysRq : Show Blocked State task PC stack pid father xfs_quota D 0000000000000000 0 27574 2551 0x00000000 [snip] Call Trace: [] schedule+0xad/0xc0 [] schedule_timeout+0x35e/0x3c0 [] ? mark_held_locks+0x176/0x1c0 [] ? call_timer_fn+0x2c0/0x2c0 [] ? xfs_qm_shrink_count+0x30/0x30 [xfs] [] schedule_timeout_uninterruptible+0x26/0x30 [] xfs_qm_dquot_walk+0x235/0x260 [xfs] [] ? xfs_perag_get+0x1d8/0x2d0 [xfs] [] ? xfs_perag_get+0x5/0x2d0 [xfs] [] ? xfs_inode_ag_iterator+0xae/0xf0 [xfs] [] ? xfs_trans_free_dqinfo+0x50/0x50 [xfs] [] ? xfs_inode_ag_iterator+0xcf/0xf0 [xfs] [] xfs_qm_dqpurge_all+0x66/0xb0 [xfs] [] xfs_qm_scall_quotaoff+0x20a/0x5f0 [xfs] [] xfs_fs_set_xstate+0x136/0x180 [xfs] [] do_quotactl+0x53a/0x6b0 [] ? iput+0x5b/0x90 [] SyS_quotactl+0x167/0x1d0 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] system_call_fastpath+0x16/0x1b It's fine if we turn user quota off at first, then turn off other kind of quotas if they are enabled since the group/project dquot refcount is decreased to zero once the user quota if off. 
Otherwise, those dquots refcount is non-zero due to the user dquot might refer to them as hint(s). Hence, above operation cause an infinite loop at xfs_qm_dquot_walk() while trying to purge dquot cache. This problem has been around since Linux 3.4, it was introduced by: [ b84a3a9675 xfs: remove the per-filesystem list of dquots ] Originally we will release the group dquot pointers because the user dquots maybe carrying around as a hint via xfs_qm_detach_gdquots(). However, with above change, there is no such work to be done before purging group/project dquot cache. In order to solve this problem, this patch introduces a special routine xfs_qm_dqpurge_hints(), and it would release the group/project dquot pointers the user dquots maybe carrying around as a hint, and then it will proceed to purge the user dquot cache if requested. Cc: stable@vger.kernel.org Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.c | 71 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 588e4909c58..dd88f0e27bd 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -134,8 +134,6 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { @@ -143,21 +141,6 @@ xfs_qm_dqpurge( return EAGAIN; } - /* - * If this quota has a hint attached, prepare for releasing it now. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - - pdqp = dqp->q_pdquot; - if (pdqp) { - xfs_dqlock(pdqp); - dqp->q_pdquot = NULL; - } - dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); @@ -206,11 +189,47 @@ xfs_qm_dqpurge( XFS_STATS_DEC(xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); + return 0; +} + +/* + * Release the group or project dquot pointers the user dquots maybe carrying + * around as a hint, and proceed to purge the user dquot cache if requested. +*/ +STATIC int +xfs_qm_dqpurge_hints( + struct xfs_dquot *dqp, + void *data) +{ + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + uint flags = *((uint *)data); + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + return EAGAIN; + } + + /* If this quota has a hint attached, prepare for releasing it now */ + gdqp = dqp->q_gdquot; + if (gdqp) + dqp->q_gdquot = NULL; + + pdqp = dqp->q_pdquot; + if (pdqp) + dqp->q_pdquot = NULL; + + xfs_dqunlock(dqp); if (gdqp) - xfs_qm_dqput(gdqp); + xfs_qm_dqrele(gdqp); if (pdqp) - xfs_qm_dqput(pdqp); + xfs_qm_dqrele(pdqp); + + if (flags & XFS_QMOPT_UQUOTA) + return xfs_qm_dqpurge(dqp, NULL); + return 0; } @@ -222,8 +241,18 @@ xfs_qm_dqpurge_all( struct xfs_mount *mp, uint flags) { - if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + /* + * We have to release group/project dquot hint(s) from the user dquot + * at first if they are there, otherwise we would run into an infinite + * loop while walking through radix tree to purge other type of dquots + * since their refcount is not zero if the user dquot refers to them + * as hint. + * + * Call the special xfs_qm_dqpurge_hints() will end up go through the + * general xfs_qm_dqpurge() against user dquot cache if requested. 
+ */ + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags); + if (flags & XFS_QMOPT_GQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) -- cgit v1.2.3-70-g09d2 From 6e14b46b91fee8a049b0940333ce13a820beaaa5 Mon Sep 17 00:00:00 2001 From: Albert Fluegel Date: Mon, 18 Nov 2013 12:18:01 -0500 Subject: nfsd: don't return high mode bits The Linux NFS server replies among other things to a "Check access permission" the following: NFS: File type = 2 (Directory) NFS: Mode = 040755 A netapp server replies here: NFS: File type = 2 (Directory) NFS: Mode = 0755 The RFC 1813 i read: fattr3 struct fattr3 { ftype3 type; mode3 mode; uint32 nlink; ... For the mode bits only the lowest 9 are defined in the RFC As far as I can tell, knfsd has always done this, so apparently it's harmless. Nevertheless, it appears to be wrong. Note this is already correct in the NFSv4 case, only v2 and v3 need fixing. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3xdr.c | 2 +- fs/nfsd/nfsxdr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 14d9ecb96cf..1ee6baec5fa 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -168,7 +168,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); - *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 9c769a47ac5..b17d93214d0 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, type = (stat->mode & S_IFMT); *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); -- cgit v1.2.3-70-g09d2 From 2d8498dbf8041c51ca49a0be6be594501638e591 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Nov 2013 00:24:11 -0800 Subject: nfsd: start documenting some XDR handling functions Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ee7237f99f5..79754139ccd 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -190,6 +190,15 @@ static int zero_clientid(clientid_t *clid) return (clid->cl_boot == 0) && (clid->cl_id == 0); } +/** + * defer_free - mark an allocation as deferred freed + * @argp: NFSv4 compound argument structure to be freed with + * @release: release callback to free @p, typically kfree() + * @p: pointer to be freed + * + * Marks @p to be freed when processing the compound operation + * described in @argp finishes. 
+ */ static int defer_free(struct nfsd4_compoundargs *argp, void (*release)(const void *), void *p) @@ -206,6 +215,16 @@ defer_free(struct nfsd4_compoundargs *argp, return 0; } +/** + * savemem - duplicate a chunk of memory for later processing + * @argp: NFSv4 compound argument structure to be freed with + * @p: pointer to be duplicated + * @nbytes: length to be duplicated + * + * Returns a pointer to a copy of @nbytes bytes of memory at @p + * that are preserved until processing of the NFSv4 compound + * operation described by @argp finishes. + */ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { if (p == argp->tmp) { -- cgit v1.2.3-70-g09d2 From 9b2db6e1894577d48f4e290381bac6e573593838 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Dec 2013 09:29:17 -0500 Subject: sysfs: bail early from kernfs_file_mmap() to avoid spurious lockdep warning This is v3.14 fix for the same issue that a8b14744429f ("sysfs: give different locking key to regular and bin files") addresses for v3.13. Due to the extensive kernfs reorganization in v3.14 branch, the same fix couldn't be ported as-is. The v3.13 fix was ignored while merging it into v3.14 branch. 027a485d12e0 ("sysfs: use a separate locking class for open files depending on mmap") assigned different lockdep key to sysfs_open_file->mutex depending on whether the file implements mmap or not in an attempt to avoid spurious lockdep warning caused by merging of regular and bin file paths. While this restored some of the original behavior of using different locks (at least lockdep is concerned) for the different clases of files. The restoration wasn't full because now the lockdep key assignment depends on whether the file has mmap or not instead of whether it's a regular file or not. This means that bin files which don't implement mmap will get assigned the same lockdep class as regular files. This is problematic because file_operations for bin files still implements the mmap file operation and checking whether the sysfs file actually implements mmap happens in the file operation after grabbing @sysfs_open_file->mutex. We still end up adding locking dependency from mmap locking to sysfs_open_file->mutex to the regular file mutex which triggers spurious circular locking warning. For v3.13, a8b14744429f ("sysfs: give different locking key to regular and bin files") fixed it by giving sysfs_open_file->mutex different lockdep keys depending on whether the file is regular or bin instead of whether mmap exists or not; however, due to the way sysfs is now layered behind kernfs, this approach is no longer viable. kernfs can tell whether a sysfs node has mmap implemented or not but can't tell whether a bin file from a regular one. This patch updates kernfs such that kernfs_file_mmap() checks SYSFS_FLAG_HAS_MMAP and bail before grabbing sysfs_open_file->mutex so that it doesn't add spurious locking dependency from mmap to sysfs_open_file->mutex and changes sysfs so that it specifies kernfs_ops->mmap iff the sysfs file implements mmap. Combined, this ensures that sysfs_open_file->mutex is grabbed under mmap path iff the sysfs file actually implements mmap. As sysfs_open_file->mutex is already given a different lockdep key if mmap is implemented, this removes the spurious locking dependency. 
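[Editorial note: for readers unfamiliar with how such a spurious lockdep report arises, here is a minimal sketch of the early-bail pattern the patch describes. This is a sketch only: the struct, flag, and function names below are invented for illustration and are not the exact kernfs symbols; the locking comments state the assumptions the sketch is built on.]

	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/mutex.h>

	/* Hypothetical per-open-file state, standing in for sysfs_open_file. */
	struct example_open_file {
		struct mutex	mutex;	/* serializes access to the backing ops */
		unsigned int	flags;	/* EXAMPLE_HAS_MMAP set iff mmap is implemented */
	};

	#define EXAMPLE_HAS_MMAP	0x0001

	static int example_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct example_open_file *of = file->private_data;

		/*
		 * ->mmap is entered with mmap_sem already held, so taking
		 * of->mutex below teaches lockdep the ordering
		 * "mmap_sem, then of->mutex".  Checking the flag first means
		 * that ordering is only ever recorded for files that really
		 * implement mmap (whose mutex carries a separate lockdep key),
		 * never for ordinary attributes whose read/write paths may
		 * fault during user copies and take mmap_sem under of->mutex.
		 */
		if (!(of->flags & EXAMPLE_HAS_MMAP))
			return -ENODEV;

		mutex_lock(&of->mutex);
		/* delegate to the node's real mmap implementation here */
		mutex_unlock(&of->mutex);
		return 0;
	}

The same reasoning is why the patch also moves the battr->mmap check out of the kernfs mmap path and into sysfs, so kernfs_ops->mmap is only installed when an mmap method actually exists.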
Signed-off-by: Tejun Heo Reported-by: Dave Jones Link: http://lkml.kernel.org/g/20131203184324.GA11320@redhat.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 18 ++++++++++++++---- fs/sysfs/file.c | 12 ++++++++---- 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 4a5863b79de..fa053151fa9 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -421,6 +421,16 @@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) const struct kernfs_ops *ops; int rc; + /* + * mmap path and of->mutex are prone to triggering spurious lockdep + * warnings and we don't want to add spurious locking dependency + * between the two. Check whether mmap is actually implemented + * without grabbing @of->mutex by testing HAS_MMAP flag. See the + * comment in kernfs_file_open() for more details. + */ + if (!(of->sd->s_flags & SYSFS_FLAG_HAS_MMAP)) + return -ENODEV; + mutex_lock(&of->mutex); rc = -ENODEV; @@ -428,10 +438,7 @@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) goto out_unlock; ops = kernfs_ops(of->sd); - if (ops->mmap) - rc = ops->mmap(of, vma); - if (rc) - goto out_put; + rc = ops->mmap(of, vma); /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() @@ -596,6 +603,9 @@ static int kernfs_file_open(struct inode *inode, struct file *file) * happen on the same file. At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. + * + * Both paths of the branch look the same. They're supposed to + * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index ac77d2be3c3..a67d1c682fe 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -142,9 +142,6 @@ static int sysfs_kf_bin_mmap(struct sysfs_open_file *of, struct bin_attribute *battr = of->sd->priv; struct kobject *kobj = of->sd->s_parent->priv; - if (!battr->mmap) - return -ENODEV; - return battr->mmap(of->file, kobj, battr, vma); } @@ -197,6 +194,11 @@ static const struct kernfs_ops sysfs_bin_kfops_wo = { static const struct kernfs_ops sysfs_bin_kfops_rw = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, +}; + +static const struct kernfs_ops sysfs_bin_kfops_mmap = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, .mmap = sysfs_kf_bin_mmap, }; @@ -232,7 +234,9 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, } else { struct bin_attribute *battr = (void *)attr; - if ((battr->read && battr->write) || battr->mmap) + if (battr->mmap) + ops = &sysfs_bin_kfops_mmap; + else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; else if (battr->read) ops = &sysfs_bin_kfops_ro; -- cgit v1.2.3-70-g09d2 From a7560a0132cfc93b25d2df1d277a078a05220cf4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Dec 2013 10:22:30 -0500 Subject: sysfs: fix use-after-free in sysfs_kill_sb() While restructuring the [u]mount path, 4b93dc9b1c68 ("sysfs, kernfs: prepare mount path for kernfs") incorrectly updated sysfs_kill_sb() so that it first kills super_block and then tries to dereference its namespace tag to drop it. Fix it by caching namespace tag before killing the superblock and then drop the cached namespace tag. 
Signed-off-by: Tejun Heo Reported-by: Yuanhan Liu Tested-by: Yuanhan Liu Tested-by: Vlastimil Babka Link: http://lkml.kernel.org/g/20131205031051.GC5135@yliu-dev.sh.intel.com Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e7e3aa8e7b7..8d075272cac 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -45,8 +45,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, static void sysfs_kill_sb(struct super_block *sb) { + void *ns = (void *)kernfs_super_ns(sb); + kernfs_kill_sb(sb); - kobj_ns_drop(KOBJ_NS_TYPE_NET, (void *)kernfs_super_ns(sb)); + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); } static struct file_system_type sysfs_fs_type = { -- cgit v1.2.3-70-g09d2 From a0ef5e19684f0447da9ff0654a12019c484f57ca Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Dec 2013 06:00:51 -0500 Subject: nfsd: don't try to reuse an expired DRC entry off the list Currently when we are processing a request, we try to scrape an expired or over-limit entry off the list in preference to allocating a new one from the slab. This is unnecessarily complicated. Just use the slab layer. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index b6af150c96b..f8f060ffbf4 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -131,13 +131,6 @@ nfsd_reply_cache_alloc(void) return rp; } -static void -nfsd_reply_cache_unhash(struct svc_cacherep *rp) -{ - hlist_del_init(&rp->c_hash); - list_del_init(&rp->c_lru); -} - static void nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { @@ -416,22 +409,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) /* * Since the common case is a cache miss followed by an insert, - * preallocate an entry. First, try to reuse the first entry on the LRU - * if it works, then go ahead and prune the LRU list. + * preallocate an entry. */ - spin_lock(&cache_lock); - if (!list_empty(&lru_head)) { - rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); - if (nfsd_cache_entry_expired(rp) || - num_drc_entries >= max_drc_entries) { - nfsd_reply_cache_unhash(rp); - prune_cache_entries(); - goto search_cache; - } - } - - /* No expired ones available, allocate a new one. */ - spin_unlock(&cache_lock); rp = nfsd_reply_cache_alloc(); spin_lock(&cache_lock); if (likely(rp)) { @@ -439,7 +418,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) drc_mem_usage += sizeof(*rp); } -search_cache: + /* go ahead and prune the cache */ + prune_cache_entries(); + found = nfsd_cache_search(rqstp, csum); if (found) { if (likely(rp)) @@ -453,15 +434,6 @@ search_cache: goto out; } - /* - * We're keeping the one we just allocated. Are we now over the - * limit? Prune one off the tip of the LRU in trade for the one we - * just allocated if so. 
- */ - if (num_drc_entries >= max_drc_entries) - nfsd_reply_cache_free_locked(list_first_entry(&lru_head, - struct svc_cacherep, c_lru)); - nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; -- cgit v1.2.3-70-g09d2 From b3f03bac8132207a20286d5602eda64500c19724 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 3 Dec 2013 23:50:57 +1100 Subject: xfs: xfs_dir2_block_to_sf temp buffer allocation fails If we are using a large directory block size, and memory becomes fragmented, we can get memory allocation failures trying to kmem_alloc(64k) for a temporary buffer. However, there is not need for a directory buffer sized allocation, as the end result ends up in the inode literal area. This is, at most, slightly less than 2k of space, and hence we don't need an allocation larger than that fora temporary buffer. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_sf.c | 58 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index aafc6e46cb5..3725fb1b902 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -170,6 +170,7 @@ xfs_dir2_block_to_sf( char *ptr; /* current data pointer */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ + xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ trace_xfs_dir2_block_to_sf(args); @@ -177,35 +178,20 @@ xfs_dir2_block_to_sf( mp = dp->i_mount; /* - * Make a copy of the block data, so we can shrink the inode - * and add local data. + * allocate a temporary destination buffer the size of the inode + * to format the data into. Once we have formatted the data, we + * can free the block and copy the formatted data into the inode literal + * area. */ - hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(hdr, bp->b_addr, mp->m_dirblksize); - logflags = XFS_ILOG_CORE; - if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { - ASSERT(error != ENOSPC); - goto out; - } + dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + hdr = bp->b_addr; - /* - * The buffer is now unconditionally gone, whether - * xfs_dir2_shrink_inode worked or not. - * - * Convert the inode to local format. - */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; - ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - logflags |= XFS_ILOG_DDATA; /* * Copy the header into the newly allocate local space. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dst; memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); - dp->i_d.di_size = size; + /* * Set up to loop over the block's entries. */ @@ -258,10 +244,34 @@ xfs_dir2_block_to_sf( ptr += dp->d_ops->data_entsize(dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); + + /* now we are done with the block, we can shrink the inode */ + logflags = XFS_ILOG_CORE; + error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp); + if (error) { + ASSERT(error != ENOSPC); + goto out; + } + + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format and copy the data in. 
+ */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + + logflags |= XFS_ILOG_DDATA; + memcpy(dp->i_df.if_u1.if_data, dst, size); + dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(hdr); + kmem_free(dst); return error; } -- cgit v1.2.3-70-g09d2 From 8e825e3a02ff20973154559c33e662cacedc4458 Mon Sep 17 00:00:00 2001 From: Ben Myers Date: Tue, 10 Dec 2013 14:59:31 -0600 Subject: xfs: fix calculation of freed inode cluster blocks rec.ir_startino is an agino rather than an ino. Use the correct macro when dealing with it in xfs_difree. Signed-off-by: Ben Myers Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index e87719c5beb..7a728f9fc0b 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1229,7 +1229,7 @@ xfs_difree( } xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, - agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), + agno, XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), XFS_IALLOC_BLOCKS(mp), flist, mp); } else { *delete = 0; -- cgit v1.2.3-70-g09d2 From f9b395a8ef8f34d19cae2cde361e19c96e097fad Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Nov 2013 10:41:16 +1100 Subject: xfs: align initial file allocations correctly The function xfs_bmap_isaeof() is used to indicate that an allocation is occurring at or past the end of file, and as such should be aligned to the underlying storage geometry if possible. Commit 27a3f8f ("xfs: introduce xfs_bmap_last_extent") changed the behaviour of this function for empty files - it turned off allocation alignment for this case accidentally. Hence large initial allocations from direct IO are not getting correctly aligned to the underlying geometry, and that is cause write performance to drop in alignment sensitive configurations. Fix it by considering allocation into empty files as requiring aligned allocation again. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3ef11b22e75..8401f11f378 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1635,7 +1635,7 @@ xfs_bmap_last_extent( * blocks at the end of the file which do not start at the previous data block, * we will try to align the new blocks at stripe unit boundaries. * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be * at, or past the EOF. */ STATIC int @@ -1650,9 +1650,14 @@ xfs_bmap_isaeof( bma->aeof = 0; error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, &is_empty); - if (error || is_empty) + if (error) return error; + if (is_empty) { + bma->aeof = 1; + return 0; + } + /* * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. -- cgit v1.2.3-70-g09d2 From 324a56e16e44baecac3ca799fd216154145c14bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 14:11:53 -0500 Subject: kernfs: s/sysfs_dirent/kernfs_node/ and rename its friends accordingly kernfs has just been separated out from sysfs and we're already in full conflict mode. Nothing can make the situation any worse. 
Let's take the chance to name things properly. This patch performs the following renames. * s/sysfs_elem_dir/kernfs_elem_dir/ * s/sysfs_elem_symlink/kernfs_elem_symlink/ * s/sysfs_elem_attr/kernfs_elem_file/ * s/sysfs_dirent/kernfs_node/ * s/sd/kn/ in kernfs proper * s/parent_sd/parent/ * s/target_sd/target/ * s/dir_sd/parent/ * s/to_sysfs_dirent()/rb_to_kn()/ * misc renames of local vars when they conflict with the above Because md, mic and gpio dig into sysfs details, this patch ends up modifying them. All are sysfs_dirent renames and trivial. While we can avoid these by introducing a dummy wrapping struct sysfs_dirent around kernfs_node, given the limited usage outside kernfs and sysfs proper, I don't think such workaround is called for. This patch is strictly rename only and doesn't introduce any functional difference. - mic / gpio renames were missing. Spotted by kbuild test robot. Signed-off-by: Tejun Heo Cc: Neil Brown Cc: Linus Walleij Cc: Ashutosh Dixit Cc: kbuild test robot Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpiolib.c | 4 +- drivers/md/bitmap.c | 2 +- drivers/md/bitmap.h | 2 +- drivers/md/md.h | 10 +- drivers/misc/mic/host/mic_device.h | 2 +- fs/kernfs/dir.c | 542 ++++++++++++++++++------------------- fs/kernfs/file.c | 186 ++++++------- fs/kernfs/inode.c | 123 ++++----- fs/kernfs/kernfs-internal.h | 40 +-- fs/kernfs/mount.c | 12 +- fs/kernfs/symlink.c | 64 ++--- fs/sysfs/dir.c | 66 ++--- fs/sysfs/file.c | 122 ++++----- fs/sysfs/group.c | 96 +++---- fs/sysfs/mount.c | 4 +- fs/sysfs/symlink.c | 72 ++--- fs/sysfs/sysfs.h | 10 +- include/linux/kernfs.h | 166 ++++++------ include/linux/kobject.h | 2 +- include/linux/sysfs.h | 20 +- lib/kobject.c | 2 +- 21 files changed, 774 insertions(+), 773 deletions(-) (limited to 'fs') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 85f772c0b26..c8a7c810bad 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -393,7 +393,7 @@ static const DEVICE_ATTR(value, 0644, static irqreturn_t gpio_sysfs_irq(int irq, void *priv) { - struct sysfs_dirent *value_sd = priv; + struct kernfs_node *value_sd = priv; sysfs_notify_dirent(value_sd); return IRQ_HANDLED; @@ -402,7 +402,7 @@ static irqreturn_t gpio_sysfs_irq(int irq, void *priv) static int gpio_setup_irq(struct gpio_desc *desc, struct device *dev, unsigned long gpio_flags) { - struct sysfs_dirent *value_sd; + struct kernfs_node *value_sd; unsigned long irq_flags; int ret, irq, id; diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 12dc29ba739..4195a01b153 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1635,7 +1635,7 @@ int bitmap_create(struct mddev *mddev) sector_t blocks = mddev->resync_max_sectors; struct file *file = mddev->bitmap_info.file; int err; - struct sysfs_dirent *bm = NULL; + struct kernfs_node *bm = NULL; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index df4aeb6ac6f..30210b9c4ef 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -225,7 +225,7 @@ struct bitmap { wait_queue_head_t overflow_wait; wait_queue_head_t behind_wait; - struct sysfs_dirent *sysfs_can_clear; + struct kernfs_node *sysfs_can_clear; }; /* the bitmap API */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 2f5cc8a7ef3..389a3c93cdb 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -106,7 +106,7 @@ struct md_rdev { */ struct work_struct del_work; /* used for delayed sysfs removal */ - struct sysfs_dirent *sysfs_state; /* handle for 'state' + struct kernfs_node *sysfs_state; /* 
handle for 'state' * sysfs entry */ struct badblocks { @@ -376,10 +376,10 @@ struct mddev { sector_t resync_max; /* resync should pause * when it gets here */ - struct sysfs_dirent *sysfs_state; /* handle for 'array_state' + struct kernfs_node *sysfs_state; /* handle for 'array_state' * file in sysfs. */ - struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ + struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ struct work_struct del_work; /* used for delayed sysfs removal */ @@ -498,13 +498,13 @@ struct md_sysfs_entry { }; extern struct attribute_group md_bitmap_group; -static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) +static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { if (sd) return sysfs_get_dirent(sd, name); return sd; } -static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) +static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd) { if (sd) sysfs_notify_dirent(sd); diff --git a/drivers/misc/mic/host/mic_device.h b/drivers/misc/mic/host/mic_device.h index 3574cc375bb..538e3d3d3c8 100644 --- a/drivers/misc/mic/host/mic_device.h +++ b/drivers/misc/mic/host/mic_device.h @@ -112,7 +112,7 @@ struct mic_device { struct work_struct shutdown_work; u8 state; u8 shutdown_status; - struct sysfs_dirent *state_sysfs; + struct kernfs_node *state_sysfs; struct completion reset_wait; void *log_buf_addr; int *log_buf_len; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a441e3be805..800ebf52147 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -19,7 +19,7 @@ DEFINE_MUTEX(sysfs_mutex); -#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb) +#define rb_to_kn(X) rb_entry((X), struct kernfs_node, s_rb) /** * sysfs_name_hash @@ -45,28 +45,28 @@ static unsigned int sysfs_name_hash(const char *name, const void *ns) } static int sysfs_name_compare(unsigned int hash, const char *name, - const void *ns, const struct sysfs_dirent *sd) + const void *ns, const struct kernfs_node *kn) { - if (hash != sd->s_hash) - return hash - sd->s_hash; - if (ns != sd->s_ns) - return ns - sd->s_ns; - return strcmp(name, sd->s_name); + if (hash != kn->s_hash) + return hash - kn->s_hash; + if (ns != kn->s_ns) + return ns - kn->s_ns; + return strcmp(name, kn->s_name); } -static int sysfs_sd_compare(const struct sysfs_dirent *left, - const struct sysfs_dirent *right) +static int sysfs_sd_compare(const struct kernfs_node *left, + const struct kernfs_node *right) { return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns, right); } /** - * sysfs_link_sibling - link sysfs_dirent into sibling rbtree - * @sd: sysfs_dirent of interest + * sysfs_link_sibling - link kernfs_node into sibling rbtree + * @kn: kernfs_node of interest * - * Link @sd into its sibling rbtree which starts from - * sd->s_parent->s_dir.children. + * Link @kn into its sibling rbtree which starts from + * @kn->s_parent->s_dir.children. * * Locking: * mutex_lock(sysfs_mutex) @@ -74,21 +74,21 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left, * RETURNS: * 0 on susccess -EEXIST on failure. 
*/ -static int sysfs_link_sibling(struct sysfs_dirent *sd) +static int sysfs_link_sibling(struct kernfs_node *kn) { - struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; + struct rb_node **node = &kn->s_parent->s_dir.children.rb_node; struct rb_node *parent = NULL; - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs++; + if (sysfs_type(kn) == SYSFS_DIR) + kn->s_parent->s_dir.subdirs++; while (*node) { - struct sysfs_dirent *pos; + struct kernfs_node *pos; int result; - pos = to_sysfs_dirent(*node); + pos = rb_to_kn(*node); parent = *node; - result = sysfs_sd_compare(sd, pos); + result = sysfs_sd_compare(kn, pos); if (result < 0) node = &pos->s_rb.rb_left; else if (result > 0) @@ -97,168 +97,169 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd) return -EEXIST; } /* add new node and rebalance the tree */ - rb_link_node(&sd->s_rb, parent, node); - rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); + rb_link_node(&kn->s_rb, parent, node); + rb_insert_color(&kn->s_rb, &kn->s_parent->s_dir.children); return 0; } /** - * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree - * @sd: sysfs_dirent of interest + * sysfs_unlink_sibling - unlink kernfs_node from sibling rbtree + * @kn: kernfs_node of interest * - * Unlink @sd from its sibling rbtree which starts from - * sd->s_parent->s_dir.children. + * Unlink @kn from its sibling rbtree which starts from + * kn->s_parent->s_dir.children. * * Locking: * mutex_lock(sysfs_mutex) */ -static void sysfs_unlink_sibling(struct sysfs_dirent *sd) +static void sysfs_unlink_sibling(struct kernfs_node *kn) { - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs--; + if (sysfs_type(kn) == SYSFS_DIR) + kn->s_parent->s_dir.subdirs--; - rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); + rb_erase(&kn->s_rb, &kn->s_parent->s_dir.children); } /** - * sysfs_get_active - get an active reference to sysfs_dirent - * @sd: sysfs_dirent to get an active reference to + * sysfs_get_active - get an active reference to kernfs_node + * @kn: kernfs_node to get an active reference to * - * Get an active reference of @sd. This function is noop if @sd + * Get an active reference of @kn. This function is noop if @kn * is NULL. * * RETURNS: - * Pointer to @sd on success, NULL on failure. + * Pointer to @kn on success, NULL on failure. */ -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) +struct kernfs_node *sysfs_get_active(struct kernfs_node *kn) { - if (unlikely(!sd)) + if (unlikely(!kn)) return NULL; - if (!atomic_inc_unless_negative(&sd->s_active)) + if (!atomic_inc_unless_negative(&kn->s_active)) return NULL; - if (sd->s_flags & SYSFS_FLAG_LOCKDEP) - rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); - return sd; + if (kn->s_flags & SYSFS_FLAG_LOCKDEP) + rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); + return kn; } /** - * sysfs_put_active - put an active reference to sysfs_dirent - * @sd: sysfs_dirent to put an active reference to + * sysfs_put_active - put an active reference to kernfs_node + * @kn: kernfs_node to put an active reference to * - * Put an active reference to @sd. This function is noop if @sd + * Put an active reference to @kn. This function is noop if @kn * is NULL. 
*/ -void sysfs_put_active(struct sysfs_dirent *sd) +void sysfs_put_active(struct kernfs_node *kn) { int v; - if (unlikely(!sd)) + if (unlikely(!kn)) return; - if (sd->s_flags & SYSFS_FLAG_LOCKDEP) - rwsem_release(&sd->dep_map, 1, _RET_IP_); - v = atomic_dec_return(&sd->s_active); + if (kn->s_flags & SYSFS_FLAG_LOCKDEP) + rwsem_release(&kn->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&kn->s_active); if (likely(v != SD_DEACTIVATED_BIAS)) return; - /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->u.completion. + /* + * atomic_dec_return() is a mb(), we'll always see the updated + * kn->u.completion. */ - complete(sd->u.completion); + complete(kn->u.completion); } /** - * sysfs_deactivate - deactivate sysfs_dirent - * @sd: sysfs_dirent to deactivate + * sysfs_deactivate - deactivate kernfs_node + * @kn: kernfs_node to deactivate * * Deny new active references and drain existing ones. */ -static void sysfs_deactivate(struct sysfs_dirent *sd) +static void sysfs_deactivate(struct kernfs_node *kn) { DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); + BUG_ON(!(kn->s_flags & SYSFS_FLAG_REMOVED)); - if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) + if (!(sysfs_type(kn) & SYSFS_ACTIVE_REF)) return; - sd->u.completion = (void *)&wait; + kn->u.completion = (void *)&wait; - rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->u.completion. + * the updated kn->u.completion. */ - v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); + v = atomic_add_return(SD_DEACTIVATED_BIAS, &kn->s_active); if (v != SD_DEACTIVATED_BIAS) { - lock_contended(&sd->dep_map, _RET_IP_); + lock_contended(&kn->dep_map, _RET_IP_); wait_for_completion(&wait); } - lock_acquired(&sd->dep_map, _RET_IP_); - rwsem_release(&sd->dep_map, 1, _RET_IP_); + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); } /** - * kernfs_get - get a reference count on a sysfs_dirent - * @sd: the target sysfs_dirent + * kernfs_get - get a reference count on a kernfs_node + * @kn: the target kernfs_node */ -void kernfs_get(struct sysfs_dirent *sd) +void kernfs_get(struct kernfs_node *kn) { - if (sd) { - WARN_ON(!atomic_read(&sd->s_count)); - atomic_inc(&sd->s_count); + if (kn) { + WARN_ON(!atomic_read(&kn->s_count)); + atomic_inc(&kn->s_count); } } EXPORT_SYMBOL_GPL(kernfs_get); /** - * kernfs_put - put a reference count on a sysfs_dirent - * @sd: the target sysfs_dirent + * kernfs_put - put a reference count on a kernfs_node + * @kn: the target kernfs_node * - * Put a reference count of @sd and destroy it if it reached zero. + * Put a reference count of @kn and destroy it if it reached zero. */ -void kernfs_put(struct sysfs_dirent *sd) +void kernfs_put(struct kernfs_node *kn) { - struct sysfs_dirent *parent_sd; + struct kernfs_node *parent; struct kernfs_root *root; - if (!sd || !atomic_dec_and_test(&sd->s_count)) + if (!kn || !atomic_dec_and_test(&kn->s_count)) return; - root = kernfs_root(sd); + root = kernfs_root(kn); repeat: /* Moving/renaming is always done while holding reference. - * sd->s_parent won't change beneath us. + * kn->s_parent won't change beneath us. */ - parent_sd = sd->s_parent; + parent = kn->s_parent; - WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), + WARN(!(kn->s_flags & SYSFS_FLAG_REMOVED), "sysfs: free using entry: %s/%s\n", - parent_sd ? 
parent_sd->s_name : "", sd->s_name); - - if (sysfs_type(sd) == SYSFS_KOBJ_LINK) - kernfs_put(sd->s_symlink.target_sd); - if (sysfs_type(sd) & SYSFS_COPY_NAME) - kfree(sd->s_name); - if (sd->s_iattr) { - if (sd->s_iattr->ia_secdata) - security_release_secctx(sd->s_iattr->ia_secdata, - sd->s_iattr->ia_secdata_len); - simple_xattrs_free(&sd->s_iattr->xattrs); + parent ? parent->s_name : "", kn->s_name); + + if (sysfs_type(kn) == SYSFS_KOBJ_LINK) + kernfs_put(kn->s_symlink.target_kn); + if (sysfs_type(kn) & SYSFS_COPY_NAME) + kfree(kn->s_name); + if (kn->s_iattr) { + if (kn->s_iattr->ia_secdata) + security_release_secctx(kn->s_iattr->ia_secdata, + kn->s_iattr->ia_secdata_len); + simple_xattrs_free(&kn->s_iattr->xattrs); } - kfree(sd->s_iattr); - ida_simple_remove(&root->ino_ida, sd->s_ino); - kmem_cache_free(sysfs_dir_cachep, sd); + kfree(kn->s_iattr); + ida_simple_remove(&root->ino_ida, kn->s_ino); + kmem_cache_free(sysfs_dir_cachep, kn); - sd = parent_sd; - if (sd) { - if (atomic_dec_and_test(&sd->s_count)) + kn = parent; + if (kn) { + if (atomic_dec_and_test(&kn->s_count)) goto repeat; } else { - /* just released the root sd, free @root too */ + /* just released the root kn, free @root too */ ida_destroy(&root->ino_ida); kfree(root); } @@ -267,35 +268,35 @@ EXPORT_SYMBOL_GPL(kernfs_put); static int sysfs_dentry_delete(const struct dentry *dentry) { - struct sysfs_dirent *sd = dentry->d_fsdata; - return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED)); + struct kernfs_node *kn = dentry->d_fsdata; + return !(kn && !(kn->s_flags & SYSFS_FLAG_REMOVED)); } static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; if (flags & LOOKUP_RCU) return -ECHILD; - sd = dentry->d_fsdata; + kn = dentry->d_fsdata; mutex_lock(&sysfs_mutex); /* The sysfs dirent has been deleted */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) + if (kn->s_flags & SYSFS_FLAG_REMOVED) goto out_bad; /* The sysfs dirent has been moved? 
*/ - if (dentry->d_parent->d_fsdata != sd->s_parent) + if (dentry->d_parent->d_fsdata != kn->s_parent) goto out_bad; /* The sysfs dirent has been renamed */ - if (strcmp(dentry->d_name.name, sd->s_name) != 0) + if (strcmp(dentry->d_name.name, kn->s_name) != 0) goto out_bad; /* The sysfs dirent has been moved to a different namespace */ - if (sd->s_parent && kernfs_ns_enabled(sd->s_parent) && - sysfs_info(dentry->d_sb)->ns != sd->s_ns) + if (kn->s_parent && kernfs_ns_enabled(kn->s_parent) && + sysfs_info(dentry->d_sb)->ns != kn->s_ns) goto out_bad; mutex_unlock(&sysfs_mutex); @@ -335,11 +336,11 @@ const struct dentry_operations sysfs_dentry_ops = { .d_release = sysfs_dentry_release, }; -struct sysfs_dirent *sysfs_new_dirent(struct kernfs_root *root, - const char *name, umode_t mode, int type) +struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, + const char *name, umode_t mode, int type) { char *dup_name = NULL; - struct sysfs_dirent *sd; + struct kernfs_node *kn; int ret; if (type & SYSFS_COPY_NAME) { @@ -348,38 +349,38 @@ struct sysfs_dirent *sysfs_new_dirent(struct kernfs_root *root, return NULL; } - sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); - if (!sd) + kn = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); + if (!kn) goto err_out1; ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); if (ret < 0) goto err_out2; - sd->s_ino = ret; + kn->s_ino = ret; - atomic_set(&sd->s_count, 1); - atomic_set(&sd->s_active, 0); + atomic_set(&kn->s_count, 1); + atomic_set(&kn->s_active, 0); - sd->s_name = name; - sd->s_mode = mode; - sd->s_flags = type | SYSFS_FLAG_REMOVED; + kn->s_name = name; + kn->s_mode = mode; + kn->s_flags = type | SYSFS_FLAG_REMOVED; - return sd; + return kn; err_out2: - kmem_cache_free(sysfs_dir_cachep, sd); + kmem_cache_free(sysfs_dir_cachep, kn); err_out1: kfree(dup_name); return NULL; } /** - * sysfs_addrm_start - prepare for sysfs_dirent add/remove + * sysfs_addrm_start - prepare for kernfs_node add/remove * @acxt: pointer to sysfs_addrm_cxt to be used * * This function is called when the caller is about to add or remove - * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used - * to keep and pass context to other addrm functions. + * kernfs_node. This function acquires sysfs_mutex. @acxt is used to + * keep and pass context to other addrm functions. * * LOCKING: * Kernel thread context (may sleep). sysfs_mutex is locked on @@ -394,13 +395,13 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) } /** - * sysfs_add_one - add sysfs_dirent to parent without warning + * sysfs_add_one - add kernfs_node to parent without warning * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to + * @kn: kernfs_node to be added + * @parent: the parent kernfs_node to add @kn to * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children + * Get @parent and set @kn->s_parent to it and increment nlink of + * the parent inode if @kn is a directory and link into the children * list of the parent. * * This function should be called between calls to @@ -414,51 +415,51 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) * 0 on success, -EEXIST if entry with the given name already * exists. 
*/ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct kernfs_node *kn, + struct kernfs_node *parent) { - bool has_ns = kernfs_ns_enabled(parent_sd); + bool has_ns = kernfs_ns_enabled(parent); struct sysfs_inode_attrs *ps_iattr; int ret; - if (has_ns != (bool)sd->s_ns) { + if (has_ns != (bool)kn->s_ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", - parent_sd->s_name, sd->s_name); + parent->s_name, kn->s_name); return -EINVAL; } - if (sysfs_type(parent_sd) != SYSFS_DIR) + if (sysfs_type(parent) != SYSFS_DIR) return -EINVAL; - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = parent_sd; - kernfs_get(parent_sd); + kn->s_hash = sysfs_name_hash(kn->s_name, kn->s_ns); + kn->s_parent = parent; + kernfs_get(parent); - ret = sysfs_link_sibling(sd); + ret = sysfs_link_sibling(kn); if (ret) return ret; /* Update timestamps on the parent */ - ps_iattr = parent_sd->s_iattr; + ps_iattr = parent->s_iattr; if (ps_iattr) { struct iattr *ps_iattrs = &ps_iattr->ia_iattr; ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; } /* Mark the entry added into directory tree */ - sd->s_flags &= ~SYSFS_FLAG_REMOVED; + kn->s_flags &= ~SYSFS_FLAG_REMOVED; return 0; } /** - * sysfs_remove_one - remove sysfs_dirent from parent + * sysfs_remove_one - remove kernfs_node from parent * @acxt: addrm context to use - * @sd: sysfs_dirent to be removed + * @kn: kernfs_node to be removed * - * Mark @sd removed and drop nlink of parent inode if @sd is a - * directory. @sd is unlinked from the children list. + * Mark @kn removed and drop nlink of parent inode if @kn is a + * directory. @kn is unlinked from the children list. * * This function should be called between calls to * sysfs_addrm_start() and sysfs_addrm_finish() and should be @@ -468,7 +469,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, * Determined by sysfs_addrm_start(). */ static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) + struct kernfs_node *kn) { struct sysfs_inode_attrs *ps_iattr; @@ -476,31 +477,31 @@ static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, * Removal can be called multiple times on the same node. Only the * first invocation is effective and puts the base ref. */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) + if (kn->s_flags & SYSFS_FLAG_REMOVED) return; - if (sd->s_parent) { - sysfs_unlink_sibling(sd); + if (kn->s_parent) { + sysfs_unlink_sibling(kn); /* Update timestamps on the parent */ - ps_iattr = sd->s_parent->s_iattr; + ps_iattr = kn->s_parent->s_iattr; if (ps_iattr) { ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; } } - sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->u.removed_list = acxt->removed; - acxt->removed = sd; + kn->s_flags |= SYSFS_FLAG_REMOVED; + kn->u.removed_list = acxt->removed; + acxt->removed = kn; } /** - * sysfs_addrm_finish - finish up sysfs_dirent add/remove + * sysfs_addrm_finish - finish up kernfs_node add/remove * @acxt: addrm context to finish up * - * Finish up sysfs_dirent add/remove. Resources acquired by - * sysfs_addrm_start() are released and removed sysfs_dirents are + * Finish up kernfs_node add/remove. Resources acquired by + * sysfs_addrm_start() are released and removed kernfs_nodes are * cleaned up. 
* * LOCKING: @@ -512,30 +513,30 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) /* release resources acquired by sysfs_addrm_start() */ mutex_unlock(&sysfs_mutex); - /* kill removed sysfs_dirents */ + /* kill removed kernfs_nodes */ while (acxt->removed) { - struct sysfs_dirent *sd = acxt->removed; + struct kernfs_node *kn = acxt->removed; - acxt->removed = sd->u.removed_list; + acxt->removed = kn->u.removed_list; - sysfs_deactivate(sd); - sysfs_unmap_bin_file(sd); - kernfs_put(sd); + sysfs_deactivate(kn); + sysfs_unmap_bin_file(kn); + kernfs_put(kn); } } /** - * kernfs_find_ns - find sysfs_dirent with the given name - * @parent: sysfs_dirent to search under + * kernfs_find_ns - find kernfs_node with the given name + * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * - * Look for sysfs_dirent with name @name under @parent. Returns pointer to - * the found sysfs_dirent on success, %NULL on failure. + * Look for kernfs_node with name @name under @parent. Returns pointer to + * the found kernfs_node on success, %NULL on failure. */ -static struct sysfs_dirent *kernfs_find_ns(struct sysfs_dirent *parent, - const unsigned char *name, - const void *ns) +static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, + const unsigned char *name, + const void *ns) { struct rb_node *node = parent->s_dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); @@ -552,42 +553,42 @@ static struct sysfs_dirent *kernfs_find_ns(struct sysfs_dirent *parent, hash = sysfs_name_hash(name, ns); while (node) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; int result; - sd = to_sysfs_dirent(node); - result = sysfs_name_compare(hash, name, ns, sd); + kn = rb_to_kn(node); + result = sysfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else - return sd; + return kn; } return NULL; } /** - * kernfs_find_and_get_ns - find and get sysfs_dirent with the given name - * @parent: sysfs_dirent to search under + * kernfs_find_and_get_ns - find and get kernfs_node with the given name + * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * - * Look for sysfs_dirent with name @name under @parent and get a reference + * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep and returns pointer to the found - * sysfs_dirent on success, %NULL on failure. + * kernfs_node on success, %NULL on failure. 
*/ -struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, - const char *name, const void *ns) +struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, + const char *name, const void *ns) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; mutex_lock(&sysfs_mutex); - sd = kernfs_find_ns(parent, name, ns); - kernfs_get(sd); + kn = kernfs_find_ns(parent, name, ns); + kernfs_get(kn); mutex_unlock(&sysfs_mutex); - return sd; + return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); @@ -601,7 +602,7 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); struct kernfs_root *kernfs_create_root(void *priv) { struct kernfs_root *root; - struct sysfs_dirent *sd; + struct kernfs_node *kn; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) @@ -609,18 +610,18 @@ struct kernfs_root *kernfs_create_root(void *priv) ida_init(&root->ino_ida); - sd = sysfs_new_dirent(root, "", S_IFDIR | S_IRUGO | S_IXUGO, SYSFS_DIR); - if (!sd) { + kn = sysfs_new_dirent(root, "", S_IFDIR | S_IRUGO | S_IXUGO, SYSFS_DIR); + if (!kn) { ida_destroy(&root->ino_ida); kfree(root); return ERR_PTR(-ENOMEM); } - sd->s_flags &= ~SYSFS_FLAG_REMOVED; - sd->priv = priv; - sd->s_dir.root = root; + kn->s_flags &= ~SYSFS_FLAG_REMOVED; + kn->priv = priv; + kn->s_dir.root = root; - root->sd = sd; + root->kn = kn; return root; } @@ -634,7 +635,7 @@ struct kernfs_root *kernfs_create_root(void *priv) */ void kernfs_destroy_root(struct kernfs_root *root) { - kernfs_remove(root->sd); /* will also free @root */ + kernfs_remove(root->kn); /* will also free @root */ } /** @@ -646,33 +647,33 @@ void kernfs_destroy_root(struct kernfs_root *root) * * Returns the created node on success, ERR_PTR() value on failure. */ -struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, - const char *name, void *priv, - const void *ns) +struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, + const char *name, void *priv, + const void *ns) { umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; + struct kernfs_node *kn; int rc; /* allocate */ - sd = sysfs_new_dirent(kernfs_root(parent), name, mode, SYSFS_DIR); - if (!sd) + kn = sysfs_new_dirent(kernfs_root(parent), name, mode, SYSFS_DIR); + if (!kn) return ERR_PTR(-ENOMEM); - sd->s_dir.root = parent->s_dir.root; - sd->s_ns = ns; - sd->priv = priv; + kn->s_dir.root = parent->s_dir.root; + kn->s_ns = ns; + kn->priv = priv; /* link in */ sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, parent); + rc = sysfs_add_one(&acxt, kn, parent); sysfs_addrm_finish(&acxt); if (!rc) - return sd; + return kn; - kernfs_put(sd); + kernfs_put(kn); return ERR_PTR(rc); } @@ -680,29 +681,28 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *ret = NULL; - struct dentry *parent = dentry->d_parent; - struct sysfs_dirent *parent_sd = parent->d_fsdata; - struct sysfs_dirent *sd; + struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *kn; struct inode *inode; const void *ns = NULL; mutex_lock(&sysfs_mutex); - if (kernfs_ns_enabled(parent_sd)) + if (kernfs_ns_enabled(parent)) ns = sysfs_info(dir->i_sb)->ns; - sd = kernfs_find_ns(parent_sd, dentry->d_name.name, ns); + kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* no such entry */ - if (!sd) { + if (!kn) { ret = ERR_PTR(-ENOENT); goto out_unlock; } - kernfs_get(sd); - dentry->d_fsdata = sd; + kernfs_get(kn); + dentry->d_fsdata = kn; /* attach dentry and inode */ - inode = 
sysfs_get_inode(dir->i_sb, sd); + inode = sysfs_get_inode(dir->i_sb, kn); if (!inode) { ret = ERR_PTR(-ENOMEM); goto out_unlock; @@ -726,9 +726,9 @@ const struct inode_operations sysfs_dir_inode_operations = { .listxattr = sysfs_listxattr, }; -static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) +static struct kernfs_node *sysfs_leftmost_descendant(struct kernfs_node *pos) { - struct sysfs_dirent *last; + struct kernfs_node *last; while (true) { struct rb_node *rbn; @@ -742,7 +742,7 @@ static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) if (!rbn) break; - pos = to_sysfs_dirent(rbn); + pos = rb_to_kn(rbn); } return last; @@ -751,14 +751,14 @@ static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) /** * sysfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) - * @root: sysfs_dirent whose descendants to walk + * @root: kernfs_node whose descendants to walk * * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. */ -static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, - struct sysfs_dirent *root) +static struct kernfs_node *sysfs_next_descendant_post(struct kernfs_node *pos, + struct kernfs_node *root) { struct rb_node *rbn; @@ -775,62 +775,62 @@ static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->s_rb); if (rbn) - return sysfs_leftmost_descendant(to_sysfs_dirent(rbn)); + return sysfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return pos->s_parent; } static void __kernfs_remove(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) + struct kernfs_node *kn) { - struct sysfs_dirent *pos, *next; + struct kernfs_node *pos, *next; - if (!sd) + if (!kn) return; - pr_debug("sysfs %s: removing\n", sd->s_name); + pr_debug("sysfs %s: removing\n", kn->s_name); next = NULL; do { pos = next; - next = sysfs_next_descendant_post(pos, sd); + next = sysfs_next_descendant_post(pos, kn); if (pos) sysfs_remove_one(acxt, pos); } while (next); } /** - * kernfs_remove - remove a sysfs_dirent recursively - * @sd: the sysfs_dirent to remove + * kernfs_remove - remove a kernfs_node recursively + * @kn: the kernfs_node to remove * - * Remove @sd along with all its subdirectories and files. + * Remove @kn along with all its subdirectories and files. */ -void kernfs_remove(struct sysfs_dirent *sd) +void kernfs_remove(struct kernfs_node *kn) { struct sysfs_addrm_cxt acxt; sysfs_addrm_start(&acxt); - __kernfs_remove(&acxt, sd); + __kernfs_remove(&acxt, kn); sysfs_addrm_finish(&acxt); } /** - * kernfs_remove_by_name_ns - find a sysfs_dirent by name and remove it - * @dir_sd: parent of the target - * @name: name of the sysfs_dirent to remove - * @ns: namespace tag of the sysfs_dirent to remove + * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it + * @parent: parent of the target + * @name: name of the kernfs_node to remove + * @ns: namespace tag of the kernfs_node to remove * - * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove - * it. Returns 0 on success, -ENOENT if such entry doesn't exist. + * Look for the kernfs_node with @name and @ns under @parent and remove it. + * Returns 0 on success, -ENOENT if such entry doesn't exist. 
*/ -int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name, +int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; + struct kernfs_node *kn; - if (!dir_sd) { + if (!parent) { WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", name); return -ENOENT; @@ -838,13 +838,13 @@ int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name, sysfs_addrm_start(&acxt); - sd = kernfs_find_ns(dir_sd, name, ns); - if (sd) - __kernfs_remove(&acxt, sd); + kn = kernfs_find_ns(parent, name, ns); + if (kn) + __kernfs_remove(&acxt, kn); sysfs_addrm_finish(&acxt); - if (sd) + if (kn) return 0; else return -ENOENT; @@ -852,12 +852,12 @@ int kernfs_remove_by_name_ns(struct sysfs_dirent *dir_sd, const char *name, /** * kernfs_rename_ns - move and rename a kernfs_node - * @sd: target node + * @kn: target node * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag */ -int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, +int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { int error; @@ -865,35 +865,35 @@ int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, mutex_lock(&sysfs_mutex); error = 0; - if ((sd->s_parent == new_parent) && (sd->s_ns == new_ns) && - (strcmp(sd->s_name, new_name) == 0)) + if ((kn->s_parent == new_parent) && (kn->s_ns == new_ns) && + (strcmp(kn->s_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; - /* rename sysfs_dirent */ - if (strcmp(sd->s_name, new_name) != 0) { + /* rename kernfs_node */ + if (strcmp(kn->s_name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) goto out; - kfree(sd->s_name); - sd->s_name = new_name; + kfree(kn->s_name); + kn->s_name = new_name; } /* * Move to the appropriate place in the appropriate directories rbtree. 
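Taken together, the directory half of the renamed API reads roughly as follows for a caller. This is only a sketch: the function and node names are invented, kernfs still has sysfs as its only in-tree user at this point, and error handling is trimmed to what the signatures above require.

#include <linux/kernfs.h>
#include <linux/err.h>

/* Hypothetical caller: build a tiny hierarchy, rename one node, tear it down. */
static int example_build_tree(void)
{
        struct kernfs_root *root;
        struct kernfs_node *dir, *sub;
        int ret;

        root = kernfs_create_root(NULL);                /* no priv needed here */
        if (IS_ERR(root))
                return PTR_ERR(root);

        dir = kernfs_create_dir_ns(root->kn, "devices", NULL, NULL);
        if (IS_ERR(dir)) {
                ret = PTR_ERR(dir);
                goto out_destroy;
        }

        sub = kernfs_create_dir_ns(dir, "old-name", NULL, NULL);
        if (IS_ERR(sub)) {
                ret = PTR_ERR(sub);
                goto out_destroy;
        }

        /* re-links the node under @dir with a new name and hash */
        ret = kernfs_rename_ns(sub, dir, "new-name", NULL);
        if (ret)
                goto out_destroy;

        /* looks the child up by name and removes it recursively */
        ret = kernfs_remove_by_name_ns(dir, "new-name", NULL);

out_destroy:
        kernfs_destroy_root(root);      /* removes root->kn recursively, frees @root */
        return ret;
}

Every creation call hands back the new node with ERR_PTR() encoding on failure, which is why the sketch never checks for NULL.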
*/ - sysfs_unlink_sibling(sd); + sysfs_unlink_sibling(kn); kernfs_get(new_parent); - kernfs_put(sd->s_parent); - sd->s_ns = new_ns; - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = new_parent; - sysfs_link_sibling(sd); + kernfs_put(kn->s_parent); + kn->s_ns = new_ns; + kn->s_hash = sysfs_name_hash(kn->s_name, kn->s_ns); + kn->s_parent = new_parent; + sysfs_link_sibling(kn); error = 0; out: @@ -902,9 +902,9 @@ int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, } /* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct sysfs_dirent *sd) +static inline unsigned char dt_type(struct kernfs_node *kn) { - return (sd->s_mode >> 12) & 15; + return (kn->s_mode >> 12) & 15; } static int sysfs_dir_release(struct inode *inode, struct file *filp) @@ -913,21 +913,21 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp) return 0; } -static struct sysfs_dirent *sysfs_dir_pos(const void *ns, - struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos) +static struct kernfs_node *sysfs_dir_pos(const void *ns, + struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && - pos->s_parent == parent_sd && + pos->s_parent == parent && hash == pos->s_hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { - struct rb_node *node = parent_sd->s_dir.children.rb_node; + struct rb_node *node = parent->s_dir.children.rb_node; while (node) { - pos = to_sysfs_dirent(node); + pos = rb_to_kn(node); if (hash < pos->s_hash) node = node->rb_left; @@ -943,22 +943,22 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, if (!node) pos = NULL; else - pos = to_sysfs_dirent(node); + pos = rb_to_kn(node); } return pos; } -static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, - struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) +static struct kernfs_node *sysfs_dir_next_pos(const void *ns, + struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { - pos = sysfs_dir_pos(ns, parent_sd, ino, pos); + pos = sysfs_dir_pos(ns, parent, ino, pos); if (pos) do { struct rb_node *node = rb_next(&pos->s_rb); if (!node) pos = NULL; else - pos = to_sysfs_dirent(node); + pos = rb_to_kn(node); } while (pos && pos->s_ns != ns); return pos; } @@ -966,20 +966,20 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, static int sysfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct sysfs_dirent *parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos = file->private_data; + struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *pos = file->private_data; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; mutex_lock(&sysfs_mutex); - if (kernfs_ns_enabled(parent_sd)) + if (kernfs_ns_enabled(parent)) ns = sysfs_info(dentry->d_sb)->ns; - for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); + for (pos = sysfs_dir_pos(ns, parent, ctx->pos, pos); pos; - pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { + pos = sysfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = pos->s_name; unsigned int type = dt_type(pos); int len = strlen(name); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index fa053151fa9..1bf07ded826 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -19,9 +19,9 @@ /* * There's one sysfs_open_file for each open file and one sysfs_open_dirent - * for each 
sysfs_dirent with one or more open files. + * for each kernfs_node with one or more open files. * - * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is + * kernfs_node->s_attr.open points to sysfs_open_dirent. s_attr.open is * protected by sysfs_open_dirent_lock. * * filp->private_data points to seq_file whose ->private points to @@ -44,14 +44,14 @@ static struct sysfs_open_file *sysfs_of(struct file *file) } /* - * Determine the kernfs_ops for the given sysfs_dirent. This function must + * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ -static const struct kernfs_ops *kernfs_ops(struct sysfs_dirent *sd) +static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { - if (sd->s_flags & SYSFS_FLAG_LOCKDEP) - lockdep_assert_held(sd); - return sd->s_attr.ops; + if (kn->s_flags & SYSFS_FLAG_LOCKDEP) + lockdep_assert_held(kn); + return kn->s_attr.ops; } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) @@ -64,10 +64,10 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) return ERR_PTR(-ENODEV); - ops = kernfs_ops(of->sd); + ops = kernfs_ops(of->kn); if (ops->seq_start) { return ops->seq_start(sf, ppos); } else { @@ -82,7 +82,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct sysfs_open_file *of = sf->private; - const struct kernfs_ops *ops = kernfs_ops(of->sd); + const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { return ops->seq_next(sf, v, ppos); @@ -99,12 +99,12 @@ static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct sysfs_open_file *of = sf->private; - const struct kernfs_ops *ops = kernfs_ops(of->sd); + const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); mutex_unlock(&of->mutex); } @@ -112,9 +112,9 @@ static int kernfs_seq_show(struct seq_file *sf, void *v) { struct sysfs_open_file *of = sf->private; - of->event = atomic_read(&of->sd->s_attr.open->event); + of->event = atomic_read(&of->kn->s_attr.open->event); - return of->sd->s_attr.ops->seq_show(sf, v); + return of->kn->s_attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { @@ -147,19 +147,19 @@ static ssize_t kernfs_file_direct_read(struct sysfs_open_file *of, * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { + if (!sysfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } - ops = kernfs_ops(of->sd); + ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, *ppos); else len = -EINVAL; - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) @@ -189,7 +189,7 @@ static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, { struct sysfs_open_file *of = sysfs_of(file); - if (of->sd->s_flags & SYSFS_FLAG_HAS_SEQ_SHOW) + if (of->kn->s_flags & SYSFS_FLAG_HAS_SEQ_SHOW) return seq_read(file, user_buf, count, ppos); else return kernfs_file_direct_read(of, user_buf, count, ppos); @@ -234,19 +234,19 @@ static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { + if (!sysfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } - ops = kernfs_ops(of->sd); + ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, *ppos); else len = -EINVAL; - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) @@ -264,13 +264,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma) if (!of->vm_ops) return; - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); } static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -282,14 +282,14 @@ static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vma, vmf); - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); return ret; } @@ -303,7 +303,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; @@ -312,7 +312,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, else file_update_time(file); - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); return ret; } @@ -326,14 +326,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, if (!of->vm_ops) return -EINVAL; - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); return ret; } @@ -348,14 +348,14 @@ static int kernfs_vma_set_policy(struct vm_area_struct *vma, if (!of->vm_ops) return 0; - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) return -EINVAL; ret = 0; if (of->vm_ops->set_policy) ret = of->vm_ops->set_policy(vma, new); - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); return ret; } @@ -369,14 +369,14 @@ static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, if (!of->vm_ops) return vma->vm_policy; - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) return vma->vm_policy; pol = vma->vm_policy; if (of->vm_ops->get_policy) pol = of->vm_ops->get_policy(vma, addr); - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); return pol; } @@ -391,14 +391,14 @@ static int 
kernfs_vma_migrate(struct vm_area_struct *vma, if (!of->vm_ops) return 0; - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) return 0; ret = 0; if (of->vm_ops->migrate) ret = of->vm_ops->migrate(vma, from, to, flags); - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); return ret; } #endif @@ -428,16 +428,16 @@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_file_open() for more details. */ - if (!(of->sd->s_flags & SYSFS_FLAG_HAS_MMAP)) + if (!(of->kn->s_flags & SYSFS_FLAG_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; - if (!sysfs_get_active(of->sd)) + if (!sysfs_get_active(of->kn)) goto out_unlock; - ops = kernfs_ops(of->sd); + ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); /* @@ -465,7 +465,7 @@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: - sysfs_put_active(of->sd); + sysfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); @@ -474,10 +474,10 @@ out_unlock: /** * sysfs_get_open_dirent - get or create sysfs_open_dirent - * @sd: target sysfs_dirent + * @kn: target kernfs_node * @of: sysfs_open_file for this instance of open * - * If @sd->s_attr.open exists, increment its reference count; + * If @kn->s_attr.open exists, increment its reference count; * otherwise, create one. @of is chained to the files list. * * LOCKING: @@ -486,7 +486,7 @@ out_unlock: * RETURNS: * 0 on success, -errno on failure. */ -static int sysfs_get_open_dirent(struct sysfs_dirent *sd, +static int sysfs_get_open_dirent(struct kernfs_node *kn, struct sysfs_open_file *of) { struct sysfs_open_dirent *od, *new_od = NULL; @@ -495,12 +495,12 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, mutex_lock(&sysfs_open_file_mutex); spin_lock_irq(&sysfs_open_dirent_lock); - if (!sd->s_attr.open && new_od) { - sd->s_attr.open = new_od; + if (!kn->s_attr.open && new_od) { + kn->s_attr.open = new_od; new_od = NULL; } - od = sd->s_attr.open; + od = kn->s_attr.open; if (od) { atomic_inc(&od->refcnt); list_add_tail(&of->list, &od->files); @@ -528,19 +528,19 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, /** * sysfs_put_open_dirent - put sysfs_open_dirent - * @sd: target sysfs_dirent + * @kn: target kernfs_nodet * @of: associated sysfs_open_file * - * Put @sd->s_attr.open and unlink @of from the files list. If + * Put @kn->s_attr.open and unlink @of from the files list. If * reference count reaches zero, disassociate and free it. * * LOCKING: * None. 
*/ -static void sysfs_put_open_dirent(struct sysfs_dirent *sd, +static void sysfs_put_open_dirent(struct kernfs_node *kn, struct sysfs_open_file *of) { - struct sysfs_open_dirent *od = sd->s_attr.open; + struct sysfs_open_dirent *od = kn->s_attr.open; unsigned long flags; mutex_lock(&sysfs_open_file_mutex); @@ -550,7 +550,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, list_del(&of->list); if (atomic_dec_and_test(&od->refcnt)) - sd->s_attr.open = NULL; + kn->s_attr.open = NULL; else od = NULL; @@ -562,16 +562,16 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, static int kernfs_file_open(struct inode *inode, struct file *file) { - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct kernfs_node *kn = file->f_path.dentry->d_fsdata; const struct kernfs_ops *ops; struct sysfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; - if (!sysfs_get_active(attr_sd)) + if (!sysfs_get_active(kn)) return -ENODEV; - ops = kernfs_ops(attr_sd); + ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; @@ -612,7 +612,7 @@ static int kernfs_file_open(struct inode *inode, struct file *file) else mutex_init(&of->mutex); - of->sd = attr_sd; + of->kn = kn; of->file = file; /* @@ -634,12 +634,12 @@ static int kernfs_file_open(struct inode *inode, struct file *file) file->f_mode |= FMODE_PWRITE; /* make sure we have open dirent struct */ - error = sysfs_get_open_dirent(attr_sd, of); + error = sysfs_get_open_dirent(kn, of); if (error) goto err_close; /* open succeeded, put active references */ - sysfs_put_active(attr_sd); + sysfs_put_active(kn); return 0; err_close: @@ -647,32 +647,32 @@ err_close: err_free: kfree(of); err_out: - sysfs_put_active(attr_sd); + sysfs_put_active(kn); return error; } static int kernfs_file_release(struct inode *inode, struct file *filp) { - struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; struct sysfs_open_file *of = sysfs_of(filp); - sysfs_put_open_dirent(sd, of); + sysfs_put_open_dirent(kn, of); seq_release(inode, filp); kfree(of); return 0; } -void sysfs_unmap_bin_file(struct sysfs_dirent *sd) +void sysfs_unmap_bin_file(struct kernfs_node *kn) { struct sysfs_open_dirent *od; struct sysfs_open_file *of; - if (!(sd->s_flags & SYSFS_FLAG_HAS_MMAP)) + if (!(kn->s_flags & SYSFS_FLAG_HAS_MMAP)) return; spin_lock_irq(&sysfs_open_dirent_lock); - od = sd->s_attr.open; + od = kn->s_attr.open; if (od) atomic_inc(&od->refcnt); spin_unlock_irq(&sysfs_open_dirent_lock); @@ -686,7 +686,7 @@ void sysfs_unmap_bin_file(struct sysfs_dirent *sd) } mutex_unlock(&sysfs_open_file_mutex); - sysfs_put_open_dirent(sd, NULL); + sysfs_put_open_dirent(kn, NULL); } /* Sysfs attribute files are pollable. 
The idea is that you read @@ -705,16 +705,16 @@ void sysfs_unmap_bin_file(struct sysfs_dirent *sd) static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) { struct sysfs_open_file *of = sysfs_of(filp); - struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_dirent *od = attr_sd->s_attr.open; + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct sysfs_open_dirent *od = kn->s_attr.open; /* need parent for the kobj, grab both */ - if (!sysfs_get_active(attr_sd)) + if (!sysfs_get_active(kn)) goto trigger; poll_wait(filp, &od->poll, wait); - sysfs_put_active(attr_sd); + sysfs_put_active(kn); if (of->event != atomic_read(&od->event)) goto trigger; @@ -727,19 +727,19 @@ static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) /** * kernfs_notify - notify a kernfs file - * @sd: file to notify + * @kn: file to notify * - * Notify @sd such that poll(2) on @sd wakes up. + * Notify @kn such that poll(2) on @kn wakes up. */ -void kernfs_notify(struct sysfs_dirent *sd) +void kernfs_notify(struct kernfs_node *kn) { struct sysfs_open_dirent *od; unsigned long flags; spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { - od = sd->s_attr.open; + if (!WARN_ON(sysfs_type(kn) != SYSFS_KOBJ_ATTR)) { + od = kn->s_attr.open; if (od) { atomic_inc(&od->event); wake_up_interruptible(&od->poll); @@ -773,51 +773,51 @@ const struct file_operations kernfs_file_operations = { * * Returns the created node on success, ERR_PTR() value on error. */ -struct sysfs_dirent *kernfs_create_file_ns_key(struct sysfs_dirent *parent, - const char *name, - umode_t mode, loff_t size, - const struct kernfs_ops *ops, - void *priv, const void *ns, - struct lock_class_key *key) +struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + struct lock_class_key *key) { struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; + struct kernfs_node *kn; int rc; - sd = sysfs_new_dirent(kernfs_root(parent), name, + kn = sysfs_new_dirent(kernfs_root(parent), name, (mode & S_IALLUGO) | S_IFREG, SYSFS_KOBJ_ATTR); - if (!sd) + if (!kn) return ERR_PTR(-ENOMEM); - sd->s_attr.ops = ops; - sd->s_attr.size = size; - sd->s_ns = ns; - sd->priv = priv; + kn->s_attr.ops = ops; + kn->s_attr.size = size; + kn->s_ns = ns; + kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { - lockdep_init_map(&sd->dep_map, "s_active", key, 0); - sd->s_flags |= SYSFS_FLAG_LOCKDEP; + lockdep_init_map(&kn->dep_map, "s_active", key, 0); + kn->s_flags |= SYSFS_FLAG_LOCKDEP; } #endif /* - * sd->s_attr.ops is accesible only while holding active ref. We + * kn->s_attr.ops is accesible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. 
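The notify side of the poll protocol described above is a single call. A minimal producer sketch, with the node pointer and the update step left hypothetical:

#include <linux/kernfs.h>

static struct kernfs_node *state_kn;    /* assumed: attribute node set up elsewhere */

/*
 * Hypothetical producer: after changing whatever the attribute's
 * ->seq_show()/->read() reports, poke the node so that anyone sleeping in
 * poll(2)/select(2) on an open file backed by it wakes up and can re-read
 * the value from the start.
 */
static void example_publish_change(void)
{
        /* ... update the state the show callback reports ... */

        kernfs_notify(state_kn);        /* bumps od->event and wakes od->poll waiters */
}

The wakeup is edge-style: kernfs_file_poll() compares the event count sampled at the last read against the current one, so a consumer that never re-reads the file keeps seeing it flagged as changed.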
*/ if (ops->seq_show) - sd->s_flags |= SYSFS_FLAG_HAS_SEQ_SHOW; + kn->s_flags |= SYSFS_FLAG_HAS_SEQ_SHOW; if (ops->mmap) - sd->s_flags |= SYSFS_FLAG_HAS_MMAP; + kn->s_flags |= SYSFS_FLAG_HAS_MMAP; sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, parent); + rc = sysfs_add_one(&acxt, kn, parent); sysfs_addrm_finish(&acxt); if (rc) { - kernfs_put(sd); + kernfs_put(kn); return ERR_PTR(rc); } - return sd; + return kn; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 18ad431e8c2..9e74eed6353 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -46,36 +46,36 @@ void __init sysfs_inode_init(void) panic("failed to init sysfs_backing_dev_info"); } -static struct sysfs_inode_attrs *sysfs_inode_attrs(struct sysfs_dirent *sd) +static struct sysfs_inode_attrs *sysfs_inode_attrs(struct kernfs_node *kn) { struct iattr *iattrs; - if (sd->s_iattr) - return sd->s_iattr; + if (kn->s_iattr) + return kn->s_iattr; - sd->s_iattr = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); - if (!sd->s_iattr) + kn->s_iattr = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!kn->s_iattr) return NULL; - iattrs = &sd->s_iattr->ia_iattr; + iattrs = &kn->s_iattr->ia_iattr; /* assign default attributes */ - iattrs->ia_mode = sd->s_mode; + iattrs->ia_mode = kn->s_mode; iattrs->ia_uid = GLOBAL_ROOT_UID; iattrs->ia_gid = GLOBAL_ROOT_GID; iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; - simple_xattrs_init(&sd->s_iattr->xattrs); + simple_xattrs_init(&kn->s_iattr->xattrs); - return sd->s_iattr; + return kn->s_iattr; } -static int __kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) +static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct sysfs_inode_attrs *attrs; struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; - attrs = sysfs_inode_attrs(sd); + attrs = sysfs_inode_attrs(kn); if (!attrs) return -ENOMEM; @@ -93,24 +93,24 @@ static int __kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) iattrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; - iattrs->ia_mode = sd->s_mode = mode; + iattrs->ia_mode = kn->s_mode = mode; } return 0; } /** * kernfs_setattr - set iattr on a node - * @sd: target node + * @kn: target node * @iattr: iattr to set * * Returns 0 on success, -errno on failure. 
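For a concrete picture of the file-creation entry point above, here is a minimal attribute built directly on kernfs_ops; the names, the backing variable, and the choice of a seq_show/write pair are all invented for illustration.

#include <linux/kernfs.h>
#include <linux/seq_file.h>
#include <linux/kernel.h>

static unsigned long example_value;     /* hypothetical state behind the file */

/* ->seq_show() runs under the seq_file plumbing set up by kernfs_file_open() */
static int example_seq_show(struct seq_file *sf, void *v)
{
        seq_printf(sf, "%lu\n", example_value);
        return 0;
}

/* ->write() receives a NUL-terminated kernel copy of what userspace wrote */
static ssize_t example_write(struct sysfs_open_file *of, char *buf,
                             size_t count, loff_t pos)
{
        int ret = kstrtoul(buf, 0, &example_value);

        return ret ? ret : count;
}

static const struct kernfs_ops example_ops = {
        .seq_show       = example_seq_show,
        .write          = example_write,
};

/* Hypothetical creation under an existing directory node @parent. */
static struct kernfs_node *example_add_file(struct kernfs_node *parent)
{
        return kernfs_create_file_ns_key(parent, "value", 0644,
                                         0 /* size only seeds i_size */,
                                         &example_ops, NULL /* priv */,
                                         NULL /* ns */, NULL /* lockdep key */);
}

Because example_ops has .seq_show but no .read, kernfs_create_file_ns_key() sets SYSFS_FLAG_HAS_SEQ_SHOW and reads are routed through seq_read(), exactly as kernfs_file_read() above decides.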
*/ -int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) +int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; mutex_lock(&sysfs_mutex); - ret = __kernfs_setattr(sd, iattr); + ret = __kernfs_setattr(kn, iattr); mutex_unlock(&sysfs_mutex); return ret; } @@ -118,10 +118,10 @@ int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr) int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; - struct sysfs_dirent *sd = dentry->d_fsdata; + struct kernfs_node *kn = dentry->d_fsdata; int error; - if (!sd) + if (!kn) return -EINVAL; mutex_lock(&sysfs_mutex); @@ -129,7 +129,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) if (error) goto out; - error = __kernfs_setattr(sd, iattr); + error = __kernfs_setattr(kn, iattr); if (error) goto out; @@ -141,14 +141,14 @@ out: return error; } -static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, +static int sysfs_sd_setsecdata(struct kernfs_node *kn, void **secdata, u32 *secdata_len) { struct sysfs_inode_attrs *attrs; void *old_secdata; size_t old_secdata_len; - attrs = sysfs_inode_attrs(sd); + attrs = sysfs_inode_attrs(kn); if (!attrs) return -ENOMEM; @@ -166,13 +166,13 @@ static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct sysfs_dirent *sd = dentry->d_fsdata; + struct kernfs_node *kn = dentry->d_fsdata; struct sysfs_inode_attrs *attrs; void *secdata; int error; u32 secdata_len = 0; - attrs = sysfs_inode_attrs(sd); + attrs = sysfs_inode_attrs(kn); if (!attrs) return -ENOMEM; @@ -188,7 +188,7 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, return error; mutex_lock(&sysfs_mutex); - error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); + error = sysfs_sd_setsecdata(kn, &secdata, &secdata_len); mutex_unlock(&sysfs_mutex); if (secdata) @@ -204,10 +204,10 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, int sysfs_removexattr(struct dentry *dentry, const char *name) { - struct sysfs_dirent *sd = dentry->d_fsdata; + struct kernfs_node *kn = dentry->d_fsdata; struct sysfs_inode_attrs *attrs; - attrs = sysfs_inode_attrs(sd); + attrs = sysfs_inode_attrs(kn); if (!attrs) return -ENOMEM; @@ -217,10 +217,10 @@ int sysfs_removexattr(struct dentry *dentry, const char *name) ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size) { - struct sysfs_dirent *sd = dentry->d_fsdata; + struct kernfs_node *kn = dentry->d_fsdata; struct sysfs_inode_attrs *attrs; - attrs = sysfs_inode_attrs(sd); + attrs = sysfs_inode_attrs(kn); if (!attrs) return -ENOMEM; @@ -229,10 +229,10 @@ ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, ssize_t sysfs_listxattr(struct dentry *dentry, char *buf, size_t size) { - struct sysfs_dirent *sd = dentry->d_fsdata; + struct kernfs_node *kn = dentry->d_fsdata; struct sysfs_inode_attrs *attrs; - attrs = sysfs_inode_attrs(sd); + attrs = sysfs_inode_attrs(kn); if (!attrs) return -ENOMEM; @@ -254,57 +254,58 @@ static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) inode->i_ctime = iattr->ia_ctime; } -static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) +static void sysfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { - struct sysfs_inode_attrs *attrs = sd->s_iattr; + struct sysfs_inode_attrs *attrs = 
kn->s_iattr; - inode->i_mode = sd->s_mode; + inode->i_mode = kn->s_mode; if (attrs) { - /* sysfs_dirent has non-default attributes - * get them from persistent copy in sysfs_dirent + /* + * kernfs_node has non-default attributes get them from + * persistent copy in kernfs_node. */ set_inode_attr(inode, &attrs->ia_iattr); security_inode_notifysecctx(inode, attrs->ia_secdata, attrs->ia_secdata_len); } - if (sysfs_type(sd) == SYSFS_DIR) - set_nlink(inode, sd->s_dir.subdirs + 2); + if (sysfs_type(kn) == SYSFS_DIR) + set_nlink(inode, kn->s_dir.subdirs + 2); } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct sysfs_dirent *sd = dentry->d_fsdata; + struct kernfs_node *kn = dentry->d_fsdata; struct inode *inode = dentry->d_inode; mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); + sysfs_refresh_inode(kn, inode); mutex_unlock(&sysfs_mutex); generic_fillattr(inode, stat); return 0; } -static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) +static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) { - kernfs_get(sd); - inode->i_private = sd; + kernfs_get(kn); + inode->i_private = kn; inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; - set_default_inode_attr(inode, sd->s_mode); - sysfs_refresh_inode(sd, inode); + set_default_inode_attr(inode, kn->s_mode); + sysfs_refresh_inode(kn, inode); /* initialize inode according to type */ - switch (sysfs_type(sd)) { + switch (sysfs_type(kn)) { case SYSFS_DIR: inode->i_op = &sysfs_dir_inode_operations; inode->i_fop = &sysfs_dir_operations; break; case SYSFS_KOBJ_ATTR: - inode->i_size = sd->s_attr.size; + inode->i_size = kn->s_attr.size; inode->i_fop = &kernfs_file_operations; break; case SYSFS_KOBJ_LINK: @@ -318,13 +319,13 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) } /** - * sysfs_get_inode - get inode for sysfs_dirent + * sysfs_get_inode - get inode for kernfs_node * @sb: super block - * @sd: sysfs_dirent to allocate inode for + * @kn: kernfs_node to allocate inode for * - * Get inode for @sd. If such inode doesn't exist, a new inode - * is allocated and basics are initialized. New inode is - * returned locked. + * Get inode for @kn. If such inode doesn't exist, a new inode is + * allocated and basics are initialized. New inode is returned + * locked. * * LOCKING: * Kernel thread context (may sleep). @@ -332,44 +333,44 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) * RETURNS: * Pointer to allocated inode on success, NULL on failure. */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) +struct inode *sysfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, sd->s_ino); + inode = iget_locked(sb, kn->s_ino); if (inode && (inode->i_state & I_NEW)) - sysfs_init_inode(sd, inode); + sysfs_init_inode(kn, inode); return inode; } /* - * The sysfs_dirent serves as both an inode and a directory entry for sysfs. - * To prevent the sysfs inode numbers from being freed prematurely we take a - * reference to sysfs_dirent from the sysfs inode. A + * The kernfs_node serves as both an inode and a directory entry for sysfs. + * To prevent the sysfs inode numbers from being freed prematurely we take + * a reference to kernfs_node from the sysfs inode. 
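The inode is only one instance of that ownership rule; any code that stashes a kernfs_node pointer is expected to hold an s_count reference for as long as it may dereference it. A hypothetical lookup helper following the same get/put pattern:

#include <linux/kernfs.h>
#include <linux/kernel.h>

/*
 * Hypothetical: look up a child, use it while holding the reference that
 * kernfs_find_and_get_ns() returned, then drop it, mirroring the way
 * sysfs_init_inode() takes kernfs_get() and the eviction path puts it.
 */
static int example_peek_child(struct kernfs_node *parent, const char *name)
{
        struct kernfs_node *kn;

        kn = kernfs_find_and_get_ns(parent, name, NULL);
        if (!kn)
                return -ENOENT;

        pr_info("found %s\n", kn->s_name);      /* safe: we hold a reference */

        kernfs_put(kn);
        return 0;
}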
A * super_operations.evict_inode() implementation is needed to drop that * reference upon inode destruction. */ void sysfs_evict_inode(struct inode *inode) { - struct sysfs_dirent *sd = inode->i_private; + struct kernfs_node *kn = inode->i_private; truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); - kernfs_put(sd); + kernfs_put(kn); } int sysfs_permission(struct inode *inode, int mask) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; if (mask & MAY_NOT_BLOCK) return -ECHILD; - sd = inode->i_private; + kn = inode->i_private; mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); + sysfs_refresh_inode(kn, inode); mutex_unlock(&sysfs_mutex); return generic_permission(inode, mask); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 910e485b733..b7ea76c6fb3 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -31,24 +31,24 @@ struct sysfs_inode_attrs { /* SYSFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** - * kernfs_root - find out the kernfs_root a sysfs_dirent belongs to - * @sd: sysfs_dirent of interest + * kernfs_root - find out the kernfs_root a kernfs_node belongs to + * @kn: kernfs_node of interest * - * Return the kernfs_root @sd belongs to. + * Return the kernfs_root @kn belongs to. */ -static inline struct kernfs_root *kernfs_root(struct sysfs_dirent *sd) +static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) { /* if parent exists, it's always a dir; otherwise, @sd is a dir */ - if (sd->s_parent) - sd = sd->s_parent; - return sd->s_dir.root; + if (kn->s_parent) + kn = kn->s_parent; + return kn->s_dir.root; } /* * Context structure to be used while adding/removing nodes. */ struct sysfs_addrm_cxt { - struct sysfs_dirent *removed; + struct kernfs_node *removed; }; /* @@ -62,10 +62,10 @@ struct sysfs_super_info { struct kernfs_root *root; /* - * Each sb is associated with one namespace tag, currently the network - * namespace of the task which mounted this sysfs instance. If multiple - * tags become necessary, make the following an array and compare - * sysfs_dirent tag against every entry. + * Each sb is associated with one namespace tag, currently the + * network namespace of the task which mounted this sysfs instance. + * If multiple tags become necessary, make the following an array + * and compare kernfs_node tag against every entry. 
*/ const void *ns; }; @@ -76,7 +76,7 @@ extern struct kmem_cache *sysfs_dir_cachep; /* * inode.c */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); +struct inode *sysfs_get_inode(struct super_block *sb, struct kernfs_node *kn); void sysfs_evict_inode(struct inode *inode); int sysfs_permission(struct inode *inode, int mask); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); @@ -98,21 +98,21 @@ extern const struct dentry_operations sysfs_dentry_ops; extern const struct file_operations sysfs_dir_operations; extern const struct inode_operations sysfs_dir_inode_operations; -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); -void sysfs_put_active(struct sysfs_dirent *sd); +struct kernfs_node *sysfs_get_active(struct kernfs_node *kn); +void sysfs_put_active(struct kernfs_node *kn); void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct kernfs_node *kn, + struct kernfs_node *parent); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); -struct sysfs_dirent *sysfs_new_dirent(struct kernfs_root *root, - const char *name, umode_t mode, int type); +struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, + const char *name, umode_t mode, int type); /* * file.c */ extern const struct file_operations kernfs_file_operations; -void sysfs_unmap_bin_file(struct sysfs_dirent *sd); +void sysfs_unmap_bin_file(struct kernfs_node *kn); /* * symlink.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 84c83e24bf2..9dbbf37b1af 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -39,7 +39,7 @@ static int sysfs_fill_super(struct super_block *sb) /* get root inode, initialize and unlock it */ mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(sb, info->root->sd); + inode = sysfs_get_inode(sb, info->root->kn); mutex_unlock(&sysfs_mutex); if (!inode) { pr_debug("sysfs: could not get root inode\n"); @@ -52,8 +52,8 @@ static int sysfs_fill_super(struct super_block *sb) pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } - kernfs_get(info->root->sd); - root->d_fsdata = info->root->sd; + kernfs_get(info->root->kn); + root->d_fsdata = info->root->kn; sb->s_root = root; sb->s_d_op = &sysfs_dentry_ops; return 0; @@ -145,7 +145,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, void kernfs_kill_sb(struct super_block *sb) { struct sysfs_super_info *info = sysfs_info(sb); - struct sysfs_dirent *root_sd = sb->s_root->d_fsdata; + struct kernfs_node *root_kn = sb->s_root->d_fsdata; /* * Remove the superblock from fs_supers/s_instances @@ -153,13 +153,13 @@ void kernfs_kill_sb(struct super_block *sb) */ kill_anon_super(sb); kfree(info); - kernfs_put(root_sd); + kernfs_put(root_kn); } void __init kernfs_init(void) { sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", - sizeof(struct sysfs_dirent), + sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); sysfs_inode_init(); } diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index adf28755b0e..29dcf5e8deb 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -22,50 +22,50 @@ * * Returns the created node on success, ERR_PTR() value on error. 
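As with the other constructors, the symlink call just documented hands back either the new node or an ERR_PTR() value. A hedged usage sketch with invented names (the implementation itself follows):

#include <linux/kernfs.h>
#include <linux/err.h>

/* Hypothetical: expose @target under @parent as a symlink named "current". */
static int example_add_link(struct kernfs_node *parent,
                            struct kernfs_node *target)
{
        struct kernfs_node *link;

        link = kernfs_create_link(parent, "current", target);
        if (IS_ERR(link))
                return PTR_ERR(link);

        /* the link takes its own reference on @target; see kernfs_get() below */
        return 0;
}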
*/ -struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, - const char *name, - struct sysfs_dirent *target) +struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, + const char *name, + struct kernfs_node *target) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; struct sysfs_addrm_cxt acxt; int error; - sd = sysfs_new_dirent(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, + kn = sysfs_new_dirent(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!sd) + if (!kn) return ERR_PTR(-ENOMEM); if (kernfs_ns_enabled(parent)) - sd->s_ns = target->s_ns; - sd->s_symlink.target_sd = target; + kn->s_ns = target->s_ns; + kn->s_symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ sysfs_addrm_start(&acxt); - error = sysfs_add_one(&acxt, sd, parent); + error = sysfs_add_one(&acxt, kn, parent); sysfs_addrm_finish(&acxt); if (!error) - return sd; + return kn; - kernfs_put(sd); + kernfs_put(kn); return ERR_PTR(error); } -static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, - struct sysfs_dirent *target_sd, char *path) +static int sysfs_get_target_path(struct kernfs_node *parent, + struct kernfs_node *target, char *path) { - struct sysfs_dirent *base, *sd; + struct kernfs_node *base, *kn; char *s = path; int len = 0; /* go up to the root, stop at the base */ - base = parent_sd; + base = parent; while (base->s_parent) { - sd = target_sd->s_parent; - while (sd->s_parent && base != sd) - sd = sd->s_parent; + kn = target->s_parent; + while (kn->s_parent && base != kn) + kn = kn->s_parent; - if (base == sd) + if (base == kn) break; strcpy(s, "../"); @@ -74,10 +74,10 @@ static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, } /* determine end of target string for reverse fillup */ - sd = target_sd; - while (sd->s_parent && sd != base) { - len += strlen(sd->s_name) + 1; - sd = sd->s_parent; + kn = target; + while (kn->s_parent && kn != base) { + len += strlen(kn->s_name) + 1; + kn = kn->s_parent; } /* check limits */ @@ -88,16 +88,16 @@ static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, return -ENAMETOOLONG; /* reverse fillup of target string from target to base */ - sd = target_sd; - while (sd->s_parent && sd != base) { - int slen = strlen(sd->s_name); + kn = target; + while (kn->s_parent && kn != base) { + int slen = strlen(kn->s_name); len -= slen; - strncpy(s + len, sd->s_name, slen); + strncpy(s + len, kn->s_name, slen); if (len) s[--len] = '/'; - sd = sd->s_parent; + kn = kn->s_parent; } return 0; @@ -105,13 +105,13 @@ static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, static int sysfs_getlink(struct dentry *dentry, char *path) { - struct sysfs_dirent *sd = dentry->d_fsdata; - struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *parent = kn->s_parent; + struct kernfs_node *target = kn->s_symlink.target_kn; int error; mutex_lock(&sysfs_mutex); - error = sysfs_get_target_path(parent_sd, target_sd, path); + error = sysfs_get_target_path(parent, target, path); mutex_unlock(&sysfs_mutex); return error; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fea501889e..f1efe3df0de 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,23 +21,23 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); /** * sysfs_pathname - return full path to sysfs dirent - * @sd: sysfs_dirent whose path we want + * @kn: kernfs_node whose path we want * @path: caller allocated buffer of size PATH_MAX * * 
Gives the name "/" to the sysfs_root entry; any path returned * is relative to wherever sysfs is mounted. */ -static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) +static char *sysfs_pathname(struct kernfs_node *kn, char *path) { - if (sd->s_parent) { - sysfs_pathname(sd->s_parent, path); + if (kn->s_parent) { + sysfs_pathname(kn->s_parent, path); strlcat(path, "/", PATH_MAX); } - strlcat(path, sd->s_name, PATH_MAX); + strlcat(path, kn->s_name, PATH_MAX); return path; } -void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) +void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *path; @@ -61,26 +61,26 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { - struct sysfs_dirent *parent_sd, *sd; + struct kernfs_node *parent, *kn; BUG_ON(!kobj); if (kobj->parent) - parent_sd = kobj->parent->sd; + parent = kobj->parent->sd; else - parent_sd = sysfs_root_sd; + parent = sysfs_root_kn; - if (!parent_sd) + if (!parent) return -ENOENT; - sd = kernfs_create_dir_ns(parent_sd, kobject_name(kobj), kobj, ns); - if (IS_ERR(sd)) { - if (PTR_ERR(sd) == -EEXIST) - sysfs_warn_dup(parent_sd, kobject_name(kobj)); - return PTR_ERR(sd); + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), kobj, ns); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, kobject_name(kobj)); + return PTR_ERR(kn); } - kobj->sd = sd; + kobj->sd = kn; return 0; } @@ -94,47 +94,47 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) */ void sysfs_remove_dir(struct kobject *kobj) { - struct sysfs_dirent *sd = kobj->sd; + struct kernfs_node *kn = kobj->sd; /* * In general, kboject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no - * control over removal. @kobj->sd may be removed anytime and - * symlink code may end up dereferencing an already freed sd. + * control over removal. @kobj->sd may be removed anytime + * and symlink code may end up dereferencing an already freed node. * - * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation - * against symlink operations so that symlink code can safely - * dereference @kobj->sd. + * sysfs_symlink_target_lock synchronizes @kobj->sd + * disassociation against symlink operations so that symlink code + * can safely dereference @kobj->sd. */ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); - if (sd) { - WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); - kernfs_remove(sd); + if (kn) { + WARN_ON_ONCE(sysfs_type(kn) != SYSFS_DIR); + kernfs_remove(kn); } } int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { - struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + struct kernfs_node *parent = kobj->sd->s_parent; - return kernfs_rename_ns(kobj->sd, parent_sd, new_name, new_ns); + return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { - struct sysfs_dirent *sd = kobj->sd; - struct sysfs_dirent *new_parent_sd; + struct kernfs_node *kn = kobj->sd; + struct kernfs_node *new_parent; - BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 
- new_parent_kobj->sd : sysfs_root_sd; + BUG_ON(!kn->s_parent); + new_parent = new_parent_kobj && new_parent_kobj->sd ? + new_parent_kobj->sd : sysfs_root_kn; - return kernfs_rename_ns(sd, new_parent_sd, sd->s_name, new_ns); + return kernfs_rename_ns(kn, new_parent, kn->s_name, new_ns); } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index a67d1c682fe..be1cc39035b 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -22,15 +22,15 @@ #include "../kernfs/kernfs-internal.h" /* - * Determine ktype->sysfs_ops for the given sysfs_dirent. This function + * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. */ -static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) +static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { - struct kobject *kobj = sd->s_parent->priv; + struct kobject *kobj = kn->s_parent->priv; - if (sd->s_flags & SYSFS_FLAG_LOCKDEP) - lockdep_assert_held(sd); + if (kn->s_flags & SYSFS_FLAG_LOCKDEP) + lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } @@ -42,8 +42,8 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { struct sysfs_open_file *of = sf->private; - struct kobject *kobj = of->sd->s_parent->priv; - const struct sysfs_ops *ops = sysfs_file_ops(of->sd); + struct kobject *kobj = of->kn->s_parent->priv; + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; char *buf; @@ -59,7 +59,7 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) * if @ops->show() isn't implemented. */ if (ops->show) { - count = ops->show(kobj, of->sd->priv, buf); + count = ops->show(kobj, of->kn->priv, buf); if (count < 0) return count; } @@ -81,8 +81,8 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) static ssize_t sysfs_kf_bin_read(struct sysfs_open_file *of, char *buf, size_t count, loff_t pos) { - struct bin_attribute *battr = of->sd->priv; - struct kobject *kobj = of->sd->s_parent->priv; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->s_parent->priv; loff_t size = file_inode(of->file)->i_size; if (!count) @@ -105,21 +105,21 @@ static ssize_t sysfs_kf_bin_read(struct sysfs_open_file *of, char *buf, static ssize_t sysfs_kf_write(struct sysfs_open_file *of, char *buf, size_t count, loff_t pos) { - const struct sysfs_ops *ops = sysfs_file_ops(of->sd); - struct kobject *kobj = of->sd->s_parent->priv; + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->s_parent->priv; if (!count) return 0; - return ops->store(kobj, of->sd->priv, buf, count); + return ops->store(kobj, of->kn->priv, buf, count); } /* kernfs write callback for bin sysfs files */ static ssize_t sysfs_kf_bin_write(struct sysfs_open_file *of, char *buf, size_t count, loff_t pos) { - struct bin_attribute *battr = of->sd->priv; - struct kobject *kobj = of->sd->s_parent->priv; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->s_parent->priv; loff_t size = file_inode(of->file)->i_size; if (size) { @@ -139,30 +139,30 @@ static ssize_t sysfs_kf_bin_write(struct sysfs_open_file *of, char *buf, static int sysfs_kf_bin_mmap(struct sysfs_open_file *of, struct vm_area_struct *vma) { - struct bin_attribute *battr = of->sd->priv; - struct kobject *kobj = of->sd->s_parent->priv; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->s_parent->priv; return battr->mmap(of->file, kobj, battr, 
vma); } -void sysfs_notify(struct kobject *k, const char *dir, const char *attr) +void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { - struct sysfs_dirent *sd = k->sd, *tmp; + struct kernfs_node *kn = kobj->sd, *tmp; - if (sd && dir) - sd = kernfs_find_and_get(sd, dir); + if (kn && dir) + kn = kernfs_find_and_get(kn, dir); else - kernfs_get(sd); + kernfs_get(kn); - if (sd && attr) { - tmp = kernfs_find_and_get(sd, attr); - kernfs_put(sd); - sd = tmp; + if (kn && attr) { + tmp = kernfs_find_and_get(kn, attr); + kernfs_put(kn); + kn = tmp; } - if (sd) { - kernfs_notify(sd); - kernfs_put(sd); + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); } } EXPORT_SYMBOL_GPL(sysfs_notify); @@ -202,17 +202,17 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = { .mmap = sysfs_kf_bin_mmap, }; -int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, +int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, bool is_bin, umode_t mode, const void *ns) { struct lock_class_key *key = NULL; const struct kernfs_ops *ops; - struct sysfs_dirent *sd; + struct kernfs_node *kn; loff_t size; if (!is_bin) { - struct kobject *kobj = dir_sd->priv; + struct kobject *kobj = parent->priv; const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; /* every kobject with an attribute needs a ktype assigned */ @@ -252,20 +252,20 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif - sd = kernfs_create_file_ns_key(dir_sd, attr->name, mode, size, + kn = kernfs_create_file_ns_key(parent, attr->name, mode, size, ops, (void *)attr, ns, key); - if (IS_ERR(sd)) { - if (PTR_ERR(sd) == -EEXIST) - sysfs_warn_dup(dir_sd, attr->name); - return PTR_ERR(sd); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, attr->name); + return PTR_ERR(kn); } return 0; } -int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, +int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, bool is_bin) { - return sysfs_add_file_mode_ns(dir_sd, attr, is_bin, attr->mode, NULL); + return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL); } /** @@ -307,21 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error; if (group) { - dir_sd = kernfs_find_and_get(kobj->sd, group); + parent = kernfs_find_and_get(kobj->sd, group); } else { - dir_sd = kobj->sd; - kernfs_get(dir_sd); + parent = kobj->sd; + kernfs_get(parent); } - if (!dir_sd) + if (!parent) return -ENOENT; - error = sysfs_add_file(dir_sd, attr, false); - kernfs_put(dir_sd); + error = sysfs_add_file(parent, attr, false); + kernfs_put(parent); return error; } @@ -337,20 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; struct iattr newattrs; int rc; - sd = kernfs_find_and_get(kobj->sd, attr->name); - if (!sd) + kn = kernfs_find_and_get(kobj->sd, attr->name); + if (!kn) return -ENOENT; - newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_mode = (mode & S_IALLUGO) | (kn->s_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; - rc = kernfs_setattr(sd, &newattrs); + rc = kernfs_setattr(kn, &newattrs); - kernfs_put(sd); + kernfs_put(kn); return rc; } 
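None of this changes the kobject-level helpers that drivers actually call; the entry points above are still reached through the same wrappers. A small, hypothetical attribute that is signalled to pollers and later restricted at runtime:

#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>

static unsigned long example_count;

static ssize_t count_show(struct kobject *kobj, struct kobj_attribute *attr,
                          char *buf)
{
        return sprintf(buf, "%lu\n", example_count);
}

static struct kobj_attribute count_attr = __ATTR_RO(count);

/* Hypothetical helper: bump the counter and tell pollers about it. */
static void example_bump(struct kobject *kobj)
{
        example_count++;
        sysfs_notify(kobj, NULL, "count");      /* resolves the node, then kernfs_notify() */
}

/* Hypothetical helper: restrict the attribute to root-only reads. */
static int example_restrict(struct kobject *kobj)
{
        return sysfs_chmod_file(kobj, &count_attr.attr, 0400);
}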
EXPORT_SYMBOL_GPL(sysfs_chmod_file); @@ -366,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { - struct sysfs_dirent *dir_sd = kobj->sd; + struct kernfs_node *parent = kobj->sd; - kernfs_remove_by_name_ns(dir_sd, attr->name, ns); + kernfs_remove_by_name_ns(parent, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); @@ -389,18 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; if (group) { - dir_sd = kernfs_find_and_get(kobj->sd, group); + parent = kernfs_find_and_get(kobj->sd, group); } else { - dir_sd = kobj->sd; - kernfs_get(dir_sd); + parent = kobj->sd; + kernfs_get(parent); } - if (dir_sd) { - kernfs_remove_by_name(dir_sd, attr->name); - kernfs_put(dir_sd); + if (parent) { + kernfs_remove_by_name(parent, attr->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 7177532b8f7..4d00d399647 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -18,7 +18,7 @@ #include "sysfs.h" -static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static void remove_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp) { struct attribute *const *attr; @@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (grp->attrs) for (attr = grp->attrs; *attr; attr++) - kernfs_remove_by_name(dir_sd, (*attr)->name); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) sysfs_remove_bin_file(kobj, *bin_attr); } -static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static int create_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp, int update) { struct attribute *const *attr; @@ -49,20 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * re-adding (if required) the file. 
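The group helpers below all walk a table of attributes terminated by a NULL entry. A hedged sketch of the static data they are usually fed, with invented attribute names, created and torn down through the long-standing sysfs_create_group()/sysfs_remove_group() wrappers around the code that follows:

#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>

/* Hypothetical attributes grouped under a "config" subdirectory. */
static ssize_t alpha_show(struct kobject *kobj, struct kobj_attribute *attr,
                          char *buf)
{
        return sprintf(buf, "alpha\n");
}

static ssize_t beta_show(struct kobject *kobj, struct kobj_attribute *attr,
                         char *buf)
{
        return sprintf(buf, "beta\n");
}

static struct kobj_attribute alpha_attr = __ATTR_RO(alpha);
static struct kobj_attribute beta_attr = __ATTR_RO(beta);

static struct attribute *example_attrs[] = {
        &alpha_attr.attr,
        &beta_attr.attr,
        NULL,                   /* create_files()/remove_files() stop on NULL */
};

static const struct attribute_group example_group = {
        .name   = "config",     /* optional; a named group gets its own directory */
        .attrs  = example_attrs,
};

sysfs_create_group(kobj, &example_group) then boils down to internal_create_group() below: create the "config" directory via kernfs_create_dir(), add each attribute with sysfs_add_file_mode_ns(), and unwind with remove_files() on error.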
*/ if (update) - kernfs_remove_by_name(dir_sd, (*attr)->name); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) continue; } - error = sysfs_add_file_mode_ns(dir_sd, *attr, false, + error = sysfs_add_file_mode_ns(parent, *attr, false, (*attr)->mode | mode, NULL); if (unlikely(error)) break; } if (error) { - remove_files(dir_sd, kobj, grp); + remove_files(parent, kobj, grp); goto exit; } } @@ -76,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, break; } if (error) - remove_files(dir_sd, kobj, grp); + remove_files(parent, kobj, grp); } exit: return error; @@ -86,7 +86,7 @@ exit: static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; int error; BUG_ON(!kobj || (!update && !kobj->sd)); @@ -100,21 +100,21 @@ static int internal_create_group(struct kobject *kobj, int update, return -EINVAL; } if (grp->name) { - sd = kernfs_create_dir(kobj->sd, grp->name, kobj); - if (IS_ERR(sd)) { - if (PTR_ERR(sd) == -EEXIST) + kn = kernfs_create_dir(kobj->sd, grp->name, kobj); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); - return PTR_ERR(sd); + return PTR_ERR(kn); } } else - sd = kobj->sd; - kernfs_get(sd); - error = create_files(sd, kobj, grp, update); + kn = kobj->sd; + kernfs_get(kn); + error = create_files(kn, kobj, grp, update); if (error) { if (grp->name) - kernfs_remove(sd); + kernfs_remove(kn); } - kernfs_put(sd); + kernfs_put(kn); return error; } @@ -204,27 +204,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd = kobj->sd; - struct sysfs_dirent *sd; + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; if (grp->name) { - sd = kernfs_find_and_get(dir_sd, grp->name); - if (!sd) { - WARN(!sd, KERN_WARNING + kn = kernfs_find_and_get(parent, grp->name); + if (!kn) { + WARN(!kn, KERN_WARNING "sysfs group %p not found for kobject '%s'\n", grp, kobject_name(kobj)); return; } } else { - sd = dir_sd; - kernfs_get(sd); + kn = parent; + kernfs_get(kn); } - remove_files(sd, kobj, grp); + remove_files(kn, kobj, grp); if (grp->name) - kernfs_remove(sd); + kernfs_remove(kn); - kernfs_put(sd); + kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); @@ -260,22 +260,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; struct attribute *const *attr; int i; - dir_sd = kernfs_find_and_get(kobj->sd, grp->name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (!parent) return -ENOENT; for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) - error = sysfs_add_file(dir_sd, *attr, false); + error = sysfs_add_file(parent, *attr, false); if (error) { while (--i >= 0) - kernfs_remove_by_name(dir_sd, (*--attr)->name); + kernfs_remove_by_name(parent, (*--attr)->name); } - kernfs_put(dir_sd); + kernfs_put(parent); return error; } @@ -289,14 +289,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; struct attribute *const *attr; - dir_sd = kernfs_find_and_get(kobj->sd, grp->name); - if (dir_sd) { + parent = kernfs_find_and_get(kobj->sd, grp->name); + 
if (parent) { for (attr = grp->attrs; *attr; ++attr) - kernfs_remove_by_name(dir_sd, (*attr)->name); - kernfs_put(dir_sd); + kernfs_remove_by_name(parent, (*attr)->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); @@ -311,15 +311,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; - dir_sd = kernfs_find_and_get(kobj->sd, group_name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, group_name); + if (!parent) return -ENOENT; - error = sysfs_create_link_sd(dir_sd, target, link_name); - kernfs_put(dir_sd); + error = sysfs_create_link_sd(parent, target, link_name); + kernfs_put(parent); return error; } @@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - dir_sd = kernfs_find_and_get(kobj->sd, group_name); - if (dir_sd) { - kernfs_remove_by_name(dir_sd, link_name); - kernfs_put(dir_sd); + parent = kernfs_find_and_get(kobj->sd, group_name); + if (parent) { + kernfs_remove_by_name(parent, link_name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d075272cac..701a56f341c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -20,7 +20,7 @@ #include "sysfs.h" static struct kernfs_root *sysfs_root; -struct sysfs_dirent *sysfs_root_sd; +struct kernfs_node *sysfs_root_kn; static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -66,7 +66,7 @@ int __init sysfs_init(void) if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); - sysfs_root_sd = sysfs_root->sd; + sysfs_root_kn = sysfs_root->kn; err = register_filesystem(&sysfs_fs_type); if (err) { diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 1b8c9ed8511..4ed3d49ad27 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -18,66 +18,66 @@ #include "sysfs.h" -static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, - struct kobject *target, +static int sysfs_do_create_link_sd(struct kernfs_node *parent, + struct kobject *target_kobj, const char *name, int warn) { - struct sysfs_dirent *sd, *target_sd = NULL; + struct kernfs_node *kn, *target = NULL; - BUG_ON(!name || !parent_sd); + BUG_ON(!name || !parent); /* - * We don't own @target and it may be removed at any time. + * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (target->sd) { - target_sd = target->sd; - kernfs_get(target_sd); + if (target_kobj->sd) { + target = target_kobj->sd; + kernfs_get(target); } spin_unlock(&sysfs_symlink_target_lock); - if (!target_sd) + if (!target) return -ENOENT; - sd = kernfs_create_link(parent_sd, name, target_sd); - kernfs_put(target_sd); + kn = kernfs_create_link(parent, name, target); + kernfs_put(target); - if (!IS_ERR(sd)) + if (!IS_ERR(kn)) return 0; - if (warn && PTR_ERR(sd) == -EEXIST) - sysfs_warn_dup(parent_sd, name); - return PTR_ERR(sd); + if (warn && PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. - * @sd: directory we're creating the link in. 
+ * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { - return sysfs_do_create_link_sd(sd, target, name, 1); + return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = sysfs_root_sd; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - if (!parent_sd) + if (!parent) return -EFAULT; - return sysfs_do_create_link_sd(parent_sd, target, name, warn); + return sysfs_do_create_link_sd(parent, target, name, warn); } /** @@ -141,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, */ void sysfs_remove_link(struct kobject *kobj, const char *name) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = sysfs_root_sd; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - kernfs_remove_by_name(parent_sd, name); + kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); @@ -165,33 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const void *new_ns) { - struct sysfs_dirent *parent_sd, *sd = NULL; + struct kernfs_node *parent, *kn = NULL; const void *old_ns = NULL; int result; if (!kobj) - parent_sd = sysfs_root_sd; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; if (targ->sd) old_ns = targ->sd->s_ns; result = -ENOENT; - sd = kernfs_find_and_get_ns(parent_sd, old, old_ns); - if (!sd) + kn = kernfs_find_and_get_ns(parent, old, old_ns); + if (!kn) goto out; result = -EINVAL; - if (sysfs_type(sd) != SYSFS_KOBJ_LINK) + if (sysfs_type(kn) != SYSFS_KOBJ_LINK) goto out; - if (sd->s_symlink.target_sd->priv != targ) + if (kn->s_symlink.target_kn->priv != targ) goto out; - result = kernfs_rename_ns(sd, parent_sd, new, new_ns); + result = kernfs_rename_ns(kn, parent, new, new_ns); out: - kernfs_put(sd); + kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index c8e395b4933..0e2f1cccb81 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -16,28 +16,28 @@ /* * mount.c */ -extern struct sysfs_dirent *sysfs_root_sd; +extern struct kernfs_node *sysfs_root_kn; /* * dir.c */ extern spinlock_t sysfs_symlink_target_lock; -void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name); +void sysfs_warn_dup(struct kernfs_node *parent, const char *name); /* * file.c */ -int sysfs_add_file(struct sysfs_dirent *dir_sd, +int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, bool is_bin); -int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, +int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, bool is_bin, umode_t amode, const void *ns); /* * symlink.c */ -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name); #endif /* __SYSFS_INTERNAL_H */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d6554130841..195d1c6a8b0 100644 --- a/include/linux/kernfs.h +++ 
b/include/linux/kernfs.h @@ -46,61 +46,61 @@ enum kernfs_node_flag { SYSFS_FLAG_LOCKDEP = 0x0100, }; -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { +/* type-specific structures for kernfs_node union members */ +struct kernfs_elem_dir { unsigned long subdirs; - /* children rbtree starts here and goes through sd->s_rb */ + /* children rbtree starts here and goes through kn->s_rb */ struct rb_root children; /* * The kernfs hierarchy this directory belongs to. This fits - * better directly in sysfs_dirent but is here to save space. + * better directly in kernfs_node but is here to save space. */ struct kernfs_root *root; }; -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; +struct kernfs_elem_symlink { + struct kernfs_node *target_kn; }; -struct sysfs_elem_attr { +struct kernfs_elem_attr { const struct kernfs_ops *ops; struct sysfs_open_dirent *open; loff_t size; }; /* - * sysfs_dirent - the building block of sysfs hierarchy. Each and every - * sysfs node is represented by single sysfs_dirent. Most fields are + * kernfs_node - the building block of kernfs hierarchy. Each and every + * kernfs node is represented by single kernfs_node. Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. + * As long as s_count reference is held, the kernfs_node itself is + * accessible. Dereferencing elem or any other outer entity requires + * active reference. */ -struct sysfs_dirent { +struct kernfs_node { atomic_t s_count; atomic_t s_active; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* the following two fields are published */ - struct sysfs_dirent *s_parent; + struct kernfs_node *s_parent; const char *s_name; struct rb_node s_rb; union { struct completion *completion; - struct sysfs_dirent *removed_list; + struct kernfs_node *removed_list; } u; const void *s_ns; /* namespace tag */ unsigned int s_hash; /* ns + name hash */ union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; + struct kernfs_elem_dir s_dir; + struct kernfs_elem_symlink s_symlink; + struct kernfs_elem_attr s_attr; }; void *priv; @@ -113,7 +113,7 @@ struct sysfs_dirent { struct kernfs_root { /* published fields */ - struct sysfs_dirent *sd; + struct kernfs_node *kn; /* private fields, do not use outside kernfs proper */ struct ida ino_ida; @@ -121,7 +121,7 @@ struct kernfs_root { struct sysfs_open_file { /* published fields */ - struct sysfs_dirent *sd; + struct kernfs_node *kn; struct file *file; /* private fields, do not use outside kernfs proper */ @@ -170,64 +170,64 @@ struct kernfs_ops { #ifdef CONFIG_SYSFS -static inline enum kernfs_node_type sysfs_type(struct sysfs_dirent *sd) +static inline enum kernfs_node_type sysfs_type(struct kernfs_node *kn) { - return sd->s_flags & SYSFS_TYPE_MASK; + return kn->s_flags & SYSFS_TYPE_MASK; } /** * kernfs_enable_ns - enable namespace under a directory - * @sd: directory of interest, should be empty + * @kn: directory of interest, should be empty * - * This is to be called right after @sd is created to enable namespace - * under it. All children of @sd must have non-NULL namespace tags and + * This is to be called right after @kn is created to enable namespace + * under it. 
All children of @kn must have non-NULL namespace tags and * only the ones which match the super_block's tag will be visible. */ -static inline void kernfs_enable_ns(struct sysfs_dirent *sd) +static inline void kernfs_enable_ns(struct kernfs_node *kn) { - WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); - WARN_ON_ONCE(!RB_EMPTY_ROOT(&sd->s_dir.children)); - sd->s_flags |= SYSFS_FLAG_NS; + WARN_ON_ONCE(sysfs_type(kn) != SYSFS_DIR); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->s_dir.children)); + kn->s_flags |= SYSFS_FLAG_NS; } /** * kernfs_ns_enabled - test whether namespace is enabled - * @sd: the node to test + * @kn: the node to test * * Test whether namespace filtering is enabled for the children of @ns. */ -static inline bool kernfs_ns_enabled(struct sysfs_dirent *sd) +static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { - return sd->s_flags & SYSFS_FLAG_NS; + return kn->s_flags & SYSFS_FLAG_NS; } -struct sysfs_dirent *kernfs_find_and_get_ns(struct sysfs_dirent *parent, - const char *name, const void *ns); -void kernfs_get(struct sysfs_dirent *sd); -void kernfs_put(struct sysfs_dirent *sd); +struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, + const char *name, const void *ns); +void kernfs_get(struct kernfs_node *kn); +void kernfs_put(struct kernfs_node *kn); struct kernfs_root *kernfs_create_root(void *priv); void kernfs_destroy_root(struct kernfs_root *root); -struct sysfs_dirent *kernfs_create_dir_ns(struct sysfs_dirent *parent, - const char *name, void *priv, - const void *ns); -struct sysfs_dirent *kernfs_create_file_ns_key(struct sysfs_dirent *parent, - const char *name, - umode_t mode, loff_t size, - const struct kernfs_ops *ops, - void *priv, const void *ns, - struct lock_class_key *key); -struct sysfs_dirent *kernfs_create_link(struct sysfs_dirent *parent, - const char *name, - struct sysfs_dirent *target); -void kernfs_remove(struct sysfs_dirent *sd); -int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, const char *name, +struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, + const char *name, void *priv, + const void *ns); +struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + struct lock_class_key *key); +struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, + const char *name, + struct kernfs_node *target); +void kernfs_remove(struct kernfs_node *kn); +int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); -int kernfs_rename_ns(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent, +int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); -int kernfs_setattr(struct sysfs_dirent *sd, const struct iattr *iattr); -void kernfs_notify(struct sysfs_dirent *sd); +int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); +void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, @@ -238,60 +238,60 @@ void kernfs_init(void); #else /* CONFIG_SYSFS */ -static inline enum kernfs_node_type sysfs_type(struct sysfs_dirent *sd) +static inline enum kernfs_node_type sysfs_type(struct kernfs_node *kn) { return 0; } /* whatever */ -static inline void kernfs_enable_ns(struct sysfs_dirent *sd) { } +static inline void kernfs_enable_ns(struct kernfs_node *kn) { } -static inline bool 
kernfs_ns_enabled(struct sysfs_dirent *sd) +static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return false; } -static inline struct sysfs_dirent * -kernfs_find_and_get_ns(struct sysfs_dirent *parent, const char *name, +static inline struct kernfs_node * +kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } -static inline void kernfs_get(struct sysfs_dirent *sd) { } -static inline void kernfs_put(struct sysfs_dirent *sd) { } +static inline void kernfs_get(struct kernfs_node *kn) { } +static inline void kernfs_put(struct kernfs_node *kn) { } static inline struct kernfs_root *kernfs_create_root(void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } -static inline struct sysfs_dirent * -kernfs_create_dir_ns(struct sysfs_dirent *parent, const char *name, void *priv, +static inline struct kernfs_node * +kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } -static inline struct sysfs_dirent * -kernfs_create_file_ns_key(struct sysfs_dirent *parent, const char *name, +static inline struct kernfs_node * +kernfs_create_file_ns_key(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } -static inline struct sysfs_dirent * -kernfs_create_link(struct sysfs_dirent *parent, const char *name, - struct sysfs_dirent *target) +static inline struct kernfs_node * +kernfs_create_link(struct kernfs_node *parent, const char *name, + struct kernfs_node *target) { return ERR_PTR(-ENOSYS); } -static inline void kernfs_remove(struct sysfs_dirent *sd) { } +static inline void kernfs_remove(struct kernfs_node *kn) { } -static inline int kernfs_remove_by_name_ns(struct sysfs_dirent *parent, +static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } -static inline int kernfs_rename_ns(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent, +static inline int kernfs_rename_ns(struct kernfs_node *kn, + struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { return -ENOSYS; } -static inline int kernfs_setattr(struct sysfs_dirent *sd, +static inline int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { return -ENOSYS; } -static inline void kernfs_notify(struct sysfs_dirent *sd) { } +static inline void kernfs_notify(struct kernfs_node *kn) { } static inline const void *kernfs_super_ns(struct super_block *sb) { return NULL; } @@ -307,20 +307,20 @@ static inline void kernfs_init(void) { } #endif /* CONFIG_SYSFS */ -static inline struct sysfs_dirent * -kernfs_find_and_get(struct sysfs_dirent *sd, const char *name) +static inline struct kernfs_node * +kernfs_find_and_get(struct kernfs_node *kn, const char *name) { - return kernfs_find_and_get_ns(sd, name, NULL); + return kernfs_find_and_get_ns(kn, name, NULL); } -static inline struct sysfs_dirent * -kernfs_create_dir(struct sysfs_dirent *parent, const char *name, void *priv) +static inline struct kernfs_node * +kernfs_create_dir(struct kernfs_node *parent, const char *name, void *priv) { return kernfs_create_dir_ns(parent, name, priv, NULL); } -static inline struct sysfs_dirent * -kernfs_create_file_ns(struct sysfs_dirent *parent, const char *name, +static inline struct kernfs_node * +kernfs_create_file_ns(struct kernfs_node *parent, const char *name, umode_t 
mode, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns) { @@ -333,14 +333,14 @@ kernfs_create_file_ns(struct sysfs_dirent *parent, const char *name, ns, key); } -static inline struct sysfs_dirent * -kernfs_create_file(struct sysfs_dirent *parent, const char *name, umode_t mode, +static inline struct kernfs_node * +kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv) { return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL); } -static inline int kernfs_remove_by_name(struct sysfs_dirent *parent, +static inline int kernfs_remove_by_name(struct kernfs_node *parent, const char *name) { return kernfs_remove_by_name_ns(parent, name, NULL); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e7ba650086c..926afb6f6b5 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -64,7 +64,7 @@ struct kobject { struct kobject *parent; struct kset *kset; struct kobj_type *ktype; - struct sysfs_dirent *sd; + struct kernfs_node *sd; struct kref kref; #ifdef CONFIG_DEBUG_KOBJECT_RELEASE struct delayed_work release; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index cd8f90bf51a..30b2ebee643 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -438,26 +438,26 @@ static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } -static inline void sysfs_notify_dirent(struct sysfs_dirent *sd) +static inline void sysfs_notify_dirent(struct kernfs_node *kn) { - kernfs_notify(sd); + kernfs_notify(kn); } -static inline struct sysfs_dirent * -sysfs_get_dirent(struct sysfs_dirent *parent_sd, const unsigned char *name) +static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, + const unsigned char *name) { - return kernfs_find_and_get(parent_sd, name); + return kernfs_find_and_get(parent, name); } -static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { - kernfs_get(sd); - return sd; + kernfs_get(kn); + return kn; } -static inline void sysfs_put(struct sysfs_dirent *sd) +static inline void sysfs_put(struct kernfs_node *kn) { - kernfs_put(sd); + kernfs_put(kn); } #endif /* _SYSFS_H_ */ diff --git a/lib/kobject.c b/lib/kobject.c index 94b321f4ac6..064451f2a6c 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -556,7 +556,7 @@ out: */ void kobject_del(struct kobject *kobj) { - struct sysfs_dirent *sd; + struct kernfs_node *sd; if (!kobj) return; -- cgit v1.2.3-70-g09d2 From adc5e8b58f4886d45f79f4ff41a09001a76a6b12 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 14:11:54 -0500 Subject: kernfs: drop s_ prefix from kernfs_node members kernfs has just been separated out from sysfs and we're already in full conflict mode. Nothing can make the situation any worse. Let's take the chance to name things properly. s_ prefix for kernfs members is used inconsistently and a misnomer now. It's not like kernfs_node is used widely across the kernel making the ability to grep for the members particularly useful. Let's just drop the prefix. This patch is strictly rename only and doesn't introduce any functional difference. 
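(Illustrative aside, not part of the commit: a minimal sketch of how a field access reads once the s_ prefix is dropped. The helper below is hypothetical; the members it touches -- name, parent and flags -- are the renamed fields shown in the hunks that follow.)

/* Hypothetical helper, for illustration only -- not in the kernel tree. */
static void kernfs_dump_node(struct kernfs_node *kn)
{
	/* before this patch: kn->s_name, kn->s_parent, kn->s_flags */
	pr_info("kernfs: %s (parent %s, flags 0x%x)\n",
		kn->name,
		kn->parent ? kn->parent->name : "<root>",
		kn->flags);
}
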
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 198 ++++++++++++++++++++++---------------------- fs/kernfs/file.c | 52 ++++++------ fs/kernfs/inode.c | 30 +++---- fs/kernfs/kernfs-internal.h | 6 +- fs/kernfs/symlink.c | 32 +++---- fs/sysfs/dir.c | 12 +-- fs/sysfs/file.c | 16 ++-- fs/sysfs/symlink.c | 6 +- include/linux/kernfs.h | 38 ++++----- 9 files changed, 193 insertions(+), 197 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 800ebf52147..51fff9d2b33 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -19,7 +19,7 @@ DEFINE_MUTEX(sysfs_mutex); -#define rb_to_kn(X) rb_entry((X), struct kernfs_node, s_rb) +#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) /** * sysfs_name_hash @@ -47,18 +47,17 @@ static unsigned int sysfs_name_hash(const char *name, const void *ns) static int sysfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { - if (hash != kn->s_hash) - return hash - kn->s_hash; - if (ns != kn->s_ns) - return ns - kn->s_ns; - return strcmp(name, kn->s_name); + if (hash != kn->hash) + return hash - kn->hash; + if (ns != kn->ns) + return ns - kn->ns; + return strcmp(name, kn->name); } static int sysfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { - return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns, - right); + return sysfs_name_compare(left->hash, left->name, left->ns, right); } /** @@ -66,7 +65,7 @@ static int sysfs_sd_compare(const struct kernfs_node *left, * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from - * @kn->s_parent->s_dir.children. + * @kn->parent->dir.children. * * Locking: * mutex_lock(sysfs_mutex) @@ -76,11 +75,11 @@ static int sysfs_sd_compare(const struct kernfs_node *left, */ static int sysfs_link_sibling(struct kernfs_node *kn) { - struct rb_node **node = &kn->s_parent->s_dir.children.rb_node; + struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; if (sysfs_type(kn) == SYSFS_DIR) - kn->s_parent->s_dir.subdirs++; + kn->parent->dir.subdirs++; while (*node) { struct kernfs_node *pos; @@ -90,15 +89,15 @@ static int sysfs_link_sibling(struct kernfs_node *kn) parent = *node; result = sysfs_sd_compare(kn, pos); if (result < 0) - node = &pos->s_rb.rb_left; + node = &pos->rb.rb_left; else if (result > 0) - node = &pos->s_rb.rb_right; + node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ - rb_link_node(&kn->s_rb, parent, node); - rb_insert_color(&kn->s_rb, &kn->s_parent->s_dir.children); + rb_link_node(&kn->rb, parent, node); + rb_insert_color(&kn->rb, &kn->parent->dir.children); return 0; } @@ -107,7 +106,7 @@ static int sysfs_link_sibling(struct kernfs_node *kn) * @kn: kernfs_node of interest * * Unlink @kn from its sibling rbtree which starts from - * kn->s_parent->s_dir.children. + * kn->parent->dir.children. 
* * Locking: * mutex_lock(sysfs_mutex) @@ -115,9 +114,9 @@ static int sysfs_link_sibling(struct kernfs_node *kn) static void sysfs_unlink_sibling(struct kernfs_node *kn) { if (sysfs_type(kn) == SYSFS_DIR) - kn->s_parent->s_dir.subdirs--; + kn->parent->dir.subdirs--; - rb_erase(&kn->s_rb, &kn->s_parent->s_dir.children); + rb_erase(&kn->rb, &kn->parent->dir.children); } /** @@ -135,10 +134,10 @@ struct kernfs_node *sysfs_get_active(struct kernfs_node *kn) if (unlikely(!kn)) return NULL; - if (!atomic_inc_unless_negative(&kn->s_active)) + if (!atomic_inc_unless_negative(&kn->active)) return NULL; - if (kn->s_flags & SYSFS_FLAG_LOCKDEP) + if (kn->flags & SYSFS_FLAG_LOCKDEP) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } @@ -157,9 +156,9 @@ void sysfs_put_active(struct kernfs_node *kn) if (unlikely(!kn)) return; - if (kn->s_flags & SYSFS_FLAG_LOCKDEP) + if (kn->flags & SYSFS_FLAG_LOCKDEP) rwsem_release(&kn->dep_map, 1, _RET_IP_); - v = atomic_dec_return(&kn->s_active); + v = atomic_dec_return(&kn->active); if (likely(v != SD_DEACTIVATED_BIAS)) return; @@ -181,7 +180,7 @@ static void sysfs_deactivate(struct kernfs_node *kn) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(!(kn->s_flags & SYSFS_FLAG_REMOVED)); + BUG_ON(!(kn->flags & SYSFS_FLAG_REMOVED)); if (!(sysfs_type(kn) & SYSFS_ACTIVE_REF)) return; @@ -192,7 +191,7 @@ static void sysfs_deactivate(struct kernfs_node *kn) /* atomic_add_return() is a mb(), put_active() will always see * the updated kn->u.completion. */ - v = atomic_add_return(SD_DEACTIVATED_BIAS, &kn->s_active); + v = atomic_add_return(SD_DEACTIVATED_BIAS, &kn->active); if (v != SD_DEACTIVATED_BIAS) { lock_contended(&kn->dep_map, _RET_IP_); @@ -210,8 +209,8 @@ static void sysfs_deactivate(struct kernfs_node *kn) void kernfs_get(struct kernfs_node *kn) { if (kn) { - WARN_ON(!atomic_read(&kn->s_count)); - atomic_inc(&kn->s_count); + WARN_ON(!atomic_read(&kn->count)); + atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); @@ -227,36 +226,36 @@ void kernfs_put(struct kernfs_node *kn) struct kernfs_node *parent; struct kernfs_root *root; - if (!kn || !atomic_dec_and_test(&kn->s_count)) + if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* Moving/renaming is always done while holding reference. - * kn->s_parent won't change beneath us. + * kn->parent won't change beneath us. */ - parent = kn->s_parent; + parent = kn->parent; - WARN(!(kn->s_flags & SYSFS_FLAG_REMOVED), + WARN(!(kn->flags & SYSFS_FLAG_REMOVED), "sysfs: free using entry: %s/%s\n", - parent ? parent->s_name : "", kn->s_name); + parent ? 
parent->name : "", kn->name); if (sysfs_type(kn) == SYSFS_KOBJ_LINK) - kernfs_put(kn->s_symlink.target_kn); + kernfs_put(kn->symlink.target_kn); if (sysfs_type(kn) & SYSFS_COPY_NAME) - kfree(kn->s_name); - if (kn->s_iattr) { - if (kn->s_iattr->ia_secdata) - security_release_secctx(kn->s_iattr->ia_secdata, - kn->s_iattr->ia_secdata_len); - simple_xattrs_free(&kn->s_iattr->xattrs); + kfree(kn->name); + if (kn->iattr) { + if (kn->iattr->ia_secdata) + security_release_secctx(kn->iattr->ia_secdata, + kn->iattr->ia_secdata_len); + simple_xattrs_free(&kn->iattr->xattrs); } - kfree(kn->s_iattr); - ida_simple_remove(&root->ino_ida, kn->s_ino); + kfree(kn->iattr); + ida_simple_remove(&root->ino_ida, kn->ino); kmem_cache_free(sysfs_dir_cachep, kn); kn = parent; if (kn) { - if (atomic_dec_and_test(&kn->s_count)) + if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ @@ -269,7 +268,7 @@ EXPORT_SYMBOL_GPL(kernfs_put); static int sysfs_dentry_delete(const struct dentry *dentry) { struct kernfs_node *kn = dentry->d_fsdata; - return !(kn && !(kn->s_flags & SYSFS_FLAG_REMOVED)); + return !(kn && !(kn->flags & SYSFS_FLAG_REMOVED)); } static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) @@ -283,20 +282,20 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) mutex_lock(&sysfs_mutex); /* The sysfs dirent has been deleted */ - if (kn->s_flags & SYSFS_FLAG_REMOVED) + if (kn->flags & SYSFS_FLAG_REMOVED) goto out_bad; /* The sysfs dirent has been moved? */ - if (dentry->d_parent->d_fsdata != kn->s_parent) + if (dentry->d_parent->d_fsdata != kn->parent) goto out_bad; /* The sysfs dirent has been renamed */ - if (strcmp(dentry->d_name.name, kn->s_name) != 0) + if (strcmp(dentry->d_name.name, kn->name) != 0) goto out_bad; /* The sysfs dirent has been moved to a different namespace */ - if (kn->s_parent && kernfs_ns_enabled(kn->s_parent) && - sysfs_info(dentry->d_sb)->ns != kn->s_ns) + if (kn->parent && kernfs_ns_enabled(kn->parent) && + sysfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; mutex_unlock(&sysfs_mutex); @@ -356,14 +355,14 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); if (ret < 0) goto err_out2; - kn->s_ino = ret; + kn->ino = ret; - atomic_set(&kn->s_count, 1); - atomic_set(&kn->s_active, 0); + atomic_set(&kn->count, 1); + atomic_set(&kn->active, 0); - kn->s_name = name; - kn->s_mode = mode; - kn->s_flags = type | SYSFS_FLAG_REMOVED; + kn->name = name; + kn->mode = mode; + kn->flags = type | SYSFS_FLAG_REMOVED; return kn; @@ -400,9 +399,9 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) * @kn: kernfs_node to be added * @parent: the parent kernfs_node to add @kn to * - * Get @parent and set @kn->s_parent to it and increment nlink of - * the parent inode if @kn is a directory and link into the children - * list of the parent. + * Get @parent and set @kn->parent to it and increment nlink of the + * parent inode if @kn is a directory and link into the children list + * of the parent. * * This function should be called between calls to * sysfs_addrm_start() and sysfs_addrm_finish() and should be @@ -422,18 +421,17 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct kernfs_node *kn, struct sysfs_inode_attrs *ps_iattr; int ret; - if (has_ns != (bool)kn->s_ns) { + if (has_ns != (bool)kn->ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - has_ns ? 
"required" : "invalid", - parent->s_name, kn->s_name); + has_ns ? "required" : "invalid", parent->name, kn->name); return -EINVAL; } if (sysfs_type(parent) != SYSFS_DIR) return -EINVAL; - kn->s_hash = sysfs_name_hash(kn->s_name, kn->s_ns); - kn->s_parent = parent; + kn->hash = sysfs_name_hash(kn->name, kn->ns); + kn->parent = parent; kernfs_get(parent); ret = sysfs_link_sibling(kn); @@ -441,14 +439,14 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct kernfs_node *kn, return ret; /* Update timestamps on the parent */ - ps_iattr = parent->s_iattr; + ps_iattr = parent->iattr; if (ps_iattr) { struct iattr *ps_iattrs = &ps_iattr->ia_iattr; ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; } /* Mark the entry added into directory tree */ - kn->s_flags &= ~SYSFS_FLAG_REMOVED; + kn->flags &= ~SYSFS_FLAG_REMOVED; return 0; } @@ -477,21 +475,21 @@ static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, * Removal can be called multiple times on the same node. Only the * first invocation is effective and puts the base ref. */ - if (kn->s_flags & SYSFS_FLAG_REMOVED) + if (kn->flags & SYSFS_FLAG_REMOVED) return; - if (kn->s_parent) { + if (kn->parent) { sysfs_unlink_sibling(kn); /* Update timestamps on the parent */ - ps_iattr = kn->s_parent->s_iattr; + ps_iattr = kn->parent->iattr; if (ps_iattr) { ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; } } - kn->s_flags |= SYSFS_FLAG_REMOVED; + kn->flags |= SYSFS_FLAG_REMOVED; kn->u.removed_list = acxt->removed; acxt->removed = kn; } @@ -538,7 +536,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const void *ns) { - struct rb_node *node = parent->s_dir.children.rb_node; + struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; @@ -546,8 +544,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", - parent->s_name, name); + has_ns ? 
"required" : "invalid", parent->name, name); return NULL; } @@ -617,9 +614,9 @@ struct kernfs_root *kernfs_create_root(void *priv) return ERR_PTR(-ENOMEM); } - kn->s_flags &= ~SYSFS_FLAG_REMOVED; + kn->flags &= ~SYSFS_FLAG_REMOVED; kn->priv = priv; - kn->s_dir.root = root; + kn->dir.root = root; root->kn = kn; @@ -661,8 +658,8 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, if (!kn) return ERR_PTR(-ENOMEM); - kn->s_dir.root = parent->s_dir.root; - kn->s_ns = ns; + kn->dir.root = parent->dir.root; + kn->ns = ns; kn->priv = priv; /* link in */ @@ -738,7 +735,7 @@ static struct kernfs_node *sysfs_leftmost_descendant(struct kernfs_node *pos) if (sysfs_type(pos) != SYSFS_DIR) break; - rbn = rb_first(&pos->s_dir.children); + rbn = rb_first(&pos->dir.children); if (!rbn) break; @@ -773,12 +770,12 @@ static struct kernfs_node *sysfs_next_descendant_post(struct kernfs_node *pos, return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - rbn = rb_next(&pos->s_rb); + rbn = rb_next(&pos->rb); if (rbn) return sysfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ - return pos->s_parent; + return pos->parent; } static void __kernfs_remove(struct sysfs_addrm_cxt *acxt, @@ -789,7 +786,7 @@ static void __kernfs_remove(struct sysfs_addrm_cxt *acxt, if (!kn) return; - pr_debug("sysfs %s: removing\n", kn->s_name); + pr_debug("sysfs %s: removing\n", kn->name); next = NULL; do { @@ -865,8 +862,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&sysfs_mutex); error = 0; - if ((kn->s_parent == new_parent) && (kn->s_ns == new_ns) && - (strcmp(kn->s_name, new_name) == 0)) + if ((kn->parent == new_parent) && (kn->ns == new_ns) && + (strcmp(kn->name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; @@ -874,14 +871,14 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, goto out; /* rename kernfs_node */ - if (strcmp(kn->s_name, new_name) != 0) { + if (strcmp(kn->name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) goto out; - kfree(kn->s_name); - kn->s_name = new_name; + kfree(kn->name); + kn->name = new_name; } /* @@ -889,10 +886,10 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, */ sysfs_unlink_sibling(kn); kernfs_get(new_parent); - kernfs_put(kn->s_parent); - kn->s_ns = new_ns; - kn->s_hash = sysfs_name_hash(kn->s_name, kn->s_ns); - kn->s_parent = new_parent; + kernfs_put(kn->parent); + kn->ns = new_ns; + kn->hash = sysfs_name_hash(kn->name, kn->ns); + kn->parent = new_parent; sysfs_link_sibling(kn); error = 0; @@ -904,7 +901,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, /* Relationship between s_mode and the DT_xxx types */ static inline unsigned char dt_type(struct kernfs_node *kn) { - return (kn->s_mode >> 12) & 15; + return (kn->mode >> 12) & 15; } static int sysfs_dir_release(struct inode *inode, struct file *filp) @@ -917,29 +914,28 @@ static struct kernfs_node *sysfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { - int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && - pos->s_parent == parent && - hash == pos->s_hash; + int valid = !(pos->flags & SYSFS_FLAG_REMOVED) && + pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { - struct rb_node *node = parent->s_dir.children.rb_node; + struct rb_node *node = 
parent->dir.children.rb_node; while (node) { pos = rb_to_kn(node); - if (hash < pos->s_hash) + if (hash < pos->hash) node = node->rb_left; - else if (hash > pos->s_hash) + else if (hash > pos->hash) node = node->rb_right; else break; } } /* Skip over entries in the wrong namespace */ - while (pos && pos->s_ns != ns) { - struct rb_node *node = rb_next(&pos->s_rb); + while (pos && pos->ns != ns) { + struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else @@ -954,12 +950,12 @@ static struct kernfs_node *sysfs_dir_next_pos(const void *ns, pos = sysfs_dir_pos(ns, parent, ino, pos); if (pos) do { - struct rb_node *node = rb_next(&pos->s_rb); + struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); - } while (pos && pos->s_ns != ns); + } while (pos && pos->ns != ns); return pos; } @@ -980,12 +976,12 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) for (pos = sysfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = sysfs_dir_next_pos(ns, parent, ctx->pos, pos)) { - const char *name = pos->s_name; + const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->s_ino; + ino_t ino = pos->ino; - ctx->pos = pos->s_hash; + ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 1bf07ded826..5277021196a 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -21,7 +21,7 @@ * There's one sysfs_open_file for each open file and one sysfs_open_dirent * for each kernfs_node with one or more open files. * - * kernfs_node->s_attr.open points to sysfs_open_dirent. s_attr.open is + * kernfs_node->attr.open points to sysfs_open_dirent. attr.open is * protected by sysfs_open_dirent_lock. * * filp->private_data points to seq_file whose ->private points to @@ -49,9 +49,9 @@ static struct sysfs_open_file *sysfs_of(struct file *file) */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { - if (kn->s_flags & SYSFS_FLAG_LOCKDEP) + if (kn->flags & SYSFS_FLAG_LOCKDEP) lockdep_assert_held(kn); - return kn->s_attr.ops; + return kn->attr.ops; } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) @@ -112,9 +112,9 @@ static int kernfs_seq_show(struct seq_file *sf, void *v) { struct sysfs_open_file *of = sf->private; - of->event = atomic_read(&of->kn->s_attr.open->event); + of->event = atomic_read(&of->kn->attr.open->event); - return of->kn->s_attr.ops->seq_show(sf, v); + return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { @@ -189,7 +189,7 @@ static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, { struct sysfs_open_file *of = sysfs_of(file); - if (of->kn->s_flags & SYSFS_FLAG_HAS_SEQ_SHOW) + if (of->kn->flags & SYSFS_FLAG_HAS_SEQ_SHOW) return seq_read(file, user_buf, count, ppos); else return kernfs_file_direct_read(of, user_buf, count, ppos); @@ -428,7 +428,7 @@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_file_open() for more details. */ - if (!(of->kn->s_flags & SYSFS_FLAG_HAS_MMAP)) + if (!(of->kn->flags & SYSFS_FLAG_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); @@ -477,8 +477,8 @@ out_unlock: * @kn: target kernfs_node * @of: sysfs_open_file for this instance of open * - * If @kn->s_attr.open exists, increment its reference count; - * otherwise, create one. @of is chained to the files list. 
+ * If @kn->attr.open exists, increment its reference count; otherwise, + * create one. @of is chained to the files list. * * LOCKING: * Kernel thread context (may sleep). @@ -495,12 +495,12 @@ static int sysfs_get_open_dirent(struct kernfs_node *kn, mutex_lock(&sysfs_open_file_mutex); spin_lock_irq(&sysfs_open_dirent_lock); - if (!kn->s_attr.open && new_od) { - kn->s_attr.open = new_od; + if (!kn->attr.open && new_od) { + kn->attr.open = new_od; new_od = NULL; } - od = kn->s_attr.open; + od = kn->attr.open; if (od) { atomic_inc(&od->refcnt); list_add_tail(&of->list, &od->files); @@ -531,7 +531,7 @@ static int sysfs_get_open_dirent(struct kernfs_node *kn, * @kn: target kernfs_nodet * @of: associated sysfs_open_file * - * Put @kn->s_attr.open and unlink @of from the files list. If + * Put @kn->attr.open and unlink @of from the files list. If * reference count reaches zero, disassociate and free it. * * LOCKING: @@ -540,7 +540,7 @@ static int sysfs_get_open_dirent(struct kernfs_node *kn, static void sysfs_put_open_dirent(struct kernfs_node *kn, struct sysfs_open_file *of) { - struct sysfs_open_dirent *od = kn->s_attr.open; + struct sysfs_open_dirent *od = kn->attr.open; unsigned long flags; mutex_lock(&sysfs_open_file_mutex); @@ -550,7 +550,7 @@ static void sysfs_put_open_dirent(struct kernfs_node *kn, list_del(&of->list); if (atomic_dec_and_test(&od->refcnt)) - kn->s_attr.open = NULL; + kn->attr.open = NULL; else od = NULL; @@ -668,11 +668,11 @@ void sysfs_unmap_bin_file(struct kernfs_node *kn) struct sysfs_open_dirent *od; struct sysfs_open_file *of; - if (!(kn->s_flags & SYSFS_FLAG_HAS_MMAP)) + if (!(kn->flags & SYSFS_FLAG_HAS_MMAP)) return; spin_lock_irq(&sysfs_open_dirent_lock); - od = kn->s_attr.open; + od = kn->attr.open; if (od) atomic_inc(&od->refcnt); spin_unlock_irq(&sysfs_open_dirent_lock); @@ -706,7 +706,7 @@ static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) { struct sysfs_open_file *of = sysfs_of(filp); struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; - struct sysfs_open_dirent *od = kn->s_attr.open; + struct sysfs_open_dirent *od = kn->attr.open; /* need parent for the kobj, grab both */ if (!sysfs_get_active(kn)) @@ -739,7 +739,7 @@ void kernfs_notify(struct kernfs_node *kn) spin_lock_irqsave(&sysfs_open_dirent_lock, flags); if (!WARN_ON(sysfs_type(kn) != SYSFS_KOBJ_ATTR)) { - od = kn->s_attr.open; + od = kn->attr.open; if (od) { atomic_inc(&od->event); wake_up_interruptible(&od->poll); @@ -789,27 +789,27 @@ struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, if (!kn) return ERR_PTR(-ENOMEM); - kn->s_attr.ops = ops; - kn->s_attr.size = size; - kn->s_ns = ns; + kn->attr.ops = ops; + kn->attr.size = size; + kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "s_active", key, 0); - kn->s_flags |= SYSFS_FLAG_LOCKDEP; + kn->flags |= SYSFS_FLAG_LOCKDEP; } #endif /* - * kn->s_attr.ops is accesible only while holding active ref. We + * kn->attr.ops is accesible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. 
*/ if (ops->seq_show) - kn->s_flags |= SYSFS_FLAG_HAS_SEQ_SHOW; + kn->flags |= SYSFS_FLAG_HAS_SEQ_SHOW; if (ops->mmap) - kn->s_flags |= SYSFS_FLAG_HAS_MMAP; + kn->flags |= SYSFS_FLAG_HAS_MMAP; sysfs_addrm_start(&acxt); rc = sysfs_add_one(&acxt, kn, parent); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 9e74eed6353..f6c0aae3dd5 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -50,23 +50,23 @@ static struct sysfs_inode_attrs *sysfs_inode_attrs(struct kernfs_node *kn) { struct iattr *iattrs; - if (kn->s_iattr) - return kn->s_iattr; + if (kn->iattr) + return kn->iattr; - kn->s_iattr = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); - if (!kn->s_iattr) + kn->iattr = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!kn->iattr) return NULL; - iattrs = &kn->s_iattr->ia_iattr; + iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ - iattrs->ia_mode = kn->s_mode; + iattrs->ia_mode = kn->mode; iattrs->ia_uid = GLOBAL_ROOT_UID; iattrs->ia_gid = GLOBAL_ROOT_GID; iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; - simple_xattrs_init(&kn->s_iattr->xattrs); + simple_xattrs_init(&kn->iattr->xattrs); - return kn->s_iattr; + return kn->iattr; } static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) @@ -93,7 +93,7 @@ static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) iattrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; - iattrs->ia_mode = kn->s_mode = mode; + iattrs->ia_mode = kn->mode = mode; } return 0; } @@ -256,9 +256,9 @@ static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) static void sysfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { - struct sysfs_inode_attrs *attrs = kn->s_iattr; + struct sysfs_inode_attrs *attrs = kn->iattr; - inode->i_mode = kn->s_mode; + inode->i_mode = kn->mode; if (attrs) { /* * kernfs_node has non-default attributes get them from @@ -270,7 +270,7 @@ static void sysfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) } if (sysfs_type(kn) == SYSFS_DIR) - set_nlink(inode, kn->s_dir.subdirs + 2); + set_nlink(inode, kn->dir.subdirs + 2); } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -295,7 +295,7 @@ static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; - set_default_inode_attr(inode, kn->s_mode); + set_default_inode_attr(inode, kn->mode); sysfs_refresh_inode(kn, inode); /* initialize inode according to type */ @@ -305,7 +305,7 @@ static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_fop = &sysfs_dir_operations; break; case SYSFS_KOBJ_ATTR: - inode->i_size = kn->s_attr.size; + inode->i_size = kn->attr.size; inode->i_fop = &kernfs_file_operations; break; case SYSFS_KOBJ_LINK: @@ -337,7 +337,7 @@ struct inode *sysfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->s_ino); + inode = iget_locked(sb, kn->ino); if (inode && (inode->i_state & I_NEW)) sysfs_init_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index b7ea76c6fb3..2dbb1cb95e7 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -39,9 +39,9 @@ struct sysfs_inode_attrs { static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) { /* if parent exists, it's always a dir; otherwise, @sd is a dir */ - if (kn->s_parent) 
- kn = kn->s_parent; - return kn->s_dir.root; + if (kn->parent) + kn = kn->parent; + return kn->dir.root; } /* diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 29dcf5e8deb..5ac1a57c380 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -36,8 +36,8 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, return ERR_PTR(-ENOMEM); if (kernfs_ns_enabled(parent)) - kn->s_ns = target->s_ns; - kn->s_symlink.target_kn = target; + kn->ns = target->ns; + kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ sysfs_addrm_start(&acxt); @@ -60,24 +60,24 @@ static int sysfs_get_target_path(struct kernfs_node *parent, /* go up to the root, stop at the base */ base = parent; - while (base->s_parent) { - kn = target->s_parent; - while (kn->s_parent && base != kn) - kn = kn->s_parent; + while (base->parent) { + kn = target->parent; + while (kn->parent && base != kn) + kn = kn->parent; if (base == kn) break; strcpy(s, "../"); s += 3; - base = base->s_parent; + base = base->parent; } /* determine end of target string for reverse fillup */ kn = target; - while (kn->s_parent && kn != base) { - len += strlen(kn->s_name) + 1; - kn = kn->s_parent; + while (kn->parent && kn != base) { + len += strlen(kn->name) + 1; + kn = kn->parent; } /* check limits */ @@ -89,15 +89,15 @@ static int sysfs_get_target_path(struct kernfs_node *parent, /* reverse fillup of target string from target to base */ kn = target; - while (kn->s_parent && kn != base) { - int slen = strlen(kn->s_name); + while (kn->parent && kn != base) { + int slen = strlen(kn->name); len -= slen; - strncpy(s + len, kn->s_name, slen); + strncpy(s + len, kn->name, slen); if (len) s[--len] = '/'; - kn = kn->s_parent; + kn = kn->parent; } return 0; @@ -106,8 +106,8 @@ static int sysfs_get_target_path(struct kernfs_node *parent, static int sysfs_getlink(struct dentry *dentry, char *path) { struct kernfs_node *kn = dentry->d_fsdata; - struct kernfs_node *parent = kn->s_parent; - struct kernfs_node *target = kn->s_symlink.target_kn; + struct kernfs_node *parent = kn->parent; + struct kernfs_node *target = kn->symlink.target_kn; int error; mutex_lock(&sysfs_mutex); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f1efe3df0de..4a800017558 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -29,11 +29,11 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); */ static char *sysfs_pathname(struct kernfs_node *kn, char *path) { - if (kn->s_parent) { - sysfs_pathname(kn->s_parent, path); + if (kn->parent) { + sysfs_pathname(kn->parent, path); strlcat(path, "/", PATH_MAX); } - strlcat(path, kn->s_name, PATH_MAX); + strlcat(path, kn->name, PATH_MAX); return path; } @@ -121,7 +121,7 @@ void sysfs_remove_dir(struct kobject *kobj) int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { - struct kernfs_node *parent = kobj->sd->s_parent; + struct kernfs_node *parent = kobj->sd->parent; return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); } @@ -132,9 +132,9 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, struct kernfs_node *kn = kobj->sd; struct kernfs_node *new_parent; - BUG_ON(!kn->s_parent); + BUG_ON(!kn->parent); new_parent = new_parent_kobj && new_parent_kobj->sd ? 
new_parent_kobj->sd : sysfs_root_kn; - return kernfs_rename_ns(kn, new_parent, kn->s_name, new_ns); + return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index be1cc39035b..887703a7906 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -27,9 +27,9 @@ */ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { - struct kobject *kobj = kn->s_parent->priv; + struct kobject *kobj = kn->parent->priv; - if (kn->s_flags & SYSFS_FLAG_LOCKDEP) + if (kn->flags & SYSFS_FLAG_LOCKDEP) lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } @@ -42,7 +42,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { struct sysfs_open_file *of = sf->private; - struct kobject *kobj = of->kn->s_parent->priv; + struct kobject *kobj = of->kn->parent->priv; const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; char *buf; @@ -82,7 +82,7 @@ static ssize_t sysfs_kf_bin_read(struct sysfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; - struct kobject *kobj = of->kn->s_parent->priv; + struct kobject *kobj = of->kn->parent->priv; loff_t size = file_inode(of->file)->i_size; if (!count) @@ -106,7 +106,7 @@ static ssize_t sysfs_kf_write(struct sysfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); - struct kobject *kobj = of->kn->s_parent->priv; + struct kobject *kobj = of->kn->parent->priv; if (!count) return 0; @@ -119,7 +119,7 @@ static ssize_t sysfs_kf_bin_write(struct sysfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; - struct kobject *kobj = of->kn->s_parent->priv; + struct kobject *kobj = of->kn->parent->priv; loff_t size = file_inode(of->file)->i_size; if (size) { @@ -140,7 +140,7 @@ static int sysfs_kf_bin_mmap(struct sysfs_open_file *of, struct vm_area_struct *vma) { struct bin_attribute *battr = of->kn->priv; - struct kobject *kobj = of->kn->s_parent->priv; + struct kobject *kobj = of->kn->parent->priv; return battr->mmap(of->file, kobj, battr, vma); } @@ -345,7 +345,7 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, if (!kn) return -ENOENT; - newattrs.ia_mode = (mode & S_IALLUGO) | (kn->s_mode & ~S_IALLUGO); + newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; rc = kernfs_setattr(kn, &newattrs); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 4ed3d49ad27..0d48ea91150 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -129,7 +129,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, */ spin_lock(&sysfs_symlink_target_lock); if (targ->sd && kernfs_ns_enabled(kobj->sd)) - ns = targ->sd->s_ns; + ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); kernfs_remove_by_name_ns(kobj->sd, name, ns); } @@ -175,7 +175,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, parent = kobj->sd; if (targ->sd) - old_ns = targ->sd->s_ns; + old_ns = targ->sd->ns; result = -ENOENT; kn = kernfs_find_and_get_ns(parent, old, old_ns); @@ -185,7 +185,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, result = -EINVAL; if (sysfs_type(kn) != SYSFS_KOBJ_LINK) goto out; - if (kn->s_symlink.target_kn->priv != targ) + if (kn->symlink.target_kn->priv != targ) goto out; result = kernfs_rename_ns(kn, parent, new, new_ns); diff --git 
a/include/linux/kernfs.h b/include/linux/kernfs.h index 195d1c6a8b0..092469f60e3 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -49,7 +49,7 @@ enum kernfs_node_flag { /* type-specific structures for kernfs_node union members */ struct kernfs_elem_dir { unsigned long subdirs; - /* children rbtree starts here and goes through kn->s_rb */ + /* children rbtree starts here and goes through kn->rb */ struct rb_root children; /* @@ -79,36 +79,36 @@ struct kernfs_elem_attr { * active reference. */ struct kernfs_node { - atomic_t s_count; - atomic_t s_active; + atomic_t count; + atomic_t active; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* the following two fields are published */ - struct kernfs_node *s_parent; - const char *s_name; + struct kernfs_node *parent; + const char *name; - struct rb_node s_rb; + struct rb_node rb; union { struct completion *completion; struct kernfs_node *removed_list; } u; - const void *s_ns; /* namespace tag */ - unsigned int s_hash; /* ns + name hash */ + const void *ns; /* namespace tag */ + unsigned int hash; /* ns + name hash */ union { - struct kernfs_elem_dir s_dir; - struct kernfs_elem_symlink s_symlink; - struct kernfs_elem_attr s_attr; + struct kernfs_elem_dir dir; + struct kernfs_elem_symlink symlink; + struct kernfs_elem_attr attr; }; void *priv; - unsigned short s_flags; - umode_t s_mode; - unsigned int s_ino; - struct sysfs_inode_attrs *s_iattr; + unsigned short flags; + umode_t mode; + unsigned int ino; + struct sysfs_inode_attrs *iattr; }; struct kernfs_root { @@ -172,7 +172,7 @@ struct kernfs_ops { static inline enum kernfs_node_type sysfs_type(struct kernfs_node *kn) { - return kn->s_flags & SYSFS_TYPE_MASK; + return kn->flags & SYSFS_TYPE_MASK; } /** @@ -186,8 +186,8 @@ static inline enum kernfs_node_type sysfs_type(struct kernfs_node *kn) static inline void kernfs_enable_ns(struct kernfs_node *kn) { WARN_ON_ONCE(sysfs_type(kn) != SYSFS_DIR); - WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->s_dir.children)); - kn->s_flags |= SYSFS_FLAG_NS; + WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children)); + kn->flags |= SYSFS_FLAG_NS; } /** @@ -198,7 +198,7 @@ static inline void kernfs_enable_ns(struct kernfs_node *kn) */ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { - return kn->s_flags & SYSFS_FLAG_NS; + return kn->flags & SYSFS_FLAG_NS; } struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, -- cgit v1.2.3-70-g09d2 From c525aaddc366df23eb095d58a2bdf11cce62a98b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 14:11:55 -0500 Subject: kernfs: s/sysfs/kernfs/ in various data structures kernfs has just been separated out from sysfs and we're already in full conflict mode. Nothing can make the situation any worse. Let's take the chance to name things properly. This patch performs the following renames. * s/sysfs_open_dirent/kernfs_open_node/ * s/sysfs_open_file/kernfs_open_file/ * s/sysfs_inode_attrs/kernfs_iattrs/ * s/sysfs_addrm_cxt/kernfs_addrm_cxt/ * s/sysfs_super_info/kernfs_super_info/ * s/sysfs_info()/kernfs_info()/ * s/sysfs_open_dirent_lock/kernfs_open_node_lock/ * s/sysfs_open_file_mutex/kernfs_open_file_mutex/ * s/sysfs_of()/kernfs_of()/ This patch is strictly rename only and doesn't introduce any functional difference. 
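(Illustrative aside, not part of the commit: a minimal sketch using the renamed structures. The helper below is hypothetical; the attr.open pointer, the files list and the list member of kernfs_open_file are taken from the hunks that follow, and the kernfs_open_node_lock locking is omitted for brevity.)

/* Hypothetical helper, for illustration only -- not in the kernel tree. */
static struct kernfs_open_file *first_open_file(struct kernfs_node *kn)
{
	struct kernfs_open_node *on = kn->attr.open;	/* was sysfs_open_dirent */

	if (!on || list_empty(&on->files))
		return NULL;
	/* each opener is a kernfs_open_file (was sysfs_open_file) */
	return list_first_entry(&on->files, struct kernfs_open_file, list);
}
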
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 28 ++++---- fs/kernfs/file.c | 172 ++++++++++++++++++++++---------------------- fs/kernfs/inode.c | 30 ++++---- fs/kernfs/kernfs-internal.h | 20 +++--- fs/kernfs/mount.c | 14 ++-- fs/kernfs/symlink.c | 2 +- fs/sysfs/file.c | 10 +-- include/linux/kernfs.h | 18 ++--- 8 files changed, 147 insertions(+), 147 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 51fff9d2b33..d038bb204b5 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -295,7 +295,7 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) /* The sysfs dirent has been moved to a different namespace */ if (kn->parent && kernfs_ns_enabled(kn->parent) && - sysfs_info(dentry->d_sb)->ns != kn->ns) + kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; mutex_unlock(&sysfs_mutex); @@ -375,7 +375,7 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, /** * sysfs_addrm_start - prepare for kernfs_node add/remove - * @acxt: pointer to sysfs_addrm_cxt to be used + * @acxt: pointer to kernfs_addrm_cxt to be used * * This function is called when the caller is about to add or remove * kernfs_node. This function acquires sysfs_mutex. @acxt is used to @@ -385,7 +385,7 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, * Kernel thread context (may sleep). sysfs_mutex is locked on * return. */ -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) +void sysfs_addrm_start(struct kernfs_addrm_cxt *acxt) __acquires(sysfs_mutex) { memset(acxt, 0, sizeof(*acxt)); @@ -414,11 +414,11 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) * 0 on success, -EEXIST if entry with the given name already * exists. */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct kernfs_node *kn, +int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, struct kernfs_node *parent) { bool has_ns = kernfs_ns_enabled(parent); - struct sysfs_inode_attrs *ps_iattr; + struct kernfs_iattrs *ps_iattr; int ret; if (has_ns != (bool)kn->ns) { @@ -466,10 +466,10 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct kernfs_node *kn, * LOCKING: * Determined by sysfs_addrm_start(). */ -static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, +static void sysfs_remove_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) { - struct sysfs_inode_attrs *ps_iattr; + struct kernfs_iattrs *ps_iattr; /* * Removal can be called multiple times on the same node. Only the @@ -505,7 +505,7 @@ static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, * LOCKING: * sysfs_mutex is released. 
*/ -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) +void sysfs_addrm_finish(struct kernfs_addrm_cxt *acxt) __releases(sysfs_mutex) { /* release resources acquired by sysfs_addrm_start() */ @@ -649,7 +649,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const void *ns) { umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - struct sysfs_addrm_cxt acxt; + struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; int rc; @@ -686,7 +686,7 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, mutex_lock(&sysfs_mutex); if (kernfs_ns_enabled(parent)) - ns = sysfs_info(dir->i_sb)->ns; + ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); @@ -778,7 +778,7 @@ static struct kernfs_node *sysfs_next_descendant_post(struct kernfs_node *pos, return pos->parent; } -static void __kernfs_remove(struct sysfs_addrm_cxt *acxt, +static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) { struct kernfs_node *pos, *next; @@ -805,7 +805,7 @@ static void __kernfs_remove(struct sysfs_addrm_cxt *acxt, */ void kernfs_remove(struct kernfs_node *kn) { - struct sysfs_addrm_cxt acxt; + struct kernfs_addrm_cxt acxt; sysfs_addrm_start(&acxt); __kernfs_remove(&acxt, kn); @@ -824,7 +824,7 @@ void kernfs_remove(struct kernfs_node *kn) int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { - struct sysfs_addrm_cxt acxt; + struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; if (!parent) { @@ -971,7 +971,7 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) mutex_lock(&sysfs_mutex); if (kernfs_ns_enabled(parent)) - ns = sysfs_info(dentry->d_sb)->ns; + ns = kernfs_info(dentry->d_sb)->ns; for (pos = sysfs_dir_pos(ns, parent, ctx->pos, pos); pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 5277021196a..2714a394cd8 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -18,27 +18,27 @@ #include "kernfs-internal.h" /* - * There's one sysfs_open_file for each open file and one sysfs_open_dirent + * There's one kernfs_open_file for each open file and one kernfs_open_node * for each kernfs_node with one or more open files. * - * kernfs_node->attr.open points to sysfs_open_dirent. attr.open is - * protected by sysfs_open_dirent_lock. + * kernfs_node->attr.open points to kernfs_open_node. attr.open is + * protected by kernfs_open_node_lock. * * filp->private_data points to seq_file whose ->private points to - * sysfs_open_file. sysfs_open_files are chained at - * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex. + * kernfs_open_file. kernfs_open_files are chained at + * kernfs_open_node->files, which is protected by kernfs_open_file_mutex. 
*/ -static DEFINE_SPINLOCK(sysfs_open_dirent_lock); -static DEFINE_MUTEX(sysfs_open_file_mutex); +static DEFINE_SPINLOCK(kernfs_open_node_lock); +static DEFINE_MUTEX(kernfs_open_file_mutex); -struct sysfs_open_dirent { +struct kernfs_open_node { atomic_t refcnt; atomic_t event; wait_queue_head_t poll; - struct list_head files; /* goes through sysfs_open_file.list */ + struct list_head files; /* goes through kernfs_open_file.list */ }; -static struct sysfs_open_file *sysfs_of(struct file *file) +static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } @@ -56,7 +56,7 @@ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { - struct sysfs_open_file *of = sf->private; + struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* @@ -81,7 +81,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { - struct sysfs_open_file *of = sf->private; + struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { @@ -98,7 +98,7 @@ static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) static void kernfs_seq_stop(struct seq_file *sf, void *v) { - struct sysfs_open_file *of = sf->private; + struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) @@ -110,7 +110,7 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v) static int kernfs_seq_show(struct seq_file *sf, void *v) { - struct sysfs_open_file *of = sf->private; + struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of->kn->attr.open->event); @@ -130,7 +130,7 @@ static const struct seq_operations kernfs_seq_ops = { * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. 
*/ -static ssize_t kernfs_file_direct_read(struct sysfs_open_file *of, +static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, char __user *user_buf, size_t count, loff_t *ppos) { @@ -187,7 +187,7 @@ static ssize_t kernfs_file_direct_read(struct sysfs_open_file *of, static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); if (of->kn->flags & SYSFS_FLAG_HAS_SEQ_SHOW) return seq_read(file, user_buf, count, ppos); @@ -214,7 +214,7 @@ static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); ssize_t len = min_t(size_t, count, PAGE_SIZE); const struct kernfs_ops *ops; char *buf; @@ -259,7 +259,7 @@ out_free: static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; @@ -276,7 +276,7 @@ static void kernfs_vma_open(struct vm_area_struct *vma) static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) @@ -297,7 +297,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) @@ -320,7 +320,7 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) @@ -342,7 +342,7 @@ static int kernfs_vma_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) @@ -363,7 +363,7 @@ static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); struct mempolicy *pol; if (!of->vm_ops) @@ -385,7 +385,7 @@ static int kernfs_vma_migrate(struct vm_area_struct *vma, unsigned long flags) { struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) @@ -417,7 +417,7 @@ static const struct vm_operations_struct kernfs_vm_ops = { static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct sysfs_open_file *of = sysfs_of(file); + struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; @@ -473,9 +473,9 @@ out_unlock: } /** - * sysfs_get_open_dirent - get or create sysfs_open_dirent + * sysfs_get_open_dirent - get or create kernfs_open_node * @kn: target kernfs_node - * @of: sysfs_open_file for this instance of open + * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. 
@of is chained to the files list. @@ -487,49 +487,49 @@ out_unlock: * 0 on success, -errno on failure. */ static int sysfs_get_open_dirent(struct kernfs_node *kn, - struct sysfs_open_file *of) + struct kernfs_open_file *of) { - struct sysfs_open_dirent *od, *new_od = NULL; + struct kernfs_open_node *on, *new_on = NULL; retry: - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irq(&sysfs_open_dirent_lock); + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irq(&kernfs_open_node_lock); - if (!kn->attr.open && new_od) { - kn->attr.open = new_od; - new_od = NULL; + if (!kn->attr.open && new_on) { + kn->attr.open = new_on; + new_on = NULL; } - od = kn->attr.open; - if (od) { - atomic_inc(&od->refcnt); - list_add_tail(&of->list, &od->files); + on = kn->attr.open; + if (on) { + atomic_inc(&on->refcnt); + list_add_tail(&of->list, &on->files); } - spin_unlock_irq(&sysfs_open_dirent_lock); - mutex_unlock(&sysfs_open_file_mutex); + spin_unlock_irq(&kernfs_open_node_lock); + mutex_unlock(&kernfs_open_file_mutex); - if (od) { - kfree(new_od); + if (on) { + kfree(new_on); return 0; } /* not there, initialize a new one and retry */ - new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); - if (!new_od) + new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); + if (!new_on) return -ENOMEM; - atomic_set(&new_od->refcnt, 0); - atomic_set(&new_od->event, 1); - init_waitqueue_head(&new_od->poll); - INIT_LIST_HEAD(&new_od->files); + atomic_set(&new_on->refcnt, 0); + atomic_set(&new_on->event, 1); + init_waitqueue_head(&new_on->poll); + INIT_LIST_HEAD(&new_on->files); goto retry; } /** - * sysfs_put_open_dirent - put sysfs_open_dirent + * sysfs_put_open_dirent - put kernfs_open_node * @kn: target kernfs_nodet - * @of: associated sysfs_open_file + * @of: associated kernfs_open_file * * Put @kn->attr.open and unlink @of from the files list. If * reference count reaches zero, disassociate and free it. @@ -538,33 +538,33 @@ static int sysfs_get_open_dirent(struct kernfs_node *kn, * None. 
*/ static void sysfs_put_open_dirent(struct kernfs_node *kn, - struct sysfs_open_file *of) + struct kernfs_open_file *of) { - struct sysfs_open_dirent *od = kn->attr.open; + struct kernfs_open_node *on = kn->attr.open; unsigned long flags; - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irqsave(&kernfs_open_node_lock, flags); if (of) list_del(&of->list); - if (atomic_dec_and_test(&od->refcnt)) + if (atomic_dec_and_test(&on->refcnt)) kn->attr.open = NULL; else - od = NULL; + on = NULL; - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); - mutex_unlock(&sysfs_open_file_mutex); + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + mutex_unlock(&kernfs_open_file_mutex); - kfree(od); + kfree(on); } static int kernfs_file_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file->f_path.dentry->d_fsdata; const struct kernfs_ops *ops; - struct sysfs_open_file *of; + struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; @@ -586,9 +586,9 @@ static int kernfs_file_open(struct inode *inode, struct file *file) (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; - /* allocate a sysfs_open_file for the file */ + /* allocate a kernfs_open_file for the file */ error = -ENOMEM; - of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); + of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; @@ -654,7 +654,7 @@ err_out: static int kernfs_file_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; - struct sysfs_open_file *of = sysfs_of(filp); + struct kernfs_open_file *of = kernfs_of(filp); sysfs_put_open_dirent(kn, of); seq_release(inode, filp); @@ -665,26 +665,26 @@ static int kernfs_file_release(struct inode *inode, struct file *filp) void sysfs_unmap_bin_file(struct kernfs_node *kn) { - struct sysfs_open_dirent *od; - struct sysfs_open_file *of; + struct kernfs_open_node *on; + struct kernfs_open_file *of; if (!(kn->flags & SYSFS_FLAG_HAS_MMAP)) return; - spin_lock_irq(&sysfs_open_dirent_lock); - od = kn->attr.open; - if (od) - atomic_inc(&od->refcnt); - spin_unlock_irq(&sysfs_open_dirent_lock); - if (!od) + spin_lock_irq(&kernfs_open_node_lock); + on = kn->attr.open; + if (on) + atomic_inc(&on->refcnt); + spin_unlock_irq(&kernfs_open_node_lock); + if (!on) return; - mutex_lock(&sysfs_open_file_mutex); - list_for_each_entry(of, &od->files, list) { + mutex_lock(&kernfs_open_file_mutex); + list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); unmap_mapping_range(inode->i_mapping, 0, 0, 1); } - mutex_unlock(&sysfs_open_file_mutex); + mutex_unlock(&kernfs_open_file_mutex); sysfs_put_open_dirent(kn, NULL); } @@ -704,19 +704,19 @@ void sysfs_unmap_bin_file(struct kernfs_node *kn) */ static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) { - struct sysfs_open_file *of = sysfs_of(filp); + struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; - struct sysfs_open_dirent *od = kn->attr.open; + struct kernfs_open_node *on = kn->attr.open; /* need parent for the kobj, grab both */ if (!sysfs_get_active(kn)) goto trigger; - poll_wait(filp, &od->poll, wait); + poll_wait(filp, &on->poll, wait); sysfs_put_active(kn); - if (of->event != atomic_read(&od->event)) + if (of->event != atomic_read(&on->event)) goto trigger; return DEFAULT_POLLMASK; @@ -733,20 +733,20 @@ static unsigned int 
kernfs_file_poll(struct file *filp, poll_table *wait) */ void kernfs_notify(struct kernfs_node *kn) { - struct sysfs_open_dirent *od; + struct kernfs_open_node *on; unsigned long flags; - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); + spin_lock_irqsave(&kernfs_open_node_lock, flags); if (!WARN_ON(sysfs_type(kn) != SYSFS_KOBJ_ATTR)) { - od = kn->attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); } } - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); @@ -780,7 +780,7 @@ struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, void *priv, const void *ns, struct lock_class_key *key) { - struct sysfs_addrm_cxt acxt; + struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; int rc; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f6c0aae3dd5..a0e0038fd57 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -46,14 +46,14 @@ void __init sysfs_inode_init(void) panic("failed to init sysfs_backing_dev_info"); } -static struct sysfs_inode_attrs *sysfs_inode_attrs(struct kernfs_node *kn) +static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { struct iattr *iattrs; if (kn->iattr) return kn->iattr; - kn->iattr = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); if (!kn->iattr) return NULL; iattrs = &kn->iattr->ia_iattr; @@ -71,11 +71,11 @@ static struct sysfs_inode_attrs *sysfs_inode_attrs(struct kernfs_node *kn) static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { - struct sysfs_inode_attrs *attrs; + struct kernfs_iattrs *attrs; struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; - attrs = sysfs_inode_attrs(kn); + attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; @@ -144,11 +144,11 @@ out: static int sysfs_sd_setsecdata(struct kernfs_node *kn, void **secdata, u32 *secdata_len) { - struct sysfs_inode_attrs *attrs; + struct kernfs_iattrs *attrs; void *old_secdata; size_t old_secdata_len; - attrs = sysfs_inode_attrs(kn); + attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; @@ -167,12 +167,12 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct kernfs_node *kn = dentry->d_fsdata; - struct sysfs_inode_attrs *attrs; + struct kernfs_iattrs *attrs; void *secdata; int error; u32 secdata_len = 0; - attrs = sysfs_inode_attrs(kn); + attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; @@ -205,9 +205,9 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, int sysfs_removexattr(struct dentry *dentry, const char *name) { struct kernfs_node *kn = dentry->d_fsdata; - struct sysfs_inode_attrs *attrs; + struct kernfs_iattrs *attrs; - attrs = sysfs_inode_attrs(kn); + attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; @@ -218,9 +218,9 @@ ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size) { struct kernfs_node *kn = dentry->d_fsdata; - struct sysfs_inode_attrs *attrs; + struct kernfs_iattrs *attrs; - attrs = sysfs_inode_attrs(kn); + attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; @@ -230,9 +230,9 @@ ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, ssize_t sysfs_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = dentry->d_fsdata; - struct 
sysfs_inode_attrs *attrs; + struct kernfs_iattrs *attrs; - attrs = sysfs_inode_attrs(kn); + attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; @@ -256,7 +256,7 @@ static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) static void sysfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { - struct sysfs_inode_attrs *attrs = kn->iattr; + struct kernfs_iattrs *attrs = kn->iattr; inode->i_mode = kn->mode; if (attrs) { diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 2dbb1cb95e7..573f6698864 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -18,7 +18,7 @@ #include -struct sysfs_inode_attrs { +struct kernfs_iattrs { struct iattr ia_iattr; void *ia_secdata; u32 ia_secdata_len; @@ -47,14 +47,14 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) /* * Context structure to be used while adding/removing nodes. */ -struct sysfs_addrm_cxt { +struct kernfs_addrm_cxt { struct kernfs_node *removed; }; /* * mount.c */ -struct sysfs_super_info { +struct kernfs_super_info { /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. @@ -63,13 +63,13 @@ struct sysfs_super_info { /* * Each sb is associated with one namespace tag, currently the - * network namespace of the task which mounted this sysfs instance. - * If multiple tags become necessary, make the following an array - * and compare kernfs_node tag against every entry. + * network namespace of the task which mounted this kernfs + * instance. If multiple tags become necessary, make the following + * an array and compare kernfs_node tag against every entry. */ const void *ns; }; -#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) +#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) extern struct kmem_cache *sysfs_dir_cachep; @@ -100,10 +100,10 @@ extern const struct inode_operations sysfs_dir_inode_operations; struct kernfs_node *sysfs_get_active(struct kernfs_node *kn); void sysfs_put_active(struct kernfs_node *kn); -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct kernfs_node *kn, +void sysfs_addrm_start(struct kernfs_addrm_cxt *acxt); +int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, struct kernfs_node *parent); -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); +void sysfs_addrm_finish(struct kernfs_addrm_cxt *acxt); struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, const char *name, umode_t mode, int type); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 9dbbf37b1af..e0796dcb606 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -27,7 +27,7 @@ static const struct super_operations sysfs_ops = { static int sysfs_fill_super(struct super_block *sb) { - struct sysfs_super_info *info = sysfs_info(sb); + struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; struct dentry *root; @@ -61,8 +61,8 @@ static int sysfs_fill_super(struct super_block *sb) static int sysfs_test_super(struct super_block *sb, void *data) { - struct sysfs_super_info *sb_info = sysfs_info(sb); - struct sysfs_super_info *info = data; + struct kernfs_super_info *sb_info = kernfs_info(sb); + struct kernfs_super_info *info = data; return sb_info->root == info->root && sb_info->ns == info->ns; } @@ -84,7 +84,7 @@ static int sysfs_set_super(struct super_block *sb, void *data) */ const void *kernfs_super_ns(struct super_block *sb) { - struct 
sysfs_super_info *info = sysfs_info(sb); + struct kernfs_super_info *info = kernfs_info(sb); return info->ns; } @@ -107,7 +107,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, struct kernfs_root *root, const void *ns) { struct super_block *sb; - struct sysfs_super_info *info; + struct kernfs_super_info *info; int error; info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -144,12 +144,12 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, */ void kernfs_kill_sb(struct super_block *sb) { - struct sysfs_super_info *info = sysfs_info(sb); + struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_node *root_kn = sb->s_root->d_fsdata; /* * Remove the superblock from fs_supers/s_instances - * so we can't find it, before freeing sysfs_super_info. + * so we can't find it, before freeing kernfs_super_info. */ kill_anon_super(sb); kfree(info); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 5ac1a57c380..f36e3f1b247 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -27,7 +27,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, struct kernfs_node *target) { struct kernfs_node *kn; - struct sysfs_addrm_cxt acxt; + struct kernfs_addrm_cxt acxt; int error; kn = sysfs_new_dirent(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 887703a7906..fd104b282f8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -41,7 +41,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) */ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { - struct sysfs_open_file *of = sf->private; + struct kernfs_open_file *of = sf->private; struct kobject *kobj = of->kn->parent->priv; const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; @@ -78,7 +78,7 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) return 0; } -static ssize_t sysfs_kf_bin_read(struct sysfs_open_file *of, char *buf, +static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; @@ -102,7 +102,7 @@ static ssize_t sysfs_kf_bin_read(struct sysfs_open_file *of, char *buf, } /* kernfs write callback for regular sysfs files */ -static ssize_t sysfs_kf_write(struct sysfs_open_file *of, char *buf, +static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); @@ -115,7 +115,7 @@ static ssize_t sysfs_kf_write(struct sysfs_open_file *of, char *buf, } /* kernfs write callback for bin sysfs files */ -static ssize_t sysfs_kf_bin_write(struct sysfs_open_file *of, char *buf, +static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; @@ -136,7 +136,7 @@ static ssize_t sysfs_kf_bin_write(struct sysfs_open_file *of, char *buf, return battr->write(of->file, kobj, battr, buf, pos, count); } -static int sysfs_kf_bin_mmap(struct sysfs_open_file *of, +static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, struct vm_area_struct *vma) { struct bin_attribute *battr = of->kn->priv; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 092469f60e3..757647c4cb3 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -24,8 +24,8 @@ struct vm_area_struct; struct super_block; struct file_system_type; -struct sysfs_open_dirent; -struct sysfs_inode_attrs; +struct kernfs_open_node; +struct kernfs_iattrs; enum 
kernfs_node_type { SYSFS_DIR = 0x0001, @@ -65,7 +65,7 @@ struct kernfs_elem_symlink { struct kernfs_elem_attr { const struct kernfs_ops *ops; - struct sysfs_open_dirent *open; + struct kernfs_open_node *open; loff_t size; }; @@ -108,7 +108,7 @@ struct kernfs_node { unsigned short flags; umode_t mode; unsigned int ino; - struct sysfs_inode_attrs *iattr; + struct kernfs_iattrs *iattr; }; struct kernfs_root { @@ -119,7 +119,7 @@ struct kernfs_root { struct ida ino_ida; }; -struct sysfs_open_file { +struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; @@ -140,7 +140,7 @@ struct kernfs_ops { * If seq_show() is present, seq_file path is active. Other seq * operations are optional and if not implemented, the behavior is * equivalent to single_open(). @sf->private points to the - * associated sysfs_open_file. + * associated kernfs_open_file. * * read() is bounced through kernel buffer and a read larger than * PAGE_SIZE results in partial operation of PAGE_SIZE. @@ -151,17 +151,17 @@ struct kernfs_ops { void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); - ssize_t (*read)(struct sysfs_open_file *of, char *buf, size_t bytes, + ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); /* * write() is bounced through kernel buffer and a write larger than * PAGE_SIZE results in partial operation of PAGE_SIZE. */ - ssize_t (*write)(struct sysfs_open_file *of, char *buf, size_t bytes, + ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); - int (*mmap)(struct sysfs_open_file *of, struct vm_area_struct *vma); + int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; -- cgit v1.2.3-70-g09d2 From df23fc39bce03bb26e63bea57fc5f5bf6882d74b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 14:11:56 -0500 Subject: kernfs: s/sysfs/kernfs/ in constants kernfs has just been separated out from sysfs and we're already in full conflict mode. Nothing can make the situation any worse. Let's take the chance to name things properly. This patch performs the following renames. * s/SYSFS_DIR/KERNFS_DIR/ * s/SYSFS_KOBJ_ATTR/KERNFS_FILE/ * s/SYSFS_KOBJ_LINK/KERNFS_LINK/ * s/SYSFS_{TYPE_FLAGS}/KERNFS_{TYPE_FLAGS}/ * s/SYSFS_FLAG_{FLAG}/KERNFS_{FLAG}/ * s/sysfs_type()/kernfs_type()/ * s/SD_DEACTIVATED_BIAS/KN_DEACTIVATED_BIAS/ This patch is strictly rename only and doesn't introduce any functional difference. 
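As an editorial illustration only (not part of the original patch description), a call site that previously tested sysfs_type(kn) == SYSFS_DIR and SYSFS_FLAG_LOCKDEP reads as follows after this rename; the wrapper example_is_lockdep_dir() is a made-up name, while kernfs_type(), KERNFS_DIR and KERNFS_LOCKDEP are the renamed identifiers listed above:

/*
 * Editorial sketch, not from the patch: shows the renamed type helper
 * and flag constants at a hypothetical call site.
 */
static bool example_is_lockdep_dir(struct kernfs_node *kn)
{
	/* was: sysfs_type(kn) == SYSFS_DIR */
	if (kernfs_type(kn) != KERNFS_DIR)
		return false;
	/* was: kn->flags & SYSFS_FLAG_LOCKDEP */
	return kn->flags & KERNFS_LOCKDEP;
}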
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 50 ++++++++++++++++++++++----------------------- fs/kernfs/file.c | 18 ++++++++-------- fs/kernfs/inode.c | 10 ++++----- fs/kernfs/kernfs-internal.h | 4 ++-- fs/kernfs/symlink.c | 2 +- fs/sysfs/dir.c | 2 +- fs/sysfs/file.c | 2 +- fs/sysfs/symlink.c | 2 +- include/linux/kernfs.h | 36 ++++++++++++++++---------------- 9 files changed, 63 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index d038bb204b5..bc8a3b367a8 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -78,7 +78,7 @@ static int sysfs_link_sibling(struct kernfs_node *kn) struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; - if (sysfs_type(kn) == SYSFS_DIR) + if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; while (*node) { @@ -113,7 +113,7 @@ static int sysfs_link_sibling(struct kernfs_node *kn) */ static void sysfs_unlink_sibling(struct kernfs_node *kn) { - if (sysfs_type(kn) == SYSFS_DIR) + if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; rb_erase(&kn->rb, &kn->parent->dir.children); @@ -137,7 +137,7 @@ struct kernfs_node *sysfs_get_active(struct kernfs_node *kn) if (!atomic_inc_unless_negative(&kn->active)) return NULL; - if (kn->flags & SYSFS_FLAG_LOCKDEP) + if (kn->flags & KERNFS_LOCKDEP) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } @@ -156,10 +156,10 @@ void sysfs_put_active(struct kernfs_node *kn) if (unlikely(!kn)) return; - if (kn->flags & SYSFS_FLAG_LOCKDEP) + if (kn->flags & KERNFS_LOCKDEP) rwsem_release(&kn->dep_map, 1, _RET_IP_); v = atomic_dec_return(&kn->active); - if (likely(v != SD_DEACTIVATED_BIAS)) + if (likely(v != KN_DEACTIVATED_BIAS)) return; /* @@ -180,9 +180,9 @@ static void sysfs_deactivate(struct kernfs_node *kn) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(!(kn->flags & SYSFS_FLAG_REMOVED)); + BUG_ON(!(kn->flags & KERNFS_REMOVED)); - if (!(sysfs_type(kn) & SYSFS_ACTIVE_REF)) + if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) return; kn->u.completion = (void *)&wait; @@ -191,9 +191,9 @@ static void sysfs_deactivate(struct kernfs_node *kn) /* atomic_add_return() is a mb(), put_active() will always see * the updated kn->u.completion. */ - v = atomic_add_return(SD_DEACTIVATED_BIAS, &kn->active); + v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); - if (v != SD_DEACTIVATED_BIAS) { + if (v != KN_DEACTIVATED_BIAS) { lock_contended(&kn->dep_map, _RET_IP_); wait_for_completion(&wait); } @@ -235,13 +235,13 @@ void kernfs_put(struct kernfs_node *kn) */ parent = kn->parent; - WARN(!(kn->flags & SYSFS_FLAG_REMOVED), + WARN(!(kn->flags & KERNFS_REMOVED), "sysfs: free using entry: %s/%s\n", parent ? 
parent->name : "", kn->name); - if (sysfs_type(kn) == SYSFS_KOBJ_LINK) + if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - if (sysfs_type(kn) & SYSFS_COPY_NAME) + if (kernfs_type(kn) & KERNFS_COPY_NAME) kfree(kn->name); if (kn->iattr) { if (kn->iattr->ia_secdata) @@ -268,7 +268,7 @@ EXPORT_SYMBOL_GPL(kernfs_put); static int sysfs_dentry_delete(const struct dentry *dentry) { struct kernfs_node *kn = dentry->d_fsdata; - return !(kn && !(kn->flags & SYSFS_FLAG_REMOVED)); + return !(kn && !(kn->flags & KERNFS_REMOVED)); } static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) @@ -282,7 +282,7 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) mutex_lock(&sysfs_mutex); /* The sysfs dirent has been deleted */ - if (kn->flags & SYSFS_FLAG_REMOVED) + if (kn->flags & KERNFS_REMOVED) goto out_bad; /* The sysfs dirent has been moved? */ @@ -342,7 +342,7 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, struct kernfs_node *kn; int ret; - if (type & SYSFS_COPY_NAME) { + if (type & KERNFS_COPY_NAME) { name = dup_name = kstrdup(name, GFP_KERNEL); if (!name) return NULL; @@ -362,7 +362,7 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, kn->name = name; kn->mode = mode; - kn->flags = type | SYSFS_FLAG_REMOVED; + kn->flags = type | KERNFS_REMOVED; return kn; @@ -427,7 +427,7 @@ int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, return -EINVAL; } - if (sysfs_type(parent) != SYSFS_DIR) + if (kernfs_type(parent) != KERNFS_DIR) return -EINVAL; kn->hash = sysfs_name_hash(kn->name, kn->ns); @@ -446,7 +446,7 @@ int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, } /* Mark the entry added into directory tree */ - kn->flags &= ~SYSFS_FLAG_REMOVED; + kn->flags &= ~KERNFS_REMOVED; return 0; } @@ -475,7 +475,7 @@ static void sysfs_remove_one(struct kernfs_addrm_cxt *acxt, * Removal can be called multiple times on the same node. Only the * first invocation is effective and puts the base ref. 
*/ - if (kn->flags & SYSFS_FLAG_REMOVED) + if (kn->flags & KERNFS_REMOVED) return; if (kn->parent) { @@ -489,7 +489,7 @@ static void sysfs_remove_one(struct kernfs_addrm_cxt *acxt, } } - kn->flags |= SYSFS_FLAG_REMOVED; + kn->flags |= KERNFS_REMOVED; kn->u.removed_list = acxt->removed; acxt->removed = kn; } @@ -607,14 +607,14 @@ struct kernfs_root *kernfs_create_root(void *priv) ida_init(&root->ino_ida); - kn = sysfs_new_dirent(root, "", S_IFDIR | S_IRUGO | S_IXUGO, SYSFS_DIR); + kn = sysfs_new_dirent(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); if (!kn) { ida_destroy(&root->ino_ida); kfree(root); return ERR_PTR(-ENOMEM); } - kn->flags &= ~SYSFS_FLAG_REMOVED; + kn->flags &= ~KERNFS_REMOVED; kn->priv = priv; kn->dir.root = root; @@ -654,7 +654,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, int rc; /* allocate */ - kn = sysfs_new_dirent(kernfs_root(parent), name, mode, SYSFS_DIR); + kn = sysfs_new_dirent(kernfs_root(parent), name, mode, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); @@ -732,7 +732,7 @@ static struct kernfs_node *sysfs_leftmost_descendant(struct kernfs_node *pos) last = pos; - if (sysfs_type(pos) != SYSFS_DIR) + if (kernfs_type(pos) != KERNFS_DIR) break; rbn = rb_first(&pos->dir.children); @@ -914,7 +914,7 @@ static struct kernfs_node *sysfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { - int valid = !(pos->flags & SYSFS_FLAG_REMOVED) && + int valid = !(pos->flags & KERNFS_REMOVED) && pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 2714a394cd8..abe93e12089 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -49,7 +49,7 @@ static struct kernfs_open_file *kernfs_of(struct file *file) */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { - if (kn->flags & SYSFS_FLAG_LOCKDEP) + if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } @@ -189,7 +189,7 @@ static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, { struct kernfs_open_file *of = kernfs_of(file); - if (of->kn->flags & SYSFS_FLAG_HAS_SEQ_SHOW) + if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read(file, user_buf, count, ppos); else return kernfs_file_direct_read(of, user_buf, count, ppos); @@ -428,7 +428,7 @@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_file_open() for more details. 
*/ - if (!(of->kn->flags & SYSFS_FLAG_HAS_MMAP)) + if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); @@ -668,7 +668,7 @@ void sysfs_unmap_bin_file(struct kernfs_node *kn) struct kernfs_open_node *on; struct kernfs_open_file *of; - if (!(kn->flags & SYSFS_FLAG_HAS_MMAP)) + if (!(kn->flags & KERNFS_HAS_MMAP)) return; spin_lock_irq(&kernfs_open_node_lock); @@ -738,7 +738,7 @@ void kernfs_notify(struct kernfs_node *kn) spin_lock_irqsave(&kernfs_open_node_lock, flags); - if (!WARN_ON(sysfs_type(kn) != SYSFS_KOBJ_ATTR)) { + if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) { on = kn->attr.open; if (on) { atomic_inc(&on->event); @@ -785,7 +785,7 @@ struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, int rc; kn = sysfs_new_dirent(kernfs_root(parent), name, - (mode & S_IALLUGO) | S_IFREG, SYSFS_KOBJ_ATTR); + (mode & S_IALLUGO) | S_IFREG, KERNFS_FILE); if (!kn) return ERR_PTR(-ENOMEM); @@ -797,7 +797,7 @@ struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "s_active", key, 0); - kn->flags |= SYSFS_FLAG_LOCKDEP; + kn->flags |= KERNFS_LOCKDEP; } #endif @@ -807,9 +807,9 @@ struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, * ref. Cache their existence in flags. */ if (ops->seq_show) - kn->flags |= SYSFS_FLAG_HAS_SEQ_SHOW; + kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) - kn->flags |= SYSFS_FLAG_HAS_MMAP; + kn->flags |= KERNFS_HAS_MMAP; sysfs_addrm_start(&acxt); rc = sysfs_add_one(&acxt, kn, parent); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index a0e0038fd57..af92638d792 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -269,7 +269,7 @@ static void sysfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) attrs->ia_secdata_len); } - if (sysfs_type(kn) == SYSFS_DIR) + if (kernfs_type(kn) == KERNFS_DIR) set_nlink(inode, kn->dir.subdirs + 2); } @@ -299,16 +299,16 @@ static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) sysfs_refresh_inode(kn, inode); /* initialize inode according to type */ - switch (sysfs_type(kn)) { - case SYSFS_DIR: + switch (kernfs_type(kn)) { + case KERNFS_DIR: inode->i_op = &sysfs_dir_inode_operations; inode->i_fop = &sysfs_dir_operations; break; - case SYSFS_KOBJ_ATTR: + case KERNFS_FILE: inode->i_size = kn->attr.size; inode->i_fop = &kernfs_file_operations; break; - case SYSFS_KOBJ_LINK: + case KERNFS_LINK: inode->i_op = &sysfs_symlink_inode_operations; break; default: diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 573f6698864..c4bf5bf72f7 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,9 +26,9 @@ struct kernfs_iattrs { struct simple_xattrs xattrs; }; -#define SD_DEACTIVATED_BIAS INT_MIN +#define KN_DEACTIVATED_BIAS INT_MIN -/* SYSFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ +/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a kernfs_node belongs to diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index f36e3f1b247..a92284d3c73 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -31,7 +31,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, int error; kn = sysfs_new_dirent(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, - SYSFS_KOBJ_LINK); + KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 4a800017558..aa007401bfc 100644 --- a/fs/sysfs/dir.c 
+++ b/fs/sysfs/dir.c @@ -113,7 +113,7 @@ void sysfs_remove_dir(struct kobject *kobj) spin_unlock(&sysfs_symlink_target_lock); if (kn) { - WARN_ON_ONCE(sysfs_type(kn) != SYSFS_DIR); + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); kernfs_remove(kn); } } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index fd104b282f8..fe6388fbd15 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -29,7 +29,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { struct kobject *kobj = kn->parent->priv; - if (kn->flags & SYSFS_FLAG_LOCKDEP) + if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 0d48ea91150..aecb15f8455 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -183,7 +183,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, goto out; result = -EINVAL; - if (sysfs_type(kn) != SYSFS_KOBJ_LINK) + if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 757647c4cb3..e9c4e3a0396 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -28,22 +28,22 @@ struct kernfs_open_node; struct kernfs_iattrs; enum kernfs_node_type { - SYSFS_DIR = 0x0001, - SYSFS_KOBJ_ATTR = 0x0002, - SYSFS_KOBJ_LINK = 0x0004, + KERNFS_DIR = 0x0001, + KERNFS_FILE = 0x0002, + KERNFS_LINK = 0x0004, }; -#define SYSFS_TYPE_MASK 0x000f -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) -#define SYSFS_ACTIVE_REF SYSFS_KOBJ_ATTR -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK +#define KERNFS_TYPE_MASK 0x000f +#define KERNFS_COPY_NAME (KERNFS_DIR | KERNFS_LINK) +#define KERNFS_ACTIVE_REF KERNFS_FILE +#define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { - SYSFS_FLAG_REMOVED = 0x0010, - SYSFS_FLAG_NS = 0x0020, - SYSFS_FLAG_HAS_SEQ_SHOW = 0x0040, - SYSFS_FLAG_HAS_MMAP = 0x0080, - SYSFS_FLAG_LOCKDEP = 0x0100, + KERNFS_REMOVED = 0x0010, + KERNFS_NS = 0x0020, + KERNFS_HAS_SEQ_SHOW = 0x0040, + KERNFS_HAS_MMAP = 0x0080, + KERNFS_LOCKDEP = 0x0100, }; /* type-specific structures for kernfs_node union members */ @@ -170,9 +170,9 @@ struct kernfs_ops { #ifdef CONFIG_SYSFS -static inline enum kernfs_node_type sysfs_type(struct kernfs_node *kn) +static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { - return kn->flags & SYSFS_TYPE_MASK; + return kn->flags & KERNFS_TYPE_MASK; } /** @@ -185,9 +185,9 @@ static inline enum kernfs_node_type sysfs_type(struct kernfs_node *kn) */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { - WARN_ON_ONCE(sysfs_type(kn) != SYSFS_DIR); + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children)); - kn->flags |= SYSFS_FLAG_NS; + kn->flags |= KERNFS_NS; } /** @@ -198,7 +198,7 @@ static inline void kernfs_enable_ns(struct kernfs_node *kn) */ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { - return kn->flags & SYSFS_FLAG_NS; + return kn->flags & KERNFS_NS; } struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, @@ -238,7 +238,7 @@ void kernfs_init(void); #else /* CONFIG_SYSFS */ -static inline enum kernfs_node_type sysfs_type(struct kernfs_node *kn) +static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return 0; } /* whatever */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { } -- cgit v1.2.3-70-g09d2 From a797bfc30532388e8a11ca726df60cdd77aa8675 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 
2013 14:11:57 -0500 Subject: kernfs: s/sysfs/kernfs/ in global variables kernfs has just been separated out from sysfs and we're already in full conflict mode. Nothing can make the situation any worse. Let's take the chance to name things properly. This patch performs the following renames. * s/sysfs_mutex/kernfs_mutex/ * s/sysfs_dentry_ops/kernfs_dops/ * s/sysfs_dir_operations/kernfs_dir_fops/ * s/sysfs_dir_inode_operations/kernfs_dir_iops/ * s/kernfs_file_operations/kernfs_file_fops/ - renamed for consistency * s/sysfs_symlink_inode_operations/kernfs_symlink_iops/ * s/sysfs_aops/kernfs_aops/ * s/sysfs_backing_dev_info/kernfs_bdi/ * s/sysfs_inode_operations/kernfs_iops/ * s/sysfs_dir_cachep/kernfs_node_cache/ * s/sysfs_ops/kernfs_sops/ This patch is strictly rename only and doesn't introduce any functional difference. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 64 ++++++++++++++++++++++----------------------- fs/kernfs/file.c | 2 +- fs/kernfs/inode.c | 46 ++++++++++++++++---------------- fs/kernfs/kernfs-internal.h | 14 +++++----- fs/kernfs/mount.c | 14 +++++----- fs/kernfs/symlink.c | 6 ++--- 6 files changed, 73 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index bc8a3b367a8..d3c66237474 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,7 @@ #include "kernfs-internal.h" -DEFINE_MUTEX(sysfs_mutex); +DEFINE_MUTEX(kernfs_mutex); #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -68,7 +68,7 @@ static int sysfs_sd_compare(const struct kernfs_node *left, * @kn->parent->dir.children. * * Locking: - * mutex_lock(sysfs_mutex) + * mutex_lock(kernfs_mutex) * * RETURNS: * 0 on susccess -EEXIST on failure. @@ -109,7 +109,7 @@ static int sysfs_link_sibling(struct kernfs_node *kn) * kn->parent->dir.children. * * Locking: - * mutex_lock(sysfs_mutex) + * mutex_lock(kernfs_mutex) */ static void sysfs_unlink_sibling(struct kernfs_node *kn) { @@ -251,7 +251,7 @@ void kernfs_put(struct kernfs_node *kn) } kfree(kn->iattr); ida_simple_remove(&root->ino_ida, kn->ino); - kmem_cache_free(sysfs_dir_cachep, kn); + kmem_cache_free(kernfs_node_cache, kn); kn = parent; if (kn) { @@ -279,7 +279,7 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; kn = dentry->d_fsdata; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); /* The sysfs dirent has been deleted */ if (kn->flags & KERNFS_REMOVED) @@ -298,7 +298,7 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); out_valid: return 1; out_bad: @@ -312,7 +312,7 @@ out_bad: * is performed at its new name the dentry will be readded * to the dcache hashes. 
*/ - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); /* If we have submounts we must allow the vfs caches * to lie about the state of the filesystem to prevent @@ -329,7 +329,7 @@ static void sysfs_dentry_release(struct dentry *dentry) kernfs_put(dentry->d_fsdata); } -const struct dentry_operations sysfs_dentry_ops = { +const struct dentry_operations kernfs_dops = { .d_revalidate = sysfs_dentry_revalidate, .d_delete = sysfs_dentry_delete, .d_release = sysfs_dentry_release, @@ -348,7 +348,7 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, return NULL; } - kn = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); + kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; @@ -367,7 +367,7 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, return kn; err_out2: - kmem_cache_free(sysfs_dir_cachep, kn); + kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree(dup_name); return NULL; @@ -378,19 +378,19 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, * @acxt: pointer to kernfs_addrm_cxt to be used * * This function is called when the caller is about to add or remove - * kernfs_node. This function acquires sysfs_mutex. @acxt is used to - * keep and pass context to other addrm functions. + * kernfs_node. This function acquires kernfs_mutex. @acxt is used + * to keep and pass context to other addrm functions. * * LOCKING: - * Kernel thread context (may sleep). sysfs_mutex is locked on + * Kernel thread context (may sleep). kernfs_mutex is locked on * return. */ void sysfs_addrm_start(struct kernfs_addrm_cxt *acxt) - __acquires(sysfs_mutex) + __acquires(kernfs_mutex) { memset(acxt, 0, sizeof(*acxt)); - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); } /** @@ -503,13 +503,13 @@ static void sysfs_remove_one(struct kernfs_addrm_cxt *acxt, * cleaned up. * * LOCKING: - * sysfs_mutex is released. + * kernfs_mutex is released. 
*/ void sysfs_addrm_finish(struct kernfs_addrm_cxt *acxt) - __releases(sysfs_mutex) + __releases(kernfs_mutex) { /* release resources acquired by sysfs_addrm_start() */ - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); /* kill removed kernfs_nodes */ while (acxt->removed) { @@ -540,7 +540,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; - lockdep_assert_held(&sysfs_mutex); + lockdep_assert_held(&kernfs_mutex); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", @@ -580,10 +580,10 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, { struct kernfs_node *kn; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); return kn; } @@ -683,7 +683,7 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode; const void *ns = NULL; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; @@ -708,11 +708,11 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, /* instantiate and hash dentry */ ret = d_materialise_unique(dentry, inode); out_unlock: - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); return ret; } -const struct inode_operations sysfs_dir_inode_operations = { +const struct inode_operations kernfs_dir_iops = { .lookup = sysfs_lookup, .permission = sysfs_permission, .setattr = sysfs_setattr, @@ -759,7 +759,7 @@ static struct kernfs_node *sysfs_next_descendant_post(struct kernfs_node *pos, { struct rb_node *rbn; - lockdep_assert_held(&sysfs_mutex); + lockdep_assert_held(&kernfs_mutex); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) @@ -859,7 +859,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, { int error; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); error = 0; if ((kn->parent == new_parent) && (kn->ns == new_ns) && @@ -894,7 +894,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, error = 0; out: - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); return error; } @@ -968,7 +968,7 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; @@ -985,12 +985,12 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) file->private_data = pos; kernfs_get(pos); - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); if (!dir_emit(ctx, name, len, ino, type)) return 0; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); } - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); file->private_data = NULL; ctx->pos = INT_MAX; return 0; @@ -1008,7 +1008,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) return ret; } -const struct file_operations sysfs_dir_operations = { +const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, .iterate = sysfs_readdir, .release = sysfs_dir_release, diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index abe93e12089..32364ddb24d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -750,7 +750,7 @@ void kernfs_notify(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_notify); -const struct 
file_operations kernfs_file_operations = { +const struct file_operations kernfs_file_fops = { .read = kernfs_file_read, .write = kernfs_file_write, .llseek = generic_file_llseek, diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index af92638d792..c5f231e8d36 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -18,19 +18,19 @@ #include "kernfs-internal.h" -static const struct address_space_operations sysfs_aops = { +static const struct address_space_operations kernfs_aops = { .readpage = simple_readpage, .write_begin = simple_write_begin, .write_end = simple_write_end, }; -static struct backing_dev_info sysfs_backing_dev_info = { - .name = "sysfs", +static struct backing_dev_info kernfs_bdi = { + .name = "kernfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static const struct inode_operations sysfs_inode_operations = { +static const struct inode_operations kernfs_iops = { .permission = sysfs_permission, .setattr = sysfs_setattr, .getattr = sysfs_getattr, @@ -42,8 +42,8 @@ static const struct inode_operations sysfs_inode_operations = { void __init sysfs_inode_init(void) { - if (bdi_init(&sysfs_backing_dev_info)) - panic("failed to init sysfs_backing_dev_info"); + if (bdi_init(&kernfs_bdi)) + panic("failed to init kernfs_bdi"); } static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) @@ -109,9 +109,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); ret = __kernfs_setattr(kn, iattr); - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); return ret; } @@ -124,7 +124,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) if (!kn) return -EINVAL; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); error = inode_change_ok(inode, iattr); if (error) goto out; @@ -137,7 +137,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) setattr_copy(inode, iattr); out: - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); return error; } @@ -187,9 +187,9 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); error = sysfs_sd_setsecdata(kn, &secdata, &secdata_len); - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); if (secdata) security_release_secctx(secdata, secdata_len); @@ -279,9 +279,9 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kernfs_node *kn = dentry->d_fsdata; struct inode *inode = dentry->d_inode; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); sysfs_refresh_inode(kn, inode); - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); generic_fillattr(inode, stat); return 0; @@ -291,9 +291,9 @@ static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; - inode->i_mapping->a_ops = &sysfs_aops; - inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; - inode->i_op = &sysfs_inode_operations; + inode->i_mapping->a_ops = &kernfs_aops; + inode->i_mapping->backing_dev_info = &kernfs_bdi; + inode->i_op = &kernfs_iops; set_default_inode_attr(inode, kn->mode); sysfs_refresh_inode(kn, inode); @@ -301,15 +301,15 @@ static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) /* initialize inode according to type */ switch (kernfs_type(kn)) { case KERNFS_DIR: - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; + inode->i_op = &kernfs_dir_iops; + 
inode->i_fop = &kernfs_dir_fops; break; case KERNFS_FILE: inode->i_size = kn->attr.size; - inode->i_fop = &kernfs_file_operations; + inode->i_fop = &kernfs_file_fops; break; case KERNFS_LINK: - inode->i_op = &sysfs_symlink_inode_operations; + inode->i_op = &kernfs_symlink_iops; break; default: BUG(); @@ -369,9 +369,9 @@ int sysfs_permission(struct inode *inode, int mask) kn = inode->i_private; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); sysfs_refresh_inode(kn, inode); - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); return generic_permission(inode, mask); } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index c4bf5bf72f7..e62e8ec15d6 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -71,7 +71,7 @@ struct kernfs_super_info { }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) -extern struct kmem_cache *sysfs_dir_cachep; +extern struct kmem_cache *kernfs_node_cache; /* * inode.c @@ -93,10 +93,10 @@ void sysfs_inode_init(void); /* * dir.c */ -extern struct mutex sysfs_mutex; -extern const struct dentry_operations sysfs_dentry_ops; -extern const struct file_operations sysfs_dir_operations; -extern const struct inode_operations sysfs_dir_inode_operations; +extern struct mutex kernfs_mutex; +extern const struct dentry_operations kernfs_dops; +extern const struct file_operations kernfs_dir_fops; +extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *sysfs_get_active(struct kernfs_node *kn); void sysfs_put_active(struct kernfs_node *kn); @@ -110,13 +110,13 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, /* * file.c */ -extern const struct file_operations kernfs_file_operations; +extern const struct file_operations kernfs_file_fops; void sysfs_unmap_bin_file(struct kernfs_node *kn); /* * symlink.c */ -extern const struct inode_operations sysfs_symlink_inode_operations; +extern const struct inode_operations kernfs_symlink_iops; #endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e0796dcb606..27d967ba0bb 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -17,9 +17,9 @@ #include "kernfs-internal.h" -struct kmem_cache *sysfs_dir_cachep; +struct kmem_cache *kernfs_node_cache; -static const struct super_operations sysfs_ops = { +static const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = sysfs_evict_inode, @@ -34,13 +34,13 @@ static int sysfs_fill_super(struct super_block *sb) sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = SYSFS_MAGIC; - sb->s_op = &sysfs_ops; + sb->s_op = &kernfs_sops; sb->s_time_gran = 1; /* get root inode, initialize and unlock it */ - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); inode = sysfs_get_inode(sb, info->root->kn); - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); if (!inode) { pr_debug("sysfs: could not get root inode\n"); return -ENOMEM; @@ -55,7 +55,7 @@ static int sysfs_fill_super(struct super_block *sb) kernfs_get(info->root->kn); root->d_fsdata = info->root->kn; sb->s_root = root; - sb->s_d_op = &sysfs_dentry_ops; + sb->s_d_op = &kernfs_dops; return 0; } @@ -158,7 +158,7 @@ void kernfs_kill_sb(struct super_block *sb) void __init kernfs_init(void) { - sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", + kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); sysfs_inode_init(); diff --git 
a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index a92284d3c73..4105bd04ea2 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -110,9 +110,9 @@ static int sysfs_getlink(struct dentry *dentry, char *path) struct kernfs_node *target = kn->symlink.target_kn; int error; - mutex_lock(&sysfs_mutex); + mutex_lock(&kernfs_mutex); error = sysfs_get_target_path(parent, target, path); - mutex_unlock(&sysfs_mutex); + mutex_unlock(&kernfs_mutex); return error; } @@ -138,7 +138,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, free_page((unsigned long)page); } -const struct inode_operations sysfs_symlink_inode_operations = { +const struct inode_operations kernfs_symlink_iops = { .setxattr = sysfs_setxattr, .removexattr = sysfs_removexattr, .getxattr = sysfs_getxattr, -- cgit v1.2.3-70-g09d2 From c637b8acbe079edb477d887041755b489036f146 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 14:11:58 -0500 Subject: kernfs: s/sysfs/kernfs/ in internal functions and whatever is left kernfs has just been separated out from sysfs and we're already in full conflict mode. Nothing can make the situation any worse. Let's take the chance to name things properly. This patch performs the following renames. * s/sysfs_*()/kernfs_*()/ in all internal functions * s/sysfs/kernfs/ in internal strings, comments and whatever is remaining * Uniformly rename various vfs operations so that they're consistently named and distinguishable. This patch is strictly rename only and doesn't introduce any functional difference. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 212 ++++++++++++++++++++++---------------------- fs/kernfs/file.c | 121 ++++++++++++------------- fs/kernfs/inode.c | 66 +++++++------- fs/kernfs/kernfs-internal.h | 44 ++++----- fs/kernfs/mount.c | 18 ++-- fs/kernfs/symlink.c | 44 ++++----- 6 files changed, 254 insertions(+), 251 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index d3c66237474..6520066c49e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -22,13 +22,13 @@ DEFINE_MUTEX(kernfs_mutex); #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) /** - * sysfs_name_hash + * kernfs_name_hash * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Returns 31 bit hash of ns + name (so it fits in an off_t ) */ -static unsigned int sysfs_name_hash(const char *name, const void *ns) +static unsigned int kernfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(); unsigned int len = strlen(name); @@ -44,8 +44,8 @@ static unsigned int sysfs_name_hash(const char *name, const void *ns) return hash; } -static int sysfs_name_compare(unsigned int hash, const char *name, - const void *ns, const struct kernfs_node *kn) +static int kernfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct kernfs_node *kn) { if (hash != kn->hash) return hash - kn->hash; @@ -54,14 +54,14 @@ static int sysfs_name_compare(unsigned int hash, const char *name, return strcmp(name, kn->name); } -static int sysfs_sd_compare(const struct kernfs_node *left, - const struct kernfs_node *right) +static int kernfs_sd_compare(const struct kernfs_node *left, + const struct kernfs_node *right) { - return sysfs_name_compare(left->hash, left->name, left->ns, right); + return kernfs_name_compare(left->hash, left->name, left->ns, right); } /** - * sysfs_link_sibling - link kernfs_node into sibling rbtree + * kernfs_link_sibling - link kernfs_node into sibling rbtree * 
@kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from @@ -73,7 +73,7 @@ static int sysfs_sd_compare(const struct kernfs_node *left, * RETURNS: * 0 on susccess -EEXIST on failure. */ -static int sysfs_link_sibling(struct kernfs_node *kn) +static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; @@ -87,7 +87,7 @@ static int sysfs_link_sibling(struct kernfs_node *kn) pos = rb_to_kn(*node); parent = *node; - result = sysfs_sd_compare(kn, pos); + result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) @@ -102,7 +102,7 @@ static int sysfs_link_sibling(struct kernfs_node *kn) } /** - * sysfs_unlink_sibling - unlink kernfs_node from sibling rbtree + * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Unlink @kn from its sibling rbtree which starts from @@ -111,7 +111,7 @@ static int sysfs_link_sibling(struct kernfs_node *kn) * Locking: * mutex_lock(kernfs_mutex) */ -static void sysfs_unlink_sibling(struct kernfs_node *kn) +static void kernfs_unlink_sibling(struct kernfs_node *kn) { if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; @@ -120,7 +120,7 @@ static void sysfs_unlink_sibling(struct kernfs_node *kn) } /** - * sysfs_get_active - get an active reference to kernfs_node + * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn @@ -129,7 +129,7 @@ static void sysfs_unlink_sibling(struct kernfs_node *kn) * RETURNS: * Pointer to @kn on success, NULL on failure. */ -struct kernfs_node *sysfs_get_active(struct kernfs_node *kn) +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; @@ -143,13 +143,13 @@ struct kernfs_node *sysfs_get_active(struct kernfs_node *kn) } /** - * sysfs_put_active - put an active reference to kernfs_node + * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is NULL. */ -void sysfs_put_active(struct kernfs_node *kn) +void kernfs_put_active(struct kernfs_node *kn) { int v; @@ -170,12 +170,12 @@ void sysfs_put_active(struct kernfs_node *kn) } /** - * sysfs_deactivate - deactivate kernfs_node + * kernfs_deactivate - deactivate kernfs_node * @kn: kernfs_node to deactivate * * Deny new active references and drain existing ones. */ -static void sysfs_deactivate(struct kernfs_node *kn) +static void kernfs_deactivate(struct kernfs_node *kn) { DECLARE_COMPLETION_ONSTACK(wait); int v; @@ -235,9 +235,8 @@ void kernfs_put(struct kernfs_node *kn) */ parent = kn->parent; - WARN(!(kn->flags & KERNFS_REMOVED), - "sysfs: free using entry: %s/%s\n", - parent ? parent->name : "", kn->name); + WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", + parent ? 
parent->name : "", kn->name); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); @@ -265,13 +264,13 @@ void kernfs_put(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_put); -static int sysfs_dentry_delete(const struct dentry *dentry) +static int kernfs_dop_delete(const struct dentry *dentry) { struct kernfs_node *kn = dentry->d_fsdata; return !(kn && !(kn->flags & KERNFS_REMOVED)); } -static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; @@ -281,19 +280,19 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) kn = dentry->d_fsdata; mutex_lock(&kernfs_mutex); - /* The sysfs dirent has been deleted */ + /* The kernfs node has been deleted */ if (kn->flags & KERNFS_REMOVED) goto out_bad; - /* The sysfs dirent has been moved? */ + /* The kernfs node has been moved? */ if (dentry->d_parent->d_fsdata != kn->parent) goto out_bad; - /* The sysfs dirent has been renamed */ + /* The kernfs node has been renamed */ if (strcmp(dentry->d_name.name, kn->name) != 0) goto out_bad; - /* The sysfs dirent has been moved to a different namespace */ + /* The kernfs node has been moved to a different namespace */ if (kn->parent && kernfs_ns_enabled(kn->parent) && kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; @@ -302,9 +301,10 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) out_valid: return 1; out_bad: - /* Remove the dentry from the dcache hashes. + /* + * Remove the dentry from the dcache hashes. * If this is a deleted dentry we use d_drop instead of d_delete - * so sysfs doesn't need to cope with negative dentries. + * so kernfs doesn't need to cope with negative dentries. * * If this is a dentry that has simply been renamed we * use d_drop to remove it from the dcache lookup on its @@ -324,19 +324,19 @@ out_bad: return 0; } -static void sysfs_dentry_release(struct dentry *dentry) +static void kernfs_dop_release(struct dentry *dentry) { kernfs_put(dentry->d_fsdata); } const struct dentry_operations kernfs_dops = { - .d_revalidate = sysfs_dentry_revalidate, - .d_delete = sysfs_dentry_delete, - .d_release = sysfs_dentry_release, + .d_revalidate = kernfs_dop_revalidate, + .d_delete = kernfs_dop_delete, + .d_release = kernfs_dop_release, }; -struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, - const char *name, umode_t mode, int type) +struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, + umode_t mode, int type) { char *dup_name = NULL; struct kernfs_node *kn; @@ -374,7 +374,7 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, } /** - * sysfs_addrm_start - prepare for kernfs_node add/remove + * kernfs_addrm_start - prepare for kernfs_node add/remove * @acxt: pointer to kernfs_addrm_cxt to be used * * This function is called when the caller is about to add or remove @@ -385,7 +385,7 @@ struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, * Kernel thread context (may sleep). kernfs_mutex is locked on * return. 
*/ -void sysfs_addrm_start(struct kernfs_addrm_cxt *acxt) +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) __acquires(kernfs_mutex) { memset(acxt, 0, sizeof(*acxt)); @@ -394,7 +394,7 @@ void sysfs_addrm_start(struct kernfs_addrm_cxt *acxt) } /** - * sysfs_add_one - add kernfs_node to parent without warning + * kernfs_add_one - add kernfs_node to parent without warning * @acxt: addrm context to use * @kn: kernfs_node to be added * @parent: the parent kernfs_node to add @kn to @@ -404,17 +404,17 @@ void sysfs_addrm_start(struct kernfs_addrm_cxt *acxt) * of the parent. * * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). + * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed + * the same @acxt as passed to kernfs_addrm_start(). * * LOCKING: - * Determined by sysfs_addrm_start(). + * Determined by kernfs_addrm_start(). * * RETURNS: * 0 on success, -EEXIST if entry with the given name already * exists. */ -int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, struct kernfs_node *parent) { bool has_ns = kernfs_ns_enabled(parent); @@ -422,7 +422,7 @@ int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, int ret; if (has_ns != (bool)kn->ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, kn->name); return -EINVAL; } @@ -430,11 +430,11 @@ int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, if (kernfs_type(parent) != KERNFS_DIR) return -EINVAL; - kn->hash = sysfs_name_hash(kn->name, kn->ns); + kn->hash = kernfs_name_hash(kn->name, kn->ns); kn->parent = parent; kernfs_get(parent); - ret = sysfs_link_sibling(kn); + ret = kernfs_link_sibling(kn); if (ret) return ret; @@ -452,7 +452,7 @@ int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, } /** - * sysfs_remove_one - remove kernfs_node from parent + * kernfs_remove_one - remove kernfs_node from parent * @acxt: addrm context to use * @kn: kernfs_node to be removed * @@ -460,14 +460,14 @@ int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, * directory. @kn is unlinked from the children list. * * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). + * kernfs_addrm_start() and kernfs_addrm_finish() and should be + * passed the same @acxt as passed to kernfs_addrm_start(). * * LOCKING: - * Determined by sysfs_addrm_start(). + * Determined by kernfs_addrm_start(). */ -static void sysfs_remove_one(struct kernfs_addrm_cxt *acxt, - struct kernfs_node *kn) +static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) { struct kernfs_iattrs *ps_iattr; @@ -479,7 +479,7 @@ static void sysfs_remove_one(struct kernfs_addrm_cxt *acxt, return; if (kn->parent) { - sysfs_unlink_sibling(kn); + kernfs_unlink_sibling(kn); /* Update timestamps on the parent */ ps_iattr = kn->parent->iattr; @@ -495,20 +495,20 @@ static void sysfs_remove_one(struct kernfs_addrm_cxt *acxt, } /** - * sysfs_addrm_finish - finish up kernfs_node add/remove + * kernfs_addrm_finish - finish up kernfs_node add/remove * @acxt: addrm context to finish up * * Finish up kernfs_node add/remove. 
Resources acquired by - * sysfs_addrm_start() are released and removed kernfs_nodes are + * kernfs_addrm_start() are released and removed kernfs_nodes are * cleaned up. * * LOCKING: * kernfs_mutex is released. */ -void sysfs_addrm_finish(struct kernfs_addrm_cxt *acxt) +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) __releases(kernfs_mutex) { - /* release resources acquired by sysfs_addrm_start() */ + /* release resources acquired by kernfs_addrm_start() */ mutex_unlock(&kernfs_mutex); /* kill removed kernfs_nodes */ @@ -517,8 +517,8 @@ void sysfs_addrm_finish(struct kernfs_addrm_cxt *acxt) acxt->removed = kn->u.removed_list; - sysfs_deactivate(kn); - sysfs_unmap_bin_file(kn); + kernfs_deactivate(kn); + kernfs_unmap_bin_file(kn); kernfs_put(kn); } } @@ -543,18 +543,18 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, lockdep_assert_held(&kernfs_mutex); if (has_ns != (bool)ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, name); return NULL; } - hash = sysfs_name_hash(name, ns); + hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); - result = sysfs_name_compare(hash, name, ns, kn); + result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) @@ -607,7 +607,7 @@ struct kernfs_root *kernfs_create_root(void *priv) ida_init(&root->ino_ida); - kn = sysfs_new_dirent(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); + kn = kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); if (!kn) { ida_destroy(&root->ino_ida); kfree(root); @@ -654,7 +654,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, int rc; /* allocate */ - kn = sysfs_new_dirent(kernfs_root(parent), name, mode, KERNFS_DIR); + kn = kernfs_new_node(kernfs_root(parent), name, mode, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); @@ -663,9 +663,9 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, kn->priv = priv; /* link in */ - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, kn, parent); - sysfs_addrm_finish(&acxt); + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); if (!rc) return kn; @@ -674,8 +674,9 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, return ERR_PTR(rc); } -static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) +static struct dentry *kernfs_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) { struct dentry *ret = NULL; struct kernfs_node *parent = dentry->d_parent->d_fsdata; @@ -699,7 +700,7 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, dentry->d_fsdata = kn; /* attach dentry and inode */ - inode = sysfs_get_inode(dir->i_sb, kn); + inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) { ret = ERR_PTR(-ENOMEM); goto out_unlock; @@ -713,17 +714,17 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, } const struct inode_operations kernfs_dir_iops = { - .lookup = sysfs_lookup, - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, - .removexattr = sysfs_removexattr, - .getxattr = sysfs_getxattr, - .listxattr = sysfs_listxattr, + .lookup = kernfs_iop_lookup, + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = 
kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, }; -static struct kernfs_node *sysfs_leftmost_descendant(struct kernfs_node *pos) +static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) { struct kernfs_node *last; @@ -746,7 +747,7 @@ static struct kernfs_node *sysfs_leftmost_descendant(struct kernfs_node *pos) } /** - * sysfs_next_descendant_post - find the next descendant for post-order walk + * kernfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: kernfs_node whose descendants to walk * @@ -754,8 +755,8 @@ static struct kernfs_node *sysfs_leftmost_descendant(struct kernfs_node *pos) * descendants. @root is included in the iteration and the last node to be * visited. */ -static struct kernfs_node *sysfs_next_descendant_post(struct kernfs_node *pos, - struct kernfs_node *root) +static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, + struct kernfs_node *root) { struct rb_node *rbn; @@ -763,7 +764,7 @@ static struct kernfs_node *sysfs_next_descendant_post(struct kernfs_node *pos, /* if first iteration, visit leftmost descendant which may be root */ if (!pos) - return sysfs_leftmost_descendant(root); + return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) @@ -772,7 +773,7 @@ static struct kernfs_node *sysfs_next_descendant_post(struct kernfs_node *pos, /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) - return sysfs_leftmost_descendant(rb_to_kn(rbn)); + return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return pos->parent; @@ -786,14 +787,14 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, if (!kn) return; - pr_debug("sysfs %s: removing\n", kn->name); + pr_debug("kernfs %s: removing\n", kn->name); next = NULL; do { pos = next; - next = sysfs_next_descendant_post(pos, kn); + next = kernfs_next_descendant_post(pos, kn); if (pos) - sysfs_remove_one(acxt, pos); + kernfs_remove_one(acxt, pos); } while (next); } @@ -807,9 +808,9 @@ void kernfs_remove(struct kernfs_node *kn) { struct kernfs_addrm_cxt acxt; - sysfs_addrm_start(&acxt); + kernfs_addrm_start(&acxt); __kernfs_remove(&acxt, kn); - sysfs_addrm_finish(&acxt); + kernfs_addrm_finish(&acxt); } /** @@ -828,18 +829,18 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, struct kernfs_node *kn; if (!parent) { - WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", + WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", name); return -ENOENT; } - sysfs_addrm_start(&acxt); + kernfs_addrm_start(&acxt); kn = kernfs_find_ns(parent, name, ns); if (kn) __kernfs_remove(&acxt, kn); - sysfs_addrm_finish(&acxt); + kernfs_addrm_finish(&acxt); if (kn) return 0; @@ -884,13 +885,13 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, /* * Move to the appropriate place in the appropriate directories rbtree. 
*/ - sysfs_unlink_sibling(kn); + kernfs_unlink_sibling(kn); kernfs_get(new_parent); kernfs_put(kn->parent); kn->ns = new_ns; - kn->hash = sysfs_name_hash(kn->name, kn->ns); + kn->hash = kernfs_name_hash(kn->name, kn->ns); kn->parent = new_parent; - sysfs_link_sibling(kn); + kernfs_link_sibling(kn); error = 0; out: @@ -904,13 +905,13 @@ static inline unsigned char dt_type(struct kernfs_node *kn) return (kn->mode >> 12) & 15; } -static int sysfs_dir_release(struct inode *inode, struct file *filp) +static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); return 0; } -static struct kernfs_node *sysfs_dir_pos(const void *ns, +static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { @@ -944,10 +945,10 @@ static struct kernfs_node *sysfs_dir_pos(const void *ns, return pos; } -static struct kernfs_node *sysfs_dir_next_pos(const void *ns, +static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { - pos = sysfs_dir_pos(ns, parent, ino, pos); + pos = kernfs_dir_pos(ns, parent, ino, pos); if (pos) do { struct rb_node *node = rb_next(&pos->rb); @@ -959,7 +960,7 @@ static struct kernfs_node *sysfs_dir_next_pos(const void *ns, return pos; } -static int sysfs_readdir(struct file *file, struct dir_context *ctx) +static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = dentry->d_fsdata; @@ -973,9 +974,9 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; - for (pos = sysfs_dir_pos(ns, parent, ctx->pos, pos); + for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; - pos = sysfs_dir_next_pos(ns, parent, ctx->pos, pos)) { + pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); @@ -996,7 +997,8 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx) return 0; } -static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, + int whence) { struct inode *inode = file_inode(file); loff_t ret; @@ -1010,7 +1012,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, - .iterate = sysfs_readdir, - .release = sysfs_dir_release, - .llseek = sysfs_dir_llseek, + .iterate = kernfs_fop_readdir, + .release = kernfs_dir_fop_release, + .llseek = kernfs_dir_fop_llseek, }; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 32364ddb24d..053cfd9a6a4 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -64,7 +64,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); @@ -104,7 +104,7 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v) if (ops->seq_stop) ops->seq_stop(sf, v); - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); mutex_unlock(&of->mutex); } @@ -147,7 +147,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); - if (!sysfs_get_active(of->kn)) { + if (!kernfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; @@ -159,7 +159,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, else len = -EINVAL; - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) @@ -178,14 +178,14 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, } /** - * kernfs_file_read - kernfs vfs read callback + * kernfs_fop_read - kernfs vfs read callback * @file: file pointer * @user_buf: data to write * @count: number of bytes * @ppos: starting offset */ -static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { struct kernfs_open_file *of = kernfs_of(file); @@ -196,7 +196,7 @@ static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, } /** - * kernfs_file_write - kernfs vfs write callback + * kernfs_fop_write - kernfs vfs write callback * @file: file pointer * @user_buf: data to write * @count: number of bytes @@ -211,8 +211,8 @@ static ssize_t kernfs_file_read(struct file *file, char __user *user_buf, * modify only the the value you're changing, then write entire buffer * back. */ -static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { struct kernfs_open_file *of = kernfs_of(file); ssize_t len = min_t(size_t, count, PAGE_SIZE); @@ -234,7 +234,7 @@ static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); - if (!sysfs_get_active(of->kn)) { + if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; @@ -246,7 +246,7 @@ static ssize_t kernfs_file_write(struct file *file, const char __user *user_buf, else len = -EINVAL; - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) @@ -264,13 +264,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma) if (!of->vm_ops) return; - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); } static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -282,14 +282,14 @@ static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vma, vmf); - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); return ret; } @@ -303,7 +303,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; @@ -312,7 +312,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, else file_update_time(file); - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); return ret; } @@ -326,14 +326,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, if (!of->vm_ops) return -EINVAL; - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); return ret; } @@ -348,14 +348,14 @@ static int kernfs_vma_set_policy(struct vm_area_struct *vma, if (!of->vm_ops) return 0; - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) return -EINVAL; ret = 0; if (of->vm_ops->set_policy) ret = of->vm_ops->set_policy(vma, new); - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); return ret; } @@ -369,14 +369,14 @@ static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, if (!of->vm_ops) return vma->vm_policy; - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) return vma->vm_policy; pol = vma->vm_policy; if (of->vm_ops->get_policy) pol = of->vm_ops->get_policy(vma, addr); - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); return pol; } @@ -391,14 +391,14 @@ static int kernfs_vma_migrate(struct vm_area_struct *vma, if (!of->vm_ops) return 0; - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) return 0; ret = 0; if (of->vm_ops->migrate) ret = of->vm_ops->migrate(vma, from, to, flags); - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); return ret; } #endif @@ -415,7 +415,7 @@ static const struct vm_operations_struct kernfs_vm_ops = { #endif }; -static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; @@ -434,7 +434,7 @@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&of->mutex); rc = -ENODEV; - if (!sysfs_get_active(of->kn)) + if (!kernfs_get_active(of->kn)) goto out_unlock; ops = kernfs_ops(of->kn); @@ -465,7 +465,7 
@@ static int kernfs_file_mmap(struct file *file, struct vm_area_struct *vma) of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: - sysfs_put_active(of->kn); + kernfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); @@ -473,7 +473,7 @@ out_unlock: } /** - * sysfs_get_open_dirent - get or create kernfs_open_node + * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * @@ -486,8 +486,8 @@ out_unlock: * RETURNS: * 0 on success, -errno on failure. */ -static int sysfs_get_open_dirent(struct kernfs_node *kn, - struct kernfs_open_file *of) +static int kernfs_get_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) { struct kernfs_open_node *on, *new_on = NULL; @@ -527,7 +527,7 @@ static int sysfs_get_open_dirent(struct kernfs_node *kn, } /** - * sysfs_put_open_dirent - put kernfs_open_node + * kernfs_put_open_node - put kernfs_open_node * @kn: target kernfs_nodet * @of: associated kernfs_open_file * @@ -537,8 +537,8 @@ static int sysfs_get_open_dirent(struct kernfs_node *kn, * LOCKING: * None. */ -static void sysfs_put_open_dirent(struct kernfs_node *kn, - struct kernfs_open_file *of) +static void kernfs_put_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) { struct kernfs_open_node *on = kn->attr.open; unsigned long flags; @@ -560,7 +560,7 @@ static void sysfs_put_open_dirent(struct kernfs_node *kn, kfree(on); } -static int kernfs_file_open(struct inode *inode, struct file *file) +static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file->f_path.dentry->d_fsdata; const struct kernfs_ops *ops; @@ -568,7 +568,7 @@ static int kernfs_file_open(struct inode *inode, struct file *file) bool has_read, has_write, has_mmap; int error = -EACCES; - if (!sysfs_get_active(kn)) + if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); @@ -633,13 +633,13 @@ static int kernfs_file_open(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; - /* make sure we have open dirent struct */ - error = sysfs_get_open_dirent(kn, of); + /* make sure we have open node struct */ + error = kernfs_get_open_node(kn, of); if (error) goto err_close; /* open succeeded, put active references */ - sysfs_put_active(kn); + kernfs_put_active(kn); return 0; err_close: @@ -647,23 +647,23 @@ err_close: err_free: kfree(of); err_out: - sysfs_put_active(kn); + kernfs_put_active(kn); return error; } -static int kernfs_file_release(struct inode *inode, struct file *filp) +static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; struct kernfs_open_file *of = kernfs_of(filp); - sysfs_put_open_dirent(kn, of); + kernfs_put_open_node(kn, of); seq_release(inode, filp); kfree(of); return 0; } -void sysfs_unmap_bin_file(struct kernfs_node *kn) +void kernfs_unmap_bin_file(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; @@ -686,10 +686,11 @@ void sysfs_unmap_bin_file(struct kernfs_node *kn) } mutex_unlock(&kernfs_open_file_mutex); - sysfs_put_open_dirent(kn, NULL); + kernfs_put_open_node(kn, NULL); } -/* Sysfs attribute files are pollable. The idea is that you read +/* + * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. 
When the content changes (assuming the * manager for the kobject supports notification), poll will @@ -702,19 +703,19 @@ void sysfs_unmap_bin_file(struct kernfs_node *kn) * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ -static unsigned int kernfs_file_poll(struct file *filp, poll_table *wait) +static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; struct kernfs_open_node *on = kn->attr.open; /* need parent for the kobj, grab both */ - if (!sysfs_get_active(kn)) + if (!kernfs_get_active(kn)) goto trigger; poll_wait(filp, &on->poll, wait); - sysfs_put_active(kn); + kernfs_put_active(kn); if (of->event != atomic_read(&on->event)) goto trigger; @@ -751,13 +752,13 @@ void kernfs_notify(struct kernfs_node *kn) EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { - .read = kernfs_file_read, - .write = kernfs_file_write, + .read = kernfs_fop_read, + .write = kernfs_fop_write, .llseek = generic_file_llseek, - .mmap = kernfs_file_mmap, - .open = kernfs_file_open, - .release = kernfs_file_release, - .poll = kernfs_file_poll, + .mmap = kernfs_fop_mmap, + .open = kernfs_fop_open, + .release = kernfs_fop_release, + .poll = kernfs_fop_poll, }; /** @@ -784,8 +785,8 @@ struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, struct kernfs_node *kn; int rc; - kn = sysfs_new_dirent(kernfs_root(parent), name, - (mode & S_IALLUGO) | S_IFREG, KERNFS_FILE); + kn = kernfs_new_node(kernfs_root(parent), name, + (mode & S_IALLUGO) | S_IFREG, KERNFS_FILE); if (!kn) return ERR_PTR(-ENOMEM); @@ -811,9 +812,9 @@ struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, kn, parent); - sysfs_addrm_finish(&acxt); + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); if (rc) { kernfs_put(kn); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index c5f231e8d36..e55126f85bd 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -31,16 +31,16 @@ static struct backing_dev_info kernfs_bdi = { }; static const struct inode_operations kernfs_iops = { - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, - .removexattr = sysfs_removexattr, - .getxattr = sysfs_getxattr, - .listxattr = sysfs_listxattr, + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, }; -void __init sysfs_inode_init(void) +void __init kernfs_inode_init(void) { if (bdi_init(&kernfs_bdi)) panic("failed to init kernfs_bdi"); @@ -115,7 +115,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) return ret; } -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; struct kernfs_node *kn = dentry->d_fsdata; @@ -141,8 +141,8 @@ out: return error; } -static int sysfs_sd_setsecdata(struct kernfs_node *kn, void **secdata, - u32 *secdata_len) +static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, + u32 *secdata_len) { struct 
kernfs_iattrs *attrs; void *old_secdata; @@ -163,8 +163,8 @@ static int sysfs_sd_setsecdata(struct kernfs_node *kn, void **secdata, return 0; } -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct kernfs_node *kn = dentry->d_fsdata; struct kernfs_iattrs *attrs; @@ -188,7 +188,7 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, return error; mutex_lock(&kernfs_mutex); - error = sysfs_sd_setsecdata(kn, &secdata, &secdata_len); + error = kernfs_node_setsecdata(kn, &secdata, &secdata_len); mutex_unlock(&kernfs_mutex); if (secdata) @@ -202,7 +202,7 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, return -EINVAL; } -int sysfs_removexattr(struct dentry *dentry, const char *name) +int kernfs_iop_removexattr(struct dentry *dentry, const char *name) { struct kernfs_node *kn = dentry->d_fsdata; struct kernfs_iattrs *attrs; @@ -214,8 +214,8 @@ int sysfs_removexattr(struct dentry *dentry, const char *name) return simple_xattr_remove(&attrs->xattrs, name); } -ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, - size_t size) +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) { struct kernfs_node *kn = dentry->d_fsdata; struct kernfs_iattrs *attrs; @@ -227,7 +227,7 @@ ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, return simple_xattr_get(&attrs->xattrs, name, buf, size); } -ssize_t sysfs_listxattr(struct dentry *dentry, char *buf, size_t size) +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = dentry->d_fsdata; struct kernfs_iattrs *attrs; @@ -254,7 +254,7 @@ static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) inode->i_ctime = iattr->ia_ctime; } -static void sysfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) +static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { struct kernfs_iattrs *attrs = kn->iattr; @@ -273,21 +273,21 @@ static void sysfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) set_nlink(inode, kn->dir.subdirs + 2); } -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct kernfs_node *kn = dentry->d_fsdata; struct inode *inode = dentry->d_inode; mutex_lock(&kernfs_mutex); - sysfs_refresh_inode(kn, inode); + kernfs_refresh_inode(kn, inode); mutex_unlock(&kernfs_mutex); generic_fillattr(inode, stat); return 0; } -static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) +static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; @@ -296,7 +296,7 @@ static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_op = &kernfs_iops; set_default_inode_attr(inode, kn->mode); - sysfs_refresh_inode(kn, inode); + kernfs_refresh_inode(kn, inode); /* initialize inode according to type */ switch (kernfs_type(kn)) { @@ -319,7 +319,7 @@ static void sysfs_init_inode(struct kernfs_node *kn, struct inode *inode) } /** - * sysfs_get_inode - get inode for kernfs_node + * kernfs_get_inode - get inode for kernfs_node * @sb: super block * @kn: kernfs_node to allocate inode for * @@ -333,25 +333,25 @@ static void sysfs_init_inode(struct 
kernfs_node *kn, struct inode *inode) * RETURNS: * Pointer to allocated inode on success, NULL on failure. */ -struct inode *sysfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; inode = iget_locked(sb, kn->ino); if (inode && (inode->i_state & I_NEW)) - sysfs_init_inode(kn, inode); + kernfs_init_inode(kn, inode); return inode; } /* - * The kernfs_node serves as both an inode and a directory entry for sysfs. - * To prevent the sysfs inode numbers from being freed prematurely we take - * a reference to kernfs_node from the sysfs inode. A + * The kernfs_node serves as both an inode and a directory entry for + * kernfs. To prevent the kernfs inode numbers from being freed + * prematurely we take a reference to kernfs_node from the kernfs inode. A * super_operations.evict_inode() implementation is needed to drop that * reference upon inode destruction. */ -void sysfs_evict_inode(struct inode *inode) +void kernfs_evict_inode(struct inode *inode) { struct kernfs_node *kn = inode->i_private; @@ -360,7 +360,7 @@ void sysfs_evict_inode(struct inode *inode) kernfs_put(kn); } -int sysfs_permission(struct inode *inode, int mask) +int kernfs_iop_permission(struct inode *inode, int mask) { struct kernfs_node *kn; @@ -370,7 +370,7 @@ int sysfs_permission(struct inode *inode, int mask) kn = inode->i_private; mutex_lock(&kernfs_mutex); - sysfs_refresh_inode(kn, inode); + kernfs_refresh_inode(kn, inode); mutex_unlock(&kernfs_mutex); return generic_permission(inode, mask); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index e62e8ec15d6..a4ff491fd59 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -76,19 +76,19 @@ extern struct kmem_cache *kernfs_node_cache; /* * inode.c */ -struct inode *sysfs_get_inode(struct super_block *sb, struct kernfs_node *kn); -void sysfs_evict_inode(struct inode *inode); -int sysfs_permission(struct inode *inode, int mask); -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int sysfs_removexattr(struct dentry *dentry, const char *name); -ssize_t sysfs_getxattr(struct dentry *dentry, const char *name, void *buf, - size_t size); -ssize_t sysfs_listxattr(struct dentry *dentry, char *buf, size_t size); -void sysfs_inode_init(void); +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); +void kernfs_evict_inode(struct inode *inode); +int kernfs_iop_permission(struct inode *inode, int mask); +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int kernfs_iop_removexattr(struct dentry *dentry, const char *name); +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); +void kernfs_inode_init(void); /* * dir.c @@ -98,21 +98,21 @@ extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; -struct kernfs_node *sysfs_get_active(struct kernfs_node *kn); -void 
sysfs_put_active(struct kernfs_node *kn); -void sysfs_addrm_start(struct kernfs_addrm_cxt *acxt); -int sysfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, - struct kernfs_node *parent); -void sysfs_addrm_finish(struct kernfs_addrm_cxt *acxt); -struct kernfs_node *sysfs_new_dirent(struct kernfs_root *root, - const char *name, umode_t mode, int type); +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); +void kernfs_put_active(struct kernfs_node *kn); +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, + struct kernfs_node *parent); +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); +struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, + umode_t mode, int type); /* * file.c */ extern const struct file_operations kernfs_file_fops; -void sysfs_unmap_bin_file(struct kernfs_node *kn); +void kernfs_unmap_bin_file(struct kernfs_node *kn); /* * symlink.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 27d967ba0bb..0d6ce895a9e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -22,10 +22,10 @@ struct kmem_cache *kernfs_node_cache; static const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .evict_inode = sysfs_evict_inode, + .evict_inode = kernfs_evict_inode, }; -static int sysfs_fill_super(struct super_block *sb) +static int kernfs_fill_super(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -39,10 +39,10 @@ static int sysfs_fill_super(struct super_block *sb) /* get root inode, initialize and unlock it */ mutex_lock(&kernfs_mutex); - inode = sysfs_get_inode(sb, info->root->kn); + inode = kernfs_get_inode(sb, info->root->kn); mutex_unlock(&kernfs_mutex); if (!inode) { - pr_debug("sysfs: could not get root inode\n"); + pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; } @@ -59,7 +59,7 @@ static int sysfs_fill_super(struct super_block *sb) return 0; } -static int sysfs_test_super(struct super_block *sb, void *data) +static int kernfs_test_super(struct super_block *sb, void *data) { struct kernfs_super_info *sb_info = kernfs_info(sb); struct kernfs_super_info *info = data; @@ -67,7 +67,7 @@ static int sysfs_test_super(struct super_block *sb, void *data) return sb_info->root == info->root && sb_info->ns == info->ns; } -static int sysfs_set_super(struct super_block *sb, void *data) +static int kernfs_set_super(struct super_block *sb, void *data) { int error; error = set_anon_super(sb, data); @@ -117,13 +117,13 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; - sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); + sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); if (IS_ERR(sb) || sb->s_fs_info != info) kfree(info); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { - error = sysfs_fill_super(sb); + error = kernfs_fill_super(sb); if (error) { deactivate_locked_super(sb); return ERR_PTR(error); @@ -161,5 +161,5 @@ void __init kernfs_init(void) kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); - sysfs_inode_init(); + kernfs_inode_init(); } diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 4105bd04ea2..a03e26036ef 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -30,8 +30,8 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, struct 
kernfs_addrm_cxt acxt; int error; - kn = sysfs_new_dirent(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, - KERNFS_LINK); + kn = kernfs_new_node(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, + KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); @@ -40,9 +40,9 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ - sysfs_addrm_start(&acxt); - error = sysfs_add_one(&acxt, kn, parent); - sysfs_addrm_finish(&acxt); + kernfs_addrm_start(&acxt); + error = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); if (!error) return kn; @@ -51,8 +51,8 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, return ERR_PTR(error); } -static int sysfs_get_target_path(struct kernfs_node *parent, - struct kernfs_node *target, char *path) +static int kernfs_get_target_path(struct kernfs_node *parent, + struct kernfs_node *target, char *path) { struct kernfs_node *base, *kn; char *s = path; @@ -103,7 +103,7 @@ static int sysfs_get_target_path(struct kernfs_node *parent, return 0; } -static int sysfs_getlink(struct dentry *dentry, char *path) +static int kernfs_getlink(struct dentry *dentry, char *path) { struct kernfs_node *kn = dentry->d_fsdata; struct kernfs_node *parent = kn->parent; @@ -111,18 +111,18 @@ static int sysfs_getlink(struct dentry *dentry, char *path) int error; mutex_lock(&kernfs_mutex); - error = sysfs_get_target_path(parent, target, path); + error = kernfs_get_target_path(parent, target, path); mutex_unlock(&kernfs_mutex); return error; } -static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) { int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); if (page) { - error = sysfs_getlink(dentry, (char *) page); + error = kernfs_getlink(dentry, (char *) page); if (error < 0) free_page((unsigned long)page); } @@ -130,8 +130,8 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) +static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) { char *page = nd_get_link(nd); if (!IS_ERR(page)) @@ -139,14 +139,14 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, } const struct inode_operations kernfs_symlink_iops = { - .setxattr = sysfs_setxattr, - .removexattr = sysfs_removexattr, - .getxattr = sysfs_getxattr, - .listxattr = sysfs_listxattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, - .follow_link = sysfs_follow_link, - .put_link = sysfs_put_link, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .permission = sysfs_permission, + .follow_link = kernfs_iop_follow_link, + .put_link = kernfs_iop_put_link, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .permission = kernfs_iop_permission, }; -- cgit v1.2.3-70-g09d2 From c4fa6d7c5971baa45eca4f81b8f100299848486a Mon Sep 17 00:00:00 2001 From: Stanislav Kholmanskikh Date: Wed, 11 Dec 2013 14:16:36 +0400 Subject: nfsd: revoking of suid/sgid bits after chown() in a consistent way There is an inconsistency in the handling of SUID/SGID file bits after chown() between NFS and other local file systems. 
Local file systems (for example, ext3, ext4, xfs, btrfs) revoke SUID/SGID bits after chown() on a regular file even if the owner/group of the file has not been changed: ~# touch file; chmod ug+s file; chmod u+x file ~# ls -l file -rwsr-Sr-- 1 root root 0 Dec 6 04:49 file ~# chown root file; ls -l file -rwxr-Sr-- 1 root root 0 Dec 6 04:49 file but NFS doesn't do that: ~# touch file; chmod ug+s file; chmod u+x file ~# ls -l file -rwsr-Sr-- 1 root root 0 Dec 6 04:49 file ~# chown root file; ls -l file -rwsr-Sr-- 1 root root 0 Dec 6 04:49 file NFS does that only if the owner/group has been changed: ~# touch file; chmod ug+s file; chmod u+x file ~# ls -l file -rwsr-Sr-- 1 root root 0 Dec 6 05:02 file ~# chown bin file; ls -l file -rwxr-Sr-- 1 bin root 0 Dec 6 05:02 file See: http://pubs.opengroup.org/onlinepubs/9699919799/functions/chown.html "If the specified file is a regular file, one or more of the S_IXUSR, S_IXGRP, or S_IXOTH bits of the file mode are set, and the process has appropriate privileges, it is implementation-defined whether the set-user-ID and set-group-ID bits are altered." So both variants are acceptable under POSIX. This patch makes NFS behave like local file systems. Signed-off-by: Stanislav Kholmanskikh Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7eea63cada1..c8aa6ff9925 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -348,8 +348,7 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) /* Revoke setuid/setgid on chown */ if (!S_ISDIR(inode->i_mode) && - (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || - ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) { + ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ -- cgit v1.2.3-70-g09d2 From 9597df6b26a1988a5a04762711149f98ec6ab388 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:00:42 +1100 Subject: xfs: remove duplicate code in xlog_cil_insert_format_items Share code that was previously duplicated in two branches.
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_cil.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5eb51fc5eb8..0a7a8cef601 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -254,29 +254,22 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_buf_len; - - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - lv->lv_buf = (char *)lv + buf_size - nbytes; - - lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); - goto insert; + } else { + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; + } + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); - lv->lv_item = lip; - lv->lv_size = buf_size; + /* Ensure the lv is set up according to ->iop_size */ lv->lv_niovecs = niovecs; - if (ordered) { - /* track as an ordered logvec */ - ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; - goto insert; - } - - /* The allocated iovec region lies beyond the log vector. */ - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; /* The allocated data region lies beyond the iovec region */ lv->lv_buf = (char *)lv + buf_size - nbytes; -- cgit v1.2.3-70-g09d2 From 7aeb72224120e0c49ba4c93d75f8f0d6a87f6afd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:00:43 +1100 Subject: xfs: refactor xfs_buf_item_format_segment Add two helpers to make the code more readable. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf_item.c | 72 ++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a64f67ba25d..a30c1fb1bec 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -182,6 +182,34 @@ xfs_buf_item_size( trace_xfs_buf_item_size(bip); } +static inline struct xfs_log_iovec * +xfs_buf_item_copy_iovec( + struct xfs_log_iovec *vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + + vecp->i_type = XLOG_REG_TYPE_BCHUNK; + vecp->i_addr = xfs_buf_offset(bp, offset); + vecp->i_len = nbits * XFS_BLF_CHUNK; + return vecp + 1; +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + static struct xfs_log_iovec * xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, @@ -196,7 +224,6 @@ xfs_buf_item_format_segment( int last_bit; int next_bit; uint nbits; - uint buffer_offset; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; @@ -239,7 +266,6 @@ xfs_buf_item_format_segment( /* * Fill in an iovec for each set of contiguous chunks. 
*/ - last_bit = first_bit; nbits = 1; for (;;) { @@ -252,42 +278,22 @@ xfs_buf_item_format_segment( next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)last_bit + 1); /* - * If we run out of bits fill in the last iovec and get - * out of the loop. - * Else if we start a new set of bits then fill in the - * iovec for the series we were looking at and start - * counting the bits in the new one. - * Else we're still in the same set of bits so just - * keep counting and scanning. + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; + xfs_buf_item_copy_iovec(vecp, bp, offset, + first_bit, nbits); nvecs++; break; - } else if (next_bit != last_bit + 1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else if (xfs_buf_offset(bp, offset + - (next_bit << XFS_BLF_SHIFT)) != - (xfs_buf_offset(bp, offset + - (last_bit << XFS_BLF_SHIFT)) + - XFS_BLF_CHUNK)) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + vecp = xfs_buf_item_copy_iovec(vecp, bp, offset, + first_bit, nbits); nvecs++; - vecp++; first_bit = next_bit; last_bit = next_bit; nbits = 1; -- cgit v1.2.3-70-g09d2 From ce9641d6c981aad0463b2d1455f0b60e5c8671c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:00:43 +1100 Subject: xfs: refactor xfs_inode_item_size Split out two helpers to size the data and attribute to make the function more readable. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode_item.c | 62 ++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7c0d391f9a6..050d2540f7b 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -39,27 +39,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } - -/* - * This returns the number of iovecs needed to log the given inode item. - * - * We need one iovec for the inode log format structure, one for the - * inode core, and possibly one for the inode data/extents/b-tree root - * and one for the inode attribute data/extents/b-tree root. 
- */ STATIC void -xfs_inode_item_size( - struct xfs_log_item *lip, +xfs_inode_item_data_fork_size( + struct xfs_inode_log_item *iip, int *nvecs, int *nbytes) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - *nvecs += 2; - *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_icdinode_size(ip->i_d.di_version); - switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && @@ -70,7 +57,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { @@ -78,7 +64,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { @@ -90,19 +75,20 @@ xfs_inode_item_size( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_UUID: break; - default: ASSERT(0); break; } +} - if (!XFS_IFORK_Q(ip)) - return; - +STATIC void +xfs_inode_item_attr_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; - /* - * Log any necessary attribute data. - */ switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && @@ -113,7 +99,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && ip->i_afp->if_broot_bytes > 0) { @@ -121,7 +106,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { @@ -129,13 +113,37 @@ xfs_inode_item_size( *nvecs += 1; } break; - default: ASSERT(0); break; } } +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); + + xfs_inode_item_data_fork_size(iip, nvecs, nbytes); + if (XFS_IFORK_Q(ip)) + xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); +} + /* * xfs_inode_item_format_extents - convert in-core extents to on-disk form * -- cgit v1.2.3-70-g09d2 From 3de559fbd04d67473b9be2bd183823c40c4b7557 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:00:43 +1100 Subject: xfs: refactor xfs_inode_item_format Split out a function to handle the data and attr fork, as well as a helper for the really old v1 inodes. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode_item.c | 165 ++++++++++++++++++++++++++---------------------- 1 file changed, 89 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 050d2540f7b..2ad12dcf831 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -180,63 +180,41 @@ xfs_inode_item_format_extents( } /* - * This is called to fill in the vector of log iovecs for the - * given inode log item. 
It fills the first item with an inode - * log format structure, the second with the on-disk inode structure, - * and a possible third and/or fourth with the inode data/extents/b-tree - * root and inode attributes data/extents/b-tree root. + * If this is a v1 format inode, then we need to log it as such. This means + * that we have to copy the link count from the new field to the old. We + * don't have to worry about the new fields, because nothing trusts them as + * long as the old inode version number is there. */ STATIC void -xfs_inode_item_format( - struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) +xfs_inode_item_format_v1_inode( + struct xfs_inode *ip) +{ + if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + ip->i_d.di_onlink = ip->i_d.di_nlink; + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. + */ + ip->i_d.di_version = 2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + } +} + +STATIC struct xfs_log_iovec * +xfs_inode_item_format_data_fork( + struct xfs_inode_log_item *iip, + struct xfs_log_iovec *vecp, + int *nvecs) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - uint nvecs; size_t data_bytes; - xfs_mount_t *mp; - - vecp->i_addr = &iip->ili_format; - vecp->i_len = sizeof(xfs_inode_log_format_t); - vecp->i_type = XLOG_REG_TYPE_IFORMAT; - vecp++; - nvecs = 1; - - vecp->i_addr = &ip->i_d; - vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); - vecp->i_type = XLOG_REG_TYPE_ICORE; - vecp++; - nvecs++; - - /* - * If this is really an old format inode, then we need to - * log it as such. This means that we have to copy the link - * count from the new field to the old. We don't have to worry - * about the new fields, because nothing trusts them as long as - * the old inode version number is there. If the superblock already - * has a new version number, then we don't bother converting back. - */ - mp = ip->i_mount; - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - ip->i_d.di_onlink = ip->i_d.di_nlink; - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. 
- */ - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - } - } switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -271,12 +249,11 @@ xfs_inode_item_format( ASSERT(vecp->i_len <= ip->i_df.if_bytes); iip->ili_format.ilf_dsize = vecp->i_len; vecp++; - nvecs++; + (*nvecs)++; } else { iip->ili_fields &= ~XFS_ILOG_DEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | @@ -289,7 +266,7 @@ xfs_inode_item_format( vecp->i_len = ip->i_df.if_broot_bytes; vecp->i_type = XLOG_REG_TYPE_IBROOT; vecp++; - nvecs++; + (*nvecs)++; iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; } else { ASSERT(!(iip->ili_fields & @@ -297,7 +274,6 @@ xfs_inode_item_format( iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | @@ -319,13 +295,12 @@ xfs_inode_item_format( vecp->i_len = (int)data_bytes; vecp->i_type = XLOG_REG_TYPE_ILOCAL; vecp++; - nvecs++; + (*nvecs)++; iip->ili_format.ilf_dsize = (unsigned)data_bytes; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; } break; - case XFS_DINODE_FMT_DEV: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | @@ -335,7 +310,6 @@ xfs_inode_item_format( ip->i_df.if_u2.if_rdev; } break; - case XFS_DINODE_FMT_UUID: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | @@ -345,20 +319,22 @@ xfs_inode_item_format( ip->i_df.if_u2.if_uuid; } break; - default: ASSERT(0); break; } - /* - * If there are no attributes associated with the file, then we're done. - */ - if (!XFS_IFORK_Q(ip)) { - iip->ili_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); - goto out; - } + return vecp; +} + +STATIC struct xfs_log_iovec * +xfs_inode_item_format_attr_fork( + struct xfs_inode_log_item *iip, + struct xfs_log_iovec *vecp, + int *nvecs) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: @@ -386,12 +362,11 @@ xfs_inode_item_format( #endif iip->ili_format.ilf_asize = vecp->i_len; vecp++; - nvecs++; + (*nvecs)++; } else { iip->ili_fields &= ~XFS_ILOG_AEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); @@ -404,13 +379,12 @@ xfs_inode_item_format( vecp->i_len = ip->i_afp->if_broot_bytes; vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; vecp++; - nvecs++; + (*nvecs)++; iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); @@ -431,19 +405,59 @@ xfs_inode_item_format( vecp->i_len = (int)data_bytes; vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; vecp++; - nvecs++; + (*nvecs)++; iip->ili_format.ilf_asize = (unsigned)data_bytes; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; } break; - default: ASSERT(0); break; } -out: + return vecp; +} + +/* + * This is called to fill in the vector of log iovecs for the given inode + * log item. It fills the first item with an inode log format structure, + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. 
+ */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + uint nvecs; + + vecp->i_addr = &iip->ili_format; + vecp->i_len = sizeof(xfs_inode_log_format_t); + vecp->i_type = XLOG_REG_TYPE_IFORMAT; + vecp++; + nvecs = 1; + + vecp->i_addr = &ip->i_d; + vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); + vecp->i_type = XLOG_REG_TYPE_ICORE; + vecp++; + nvecs++; + + if (ip->i_d.di_version == 1) + xfs_inode_item_format_v1_inode(ip); + + vecp = xfs_inode_item_format_data_fork(iip, vecp, &nvecs); + if (XFS_IFORK_Q(ip)) { + vecp = xfs_inode_item_format_attr_fork(iip, vecp, &nvecs); + } else { + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + } + /* * Now update the log format that goes out to disk from the in-core * values. We always write the inode core to make the arithmetic @@ -455,7 +469,6 @@ out: iip->ili_format.ilf_size = nvecs; } - /* * This is called to pin the inode associated with the inode log * item in memory so it cannot be written out. -- cgit v1.2.3-70-g09d2 From 1234351cba958cd5d4338172ccfc869a687cd736 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:00:43 +1100 Subject: xfs: introduce xlog_copy_iovec Add a helper to abstract out filling the log iovecs in the log item format handlers. This will allow us to change the way we do the log item formatting more easily. The copy in the name is a bit confusing for now as it just assigns a pointer and lets the CIL code perform the copy, but that will change soon. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf_item.c | 30 +++++------- fs/xfs/xfs_dquot_item.c | 25 +++++----- fs/xfs/xfs_extfree_item.c | 19 ++++---- fs/xfs/xfs_icreate_item.c | 9 ++-- fs/xfs/xfs_inode_item.c | 115 +++++++++++++++++++++------------------------- fs/xfs/xfs_log.h | 13 ++++++ 6 files changed, 103 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a30c1fb1bec..d49419d4bb4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -182,20 +182,18 @@ xfs_buf_item_size( trace_xfs_buf_item_size(bip); } -static inline struct xfs_log_iovec * +static inline void xfs_buf_item_copy_iovec( - struct xfs_log_iovec *vecp, + struct xfs_log_iovec **vecp, struct xfs_buf *bp, uint offset, int first_bit, uint nbits) { offset += first_bit * XFS_BLF_CHUNK; - - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - vecp->i_addr = xfs_buf_offset(bp, offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - return vecp + 1; + xlog_copy_iovec(vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); } static inline bool @@ -210,10 +208,10 @@ xfs_buf_item_straddle( XFS_BLF_CHUNK); } -static struct xfs_log_iovec * +static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, - struct xfs_log_iovec *vecp, + struct xfs_log_iovec **vecp, uint offset, struct xfs_buf_log_format *blfp) { @@ -245,10 +243,7 @@ xfs_buf_item_format_segment( goto out; } - vecp->i_addr = blfp; - vecp->i_len = base_size; - vecp->i_type = XLOG_REG_TYPE_BFORMAT; - vecp++; + xlog_copy_iovec(vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); nvecs = 1; if (bip->bli_flags & XFS_BLI_STALE) { @@ -291,8 +286,8 @@ xfs_buf_item_format_segment( break; } else if (next_bit != last_bit + 1 || xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { - vecp = xfs_buf_item_copy_iovec(vecp, bp, offset, - 
first_bit, nbits); + xfs_buf_item_copy_iovec(vecp, bp, offset, + first_bit, nbits); nvecs++; first_bit = next_bit; last_bit = next_bit; @@ -304,7 +299,6 @@ xfs_buf_item_format_segment( } out: blfp->blf_size = nvecs; - return vecp; } /* @@ -360,8 +354,8 @@ xfs_buf_item_format( } for (i = 0; i < bip->bli_format_count; i++) { - vecp = xfs_buf_item_format_segment(bip, vecp, offset, - &bip->bli_formats[i]); + xfs_buf_item_format_segment(bip, &vecp, offset, + &bip->bli_formats[i]); offset += bp->b_maps[i].bm_len; } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 92e5f62eefc..ca354a82183 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -57,20 +57,18 @@ xfs_qm_dquot_logitem_size( STATIC void xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *logvec) + struct xfs_log_iovec *vecp) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - logvec->i_addr = &qlip->qli_format; - logvec->i_len = sizeof(xfs_dq_logformat_t); - logvec->i_type = XLOG_REG_TYPE_QFORMAT; - logvec++; - logvec->i_addr = &qlip->qli_dquot->q_core; - logvec->i_len = sizeof(xfs_disk_dquot_t); - logvec->i_type = XLOG_REG_TYPE_DQUOT; + xlog_copy_iovec(&vecp, XLOG_REG_TYPE_QFORMAT, + &qlip->qli_format, + sizeof(struct xfs_dq_logformat)); + xlog_copy_iovec(&vecp, XLOG_REG_TYPE_DQUOT, + &qlip->qli_dquot->q_core, + sizeof(struct xfs_disk_dquot)); qlip->qli_format.qlf_size = 2; - } /* @@ -304,15 +302,16 @@ xfs_qm_qoff_logitem_size( STATIC void xfs_qm_qoff_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_iovec *vecp) { struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); - log_vector->i_addr = &qflip->qql_format; - log_vector->i_len = sizeof(xfs_qoff_logitem_t); - log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; + xlog_copy_iovec(&vecp, XLOG_REG_TYPE_QUOTAOFF, + &qflip->qql_format, + sizeof(struct xfs_qoff_logitem)); + qflip->qql_format.qf_size = 1; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3680d04f973..08823ecbcd8 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -26,6 +26,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_extfree_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_efi_zone; @@ -101,7 +102,7 @@ xfs_efi_item_size( STATIC void xfs_efi_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_iovec *vecp) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); @@ -111,10 +112,9 @@ xfs_efi_item_format( efip->efi_format.efi_type = XFS_LI_EFI; efip->efi_format.efi_size = 1; - log_vector->i_addr = &efip->efi_format; - log_vector->i_len = xfs_efi_item_sizeof(efip); - log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; - ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t)); + xlog_copy_iovec(&vecp, XLOG_REG_TYPE_EFI_FORMAT, + &efip->efi_format, + xfs_efi_item_sizeof(efip)); } @@ -368,7 +368,7 @@ xfs_efd_item_size( STATIC void xfs_efd_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_iovec *vecp) { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); @@ -377,10 +377,9 @@ xfs_efd_item_format( efdp->efd_format.efd_type = XFS_LI_EFD; efdp->efd_format.efd_size = 1; - log_vector->i_addr = &efdp->efd_format; - log_vector->i_len = xfs_efd_item_sizeof(efdp); - log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; - ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t)); + xlog_copy_iovec(&vecp, XLOG_REG_TYPE_EFD_FORMAT, + &efdp->efd_format, + 
xfs_efd_item_sizeof(efdp)); } /* diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d2eaccfa73f..5751fa8580e 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -28,6 +28,7 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_icreate_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ @@ -58,13 +59,13 @@ xfs_icreate_item_size( STATIC void xfs_icreate_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_iovec *vecp) { struct xfs_icreate_item *icp = ICR_ITEM(lip); - log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; - log_vector->i_len = sizeof(struct xfs_icreate_log); - log_vector->i_type = XLOG_REG_TYPE_ICREATE; + xlog_copy_iovec(&vecp, XLOG_REG_TYPE_ICREATE, + &icp->ic_format, + sizeof(struct xfs_icreate_log)); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 2ad12dcf831..c75e14beff0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -30,6 +30,7 @@ #include "xfs_trace.h" #include "xfs_trans_priv.h" #include "xfs_dinode.h" +#include "xfs_log.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -159,14 +160,15 @@ xfs_inode_item_size( * here, so always use the physical fork size to determine the size of the * buffer we need to allocate. */ -STATIC void +STATIC int xfs_inode_item_format_extents( struct xfs_inode *ip, - struct xfs_log_iovec *vecp, + struct xfs_log_iovec **vecp, int whichfork, int type) { xfs_bmbt_rec_t *ext_buffer; + int len; ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); if (whichfork == XFS_DATA_FORK) @@ -174,9 +176,9 @@ xfs_inode_item_format_extents( else ip->i_itemp->ili_aextents_buf = ext_buffer; - vecp->i_addr = ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); - vecp->i_type = type; + len = xfs_iextents_copy(ip, ext_buffer, whichfork); + xlog_copy_iovec(vecp, type, ext_buffer, len); + return len; } /* @@ -207,10 +209,10 @@ xfs_inode_item_format_v1_inode( } } -STATIC struct xfs_log_iovec * +STATIC void xfs_inode_item_format_data_fork( struct xfs_inode_log_item *iip, - struct xfs_log_iovec *vecp, + struct xfs_log_iovec **vecp, int *nvecs) { struct xfs_inode *ip = iip->ili_inode; @@ -237,18 +239,18 @@ xfs_inode_item_format_data_fork( * extents, so just point to the * real extents array. 
*/ - vecp->i_addr = ip->i_df.if_u1.if_extents; - vecp->i_len = ip->i_df.if_bytes; - vecp->i_type = XLOG_REG_TYPE_IEXT; + xlog_copy_iovec(vecp, XLOG_REG_TYPE_IEXT, + ip->i_df.if_u1.if_extents, + ip->i_df.if_bytes); + iip->ili_format.ilf_dsize = ip->i_df.if_bytes; } else #endif { - xfs_inode_item_format_extents(ip, vecp, - XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); + iip->ili_format.ilf_dsize = + xfs_inode_item_format_extents(ip, vecp, + XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); + ASSERT(iip->ili_format.ilf_dsize <= ip->i_df.if_bytes); } - ASSERT(vecp->i_len <= ip->i_df.if_bytes); - iip->ili_format.ilf_dsize = vecp->i_len; - vecp++; (*nvecs)++; } else { iip->ili_fields &= ~XFS_ILOG_DEXT; @@ -262,10 +264,9 @@ xfs_inode_item_format_data_fork( if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); - vecp->i_addr = ip->i_df.if_broot; - vecp->i_len = ip->i_df.if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IBROOT; - vecp++; + xlog_copy_iovec(vecp, XLOG_REG_TYPE_IBROOT, + ip->i_df.if_broot, + ip->i_df.if_broot_bytes); (*nvecs)++; iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; } else { @@ -280,21 +281,18 @@ xfs_inode_item_format_data_fork( XFS_ILOG_DEV | XFS_ILOG_UUID); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); - - vecp->i_addr = ip->i_df.if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_df.if_bytes, 4); - ASSERT((ip->i_df.if_real_bytes == 0) || - (ip->i_df.if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_ILOCAL; - vecp++; + ASSERT(ip->i_df.if_real_bytes == 0 || + ip->i_df.if_real_bytes == data_bytes); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + xlog_copy_iovec(vecp, XLOG_REG_TYPE_ILOCAL, + ip->i_df.if_u1.if_data, data_bytes); (*nvecs)++; iip->ili_format.ilf_dsize = (unsigned)data_bytes; } else { @@ -323,14 +321,12 @@ xfs_inode_item_format_data_fork( ASSERT(0); break; } - - return vecp; } -STATIC struct xfs_log_iovec * +STATIC void xfs_inode_item_format_attr_fork( struct xfs_inode_log_item *iip, - struct xfs_log_iovec *vecp, + struct xfs_log_iovec **vecp, int *nvecs) { struct xfs_inode *ip = iip->ili_inode; @@ -352,16 +348,16 @@ xfs_inode_item_format_attr_fork( * There are not delayed allocation extents * for attributes, so just point at the array. 
*/ - vecp->i_addr = ip->i_afp->if_u1.if_extents; - vecp->i_len = ip->i_afp->if_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; + xlog_copy_iovec(vecp, XLOG_REG_TYPE_IATTR_EXT, + ip->i_afp->if_u1.if_extents, + ip->i_afp->if_bytes); + iip->ili_format.ilf_asize = ip->i_afp->if_bytes; #else ASSERT(iip->ili_aextents_buf == NULL); - xfs_inode_item_format_extents(ip, vecp, + iip->ili_format.ilf_asize = + xfs_inode_item_format_extents(ip, vecp, XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); #endif - iip->ili_format.ilf_asize = vecp->i_len; - vecp++; (*nvecs)++; } else { iip->ili_fields &= ~XFS_ILOG_AEXT; @@ -375,10 +371,9 @@ xfs_inode_item_format_attr_fork( ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); - vecp->i_addr = ip->i_afp->if_broot; - vecp->i_len = ip->i_afp->if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; - vecp++; + xlog_copy_iovec(vecp, XLOG_REG_TYPE_IATTR_BROOT, + ip->i_afp->if_broot, + ip->i_afp->if_broot_bytes); (*nvecs)++; iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; } else { @@ -391,20 +386,18 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); - - vecp->i_addr = ip->i_afp->if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_afp->if_bytes, 4); - ASSERT((ip->i_afp->if_real_bytes == 0) || - (ip->i_afp->if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; - vecp++; + ASSERT(ip->i_afp->if_real_bytes == 0 || + ip->i_afp->if_real_bytes == data_bytes); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + xlog_copy_iovec(vecp, XLOG_REG_TYPE_IATTR_LOCAL, + ip->i_afp->if_u1.if_data, + data_bytes); (*nvecs)++; iip->ili_format.ilf_asize = (unsigned)data_bytes; } else { @@ -415,8 +408,6 @@ xfs_inode_item_format_attr_fork( ASSERT(0); break; } - - return vecp; } /* @@ -435,24 +426,22 @@ xfs_inode_item_format( struct xfs_inode *ip = iip->ili_inode; uint nvecs; - vecp->i_addr = &iip->ili_format; - vecp->i_len = sizeof(xfs_inode_log_format_t); - vecp->i_type = XLOG_REG_TYPE_IFORMAT; - vecp++; - nvecs = 1; + xlog_copy_iovec(&vecp, XLOG_REG_TYPE_IFORMAT, + &iip->ili_format, + sizeof(struct xfs_inode_log_format)); + nvecs = 1; - vecp->i_addr = &ip->i_d; - vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); - vecp->i_type = XLOG_REG_TYPE_ICORE; - vecp++; + xlog_copy_iovec(&vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); nvecs++; if (ip->i_d.di_version == 1) xfs_inode_item_format_v1_inode(ip); - vecp = xfs_inode_item_format_data_fork(iip, vecp, &nvecs); + xfs_inode_item_format_data_fork(iip, &vecp, &nvecs); if (XFS_IFORK_Q(ip)) { - vecp = xfs_inode_item_format_attr_fork(iip, vecp, &nvecs); + xfs_inode_item_format_attr_fork(iip, &vecp, &nvecs); } else { iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index e148719e0a5..384c6c46966 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -30,6 +30,19 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) +static inline void * +xlog_copy_iovec(struct xfs_log_iovec **vecp, uint type, void *data, int len) +{ + struct xfs_log_iovec *vec = *vecp; + + vec->i_type = type; + vec->i_addr = data; + vec->i_len = len; + + *vecp = vec + 1; + return vec->i_addr; +} + /* * Structure used to pass callback function and the function's argument * to the log manager. 
-- cgit v1.2.3-70-g09d2 From bde7cff67c39227c6ad503394e19e58debdbc5e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:34:02 +1100 Subject: xfs: format log items write directly into the linear CIL buffer Instead of setting up pointers to memory locations in iop_format which then get copied into the CIL linear buffer after return move the copy into the individual inode items. This avoids the need to always have a memory block in the exact same layout that gets written into the log around, and allow the log items to be much more flexible in their in-memory layouts. The only caveat is that we need to properly align the data for each iovec so that don't have structures misaligned in subsequent iovecs. Note that all log item format routines now need to be careful to modify the copy of the item that was placed into the CIL after calls to xlog_copy_iovec instead of the in-memory copy. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf_item.c | 29 ++++++++------- fs/xfs/xfs_dquot_item.c | 19 +++++----- fs/xfs/xfs_extfree_item.c | 10 +++--- fs/xfs/xfs_icreate_item.c | 5 +-- fs/xfs/xfs_inode_item.c | 92 +++++++++++++++++++++++------------------------ fs/xfs/xfs_log.h | 41 ++++++++++++++++++--- fs/xfs/xfs_log_cil.c | 41 +++++---------------- fs/xfs/xfs_trans.h | 2 +- 8 files changed, 125 insertions(+), 114 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index d49419d4bb4..76411730543 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -184,6 +184,7 @@ xfs_buf_item_size( static inline void xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, struct xfs_buf *bp, uint offset, @@ -191,7 +192,7 @@ xfs_buf_item_copy_iovec( uint nbits) { offset += first_bit * XFS_BLF_CHUNK; - xlog_copy_iovec(vecp, XLOG_REG_TYPE_BCHUNK, + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, xfs_buf_offset(bp, offset), nbits * XFS_BLF_CHUNK); } @@ -211,13 +212,13 @@ xfs_buf_item_straddle( static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, + struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint offset, struct xfs_buf_log_format *blfp) { struct xfs_buf *bp = bip->bli_buf; uint base_size; - uint nvecs; int first_bit; int last_bit; int next_bit; @@ -233,18 +234,17 @@ xfs_buf_item_format_segment( */ base_size = xfs_buf_log_format_size(blfp); - nvecs = 0; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { /* * If the map is not be dirty in the transaction, mark * the size as zero and do not advance the vector pointer. */ - goto out; + return; } - xlog_copy_iovec(vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); - nvecs = 1; + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; if (bip->bli_flags & XFS_BLI_STALE) { /* @@ -254,7 +254,7 @@ xfs_buf_item_format_segment( */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - goto out; + return; } @@ -280,15 +280,15 @@ xfs_buf_item_format_segment( * same set of bits so just keep counting and scanning. 
*/ if (next_bit == -1) { - xfs_buf_item_copy_iovec(vecp, bp, offset, + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); - nvecs++; + blfp->blf_size++; break; } else if (next_bit != last_bit + 1 || xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { - xfs_buf_item_copy_iovec(vecp, bp, offset, + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); - nvecs++; + blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; @@ -297,8 +297,6 @@ xfs_buf_item_format_segment( nbits++; } } -out: - blfp->blf_size = nvecs; } /* @@ -310,10 +308,11 @@ out: STATIC void xfs_buf_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; uint offset = 0; int i; @@ -354,7 +353,7 @@ xfs_buf_item_format( } for (i = 0; i < bip->bli_format_count; i++) { - xfs_buf_item_format_segment(bip, &vecp, offset, + xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); offset += bp->b_maps[i].bm_len; } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index ca354a82183..946d588070b 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -57,18 +57,19 @@ xfs_qm_dquot_logitem_size( STATIC void xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; - xlog_copy_iovec(&vecp, XLOG_REG_TYPE_QFORMAT, + qlip->qli_format.qlf_size = 2; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT, &qlip->qli_format, sizeof(struct xfs_dq_logformat)); - xlog_copy_iovec(&vecp, XLOG_REG_TYPE_DQUOT, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &qlip->qli_dquot->q_core, sizeof(struct xfs_disk_dquot)); - - qlip->qli_format.qlf_size = 2; } /* @@ -302,17 +303,17 @@ xfs_qm_qoff_logitem_size( STATIC void xfs_qm_qoff_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); + qflip->qql_format.qf_size = 1; - xlog_copy_iovec(&vecp, XLOG_REG_TYPE_QUOTAOFF, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF, &qflip->qql_format, sizeof(struct xfs_qoff_logitem)); - - qflip->qql_format.qf_size = 1; } /* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 08823ecbcd8..fb7a4c1ce1c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -102,9 +102,10 @@ xfs_efi_item_size( STATIC void xfs_efi_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(atomic_read(&efip->efi_next_extent) == efip->efi_format.efi_nextents); @@ -112,7 +113,7 @@ xfs_efi_item_format( efip->efi_format.efi_type = XFS_LI_EFI; efip->efi_format.efi_size = 1; - xlog_copy_iovec(&vecp, XLOG_REG_TYPE_EFI_FORMAT, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format, xfs_efi_item_sizeof(efip)); } @@ -368,16 +369,17 @@ xfs_efd_item_size( STATIC void xfs_efd_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); efdp->efd_format.efd_type = XFS_LI_EFD; 
efdp->efd_format.efd_size = 1; - xlog_copy_iovec(&vecp, XLOG_REG_TYPE_EFD_FORMAT, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format, xfs_efd_item_sizeof(efdp)); } diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 5751fa8580e..7e454923325 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -59,11 +59,12 @@ xfs_icreate_item_size( STATIC void xfs_icreate_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_icreate_item *icp = ICR_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; - xlog_copy_iovec(&vecp, XLOG_REG_TYPE_ICREATE, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, &icp->ic_format, sizeof(struct xfs_icreate_log)); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c75e14beff0..6ab318f80c9 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -163,6 +163,7 @@ xfs_inode_item_size( STATIC int xfs_inode_item_format_extents( struct xfs_inode *ip, + struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, int whichfork, int type) @@ -177,7 +178,7 @@ xfs_inode_item_format_extents( ip->i_itemp->ili_aextents_buf = ext_buffer; len = xfs_iextents_copy(ip, ext_buffer, whichfork); - xlog_copy_iovec(vecp, type, ext_buffer, len); + xlog_copy_iovec(lv, vecp, type, ext_buffer, len); return len; } @@ -212,8 +213,9 @@ xfs_inode_item_format_v1_inode( STATIC void xfs_inode_item_format_data_fork( struct xfs_inode_log_item *iip, - struct xfs_log_iovec **vecp, - int *nvecs) + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) { struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; @@ -239,19 +241,19 @@ xfs_inode_item_format_data_fork( * extents, so just point to the * real extents array. 
*/ - xlog_copy_iovec(vecp, XLOG_REG_TYPE_IEXT, + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IEXT, ip->i_df.if_u1.if_extents, ip->i_df.if_bytes); - iip->ili_format.ilf_dsize = ip->i_df.if_bytes; + ilf->ilf_dsize = ip->i_df.if_bytes; } else #endif { - iip->ili_format.ilf_dsize = - xfs_inode_item_format_extents(ip, vecp, + ilf->ilf_dsize = + xfs_inode_item_format_extents(ip, lv, vecp, XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); ASSERT(iip->ili_format.ilf_dsize <= ip->i_df.if_bytes); } - (*nvecs)++; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DEXT; } @@ -264,11 +266,11 @@ xfs_inode_item_format_data_fork( if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); - xlog_copy_iovec(vecp, XLOG_REG_TYPE_IBROOT, + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT, ip->i_df.if_broot, ip->i_df.if_broot_bytes); - (*nvecs)++; - iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_size++; } else { ASSERT(!(iip->ili_fields & XFS_ILOG_DBROOT)); @@ -291,10 +293,10 @@ xfs_inode_item_format_data_fork( ip->i_df.if_real_bytes == data_bytes); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); - xlog_copy_iovec(vecp, XLOG_REG_TYPE_ILOCAL, + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, ip->i_df.if_u1.if_data, data_bytes); - (*nvecs)++; - iip->ili_format.ilf_dsize = (unsigned)data_bytes; + ilf->ilf_dsize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; } @@ -303,19 +305,15 @@ xfs_inode_item_format_data_fork( iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_UUID); - if (iip->ili_fields & XFS_ILOG_DEV) { - iip->ili_format.ilf_u.ilfu_rdev = - ip->i_df.if_u2.if_rdev; - } + if (iip->ili_fields & XFS_ILOG_DEV) + ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; break; case XFS_DINODE_FMT_UUID: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_DEV); - if (iip->ili_fields & XFS_ILOG_UUID) { - iip->ili_format.ilf_u.ilfu_uuid = - ip->i_df.if_u2.if_uuid; - } + if (iip->ili_fields & XFS_ILOG_UUID) + ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; break; default: ASSERT(0); @@ -326,8 +324,9 @@ xfs_inode_item_format_data_fork( STATIC void xfs_inode_item_format_attr_fork( struct xfs_inode_log_item *iip, - struct xfs_log_iovec **vecp, - int *nvecs) + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) { struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; @@ -348,17 +347,17 @@ xfs_inode_item_format_attr_fork( * There are not delayed allocation extents * for attributes, so just point at the array. 
*/ - xlog_copy_iovec(vecp, XLOG_REG_TYPE_IATTR_EXT, + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT, ip->i_afp->if_u1.if_extents, ip->i_afp->if_bytes); - iip->ili_format.ilf_asize = ip->i_afp->if_bytes; + ilf->ilf_asize = ip->i_afp->if_bytes; #else ASSERT(iip->ili_aextents_buf == NULL); - iip->ili_format.ilf_asize = - xfs_inode_item_format_extents(ip, vecp, + ilf->ilf_asize = + xfs_inode_item_format_extents(ip, lv, vecp, XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); #endif - (*nvecs)++; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_AEXT; } @@ -371,11 +370,11 @@ xfs_inode_item_format_attr_fork( ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); - xlog_copy_iovec(vecp, XLOG_REG_TYPE_IATTR_BROOT, + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, ip->i_afp->if_broot, ip->i_afp->if_broot_bytes); - (*nvecs)++; - iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; } @@ -395,11 +394,11 @@ xfs_inode_item_format_attr_fork( ASSERT(ip->i_afp->if_real_bytes == 0 || ip->i_afp->if_real_bytes == data_bytes); ASSERT(ip->i_afp->if_u1.if_data != NULL); - xlog_copy_iovec(vecp, XLOG_REG_TYPE_IATTR_LOCAL, + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, ip->i_afp->if_u1.if_data, data_bytes); - (*nvecs)++; - iip->ili_format.ilf_asize = (unsigned)data_bytes; + ilf->ilf_asize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; } @@ -420,28 +419,28 @@ xfs_inode_item_format_attr_fork( STATIC void xfs_inode_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - uint nvecs; + struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; - xlog_copy_iovec(&vecp, XLOG_REG_TYPE_IFORMAT, + ilf = xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT, &iip->ili_format, sizeof(struct xfs_inode_log_format)); - nvecs = 1; - - xlog_copy_iovec(&vecp, XLOG_REG_TYPE_ICORE, - &ip->i_d, - xfs_icdinode_size(ip->i_d.di_version)); - nvecs++; + ilf->ilf_size = 1; if (ip->i_d.di_version == 1) xfs_inode_item_format_v1_inode(ip); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); + ilf->ilf_size++; - xfs_inode_item_format_data_fork(iip, &vecp, &nvecs); + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); if (XFS_IFORK_Q(ip)) { - xfs_inode_item_format_attr_fork(iip, &vecp, &nvecs); + xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); } else { iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); @@ -455,7 +454,6 @@ xfs_inode_item_format( */ iip->ili_format.ilf_fields = XFS_ILOG_CORE | (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); - iip->ili_format.ilf_size = nvecs; } /* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 384c6c46966..b0f4ef77fa7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -31,18 +31,51 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) static inline void * -xlog_copy_iovec(struct xfs_log_iovec **vecp, uint type, void *data, int len) +xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type) { struct xfs_log_iovec *vec = *vecp; + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + vec->i_type = type; - vec->i_addr = data; - vec->i_len = len; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + ASSERT(IS_ALIGNED((unsigned 
long)vec->i_addr, sizeof(uint64_t))); - *vecp = vec + 1; + *vecp = vec; return vec->i_addr; } +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +{ + /* + * We need to make sure the next buffer is naturally aligned for the + * biggest basic data type we put into it. We already accounted for + * this when sizing the buffer. + */ + lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + vec->i_len = len; +} + +static inline void * +xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type, void *data, int len) +{ + void *buf; + + buf = xlog_prepare_iovec(lv, vecp, type); + memcpy(buf, data, len); + xlog_finish_iovec(lv, *vecp, len); + return buf; +} + /* * Structure used to pass callback function and the function's argument * to the log manager. diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 0a7a8cef601..cdebd832c3d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -82,36 +82,6 @@ xlog_cil_init_post_recovery( log->l_curr_block); } -STATIC int -xlog_cil_lv_item_format( - struct xfs_log_item *lip, - struct xfs_log_vec *lv) -{ - int index; - char *ptr; - - /* format new vectors into array */ - lip->li_ops->iop_format(lip, lv->lv_iovecp); - - /* copy data into existing array */ - ptr = lv->lv_buf; - for (index = 0; index < lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; - - memcpy(ptr, vec->i_addr, vec->i_len); - vec->i_addr = ptr; - ptr += vec->i_len; - } - - /* - * some size calculations for log vectors over-estimate, so the caller - * doesn't know the amount of space actually used by the item. Return - * the byte count to the caller so they can check and store it - * appropriately. - */ - return ptr - lv->lv_buf; -} - /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -232,6 +202,13 @@ xlog_cil_insert_format_items( nbytes = 0; } + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. 
+ */ + nbytes += niovecs * sizeof(uint64_t); + /* grab the old item if it exists for reservation accounting */ old_lv = lip->li_lv; @@ -272,9 +249,9 @@ xlog_cil_insert_format_items( lv->lv_niovecs = niovecs; /* The allocated data region lies beyond the iovec region */ + lv->lv_buf_len = 0; lv->lv_buf = (char *)lv + buf_size - nbytes; - - lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); + lip->li_ops->iop_format(lip, lv); insert: ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9b96d35e483..b5bc1ab3c4d 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -64,7 +64,7 @@ typedef struct xfs_log_item { struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); - void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); -- cgit v1.2.3-70-g09d2 From da7765031de15273d370d18a5354e1d8001ce2a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:34:04 +1100 Subject: xfs: format logged extents directly into the CIL With the new iop_format scheme there is no need to have a temporary buffer to format logged extents into, we can do so directly into the CIL. This also allows to remove the shortcut for big endian systems that probably hasn't gotten a lot of test coverage for a long time. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode_fork.c | 15 ++++--- fs/xfs/xfs_inode_item.c | 113 ++++++++---------------------------------------- fs/xfs/xfs_inode_item.h | 4 -- 3 files changed, 26 insertions(+), 106 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c index cfee14a83cf..06abaeef171 100644 --- a/fs/xfs/xfs_inode_fork.c +++ b/fs/xfs/xfs_inode_fork.c @@ -721,15 +721,16 @@ xfs_idestroy_fork( } /* - * xfs_iextents_copy() + * Convert in-core extents to on-disk form * - * This is called to copy the REAL extents (as opposed to the delayed - * allocation extents) from the inode into the given buffer. It - * returns the number of bytes copied into the buffer. + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. * - * If there are no delayed allocation extents, then we can just - * memcpy() the extents into the buffer. Otherwise, we need to - * examine each extent in turn and skip those which are delayed. + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only copy on-disk extents + * here, so callers must always use the physical fork size to determine the + * size of the buffer passed to this routine. We will return the size actually + * used. */ int xfs_iextents_copy( diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6ab318f80c9..45224d28049 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -145,43 +145,6 @@ xfs_inode_item_size( xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } -/* - * xfs_inode_item_format_extents - convert in-core extents to on-disk form - * - * For either the data or attr fork in extent format, we need to endian convert - * the in-core extent as we place them into the on-disk inode. 
In this case, we - * need to do this conversion before we write the extents into the log. Because - * we don't have the disk inode to write into here, we allocate a buffer and - * format the extents into it via xfs_iextents_copy(). We free the buffer in - * the unlock routine after the copy for the log has been made. - * - * In the case of the data fork, the in-core and on-disk fork sizes can be - * different due to delayed allocation extents. We only log on-disk extents - * here, so always use the physical fork size to determine the size of the - * buffer we need to allocate. - */ -STATIC int -xfs_inode_item_format_extents( - struct xfs_inode *ip, - struct xfs_log_vec *lv, - struct xfs_log_iovec **vecp, - int whichfork, - int type) -{ - xfs_bmbt_rec_t *ext_buffer; - int len; - - ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); - if (whichfork == XFS_DATA_FORK) - ip->i_itemp->ili_extents_buf = ext_buffer; - else - ip->i_itemp->ili_aextents_buf = ext_buffer; - - len = xfs_iextents_copy(ip, ext_buffer, whichfork); - xlog_copy_iovec(lv, vecp, type, ext_buffer, len); - return len; -} - /* * If this is a v1 format inode, then we need to log it as such. This means * that we have to copy the link count from the new field to the old. We @@ -229,30 +192,18 @@ xfs_inode_item_format_data_fork( if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && ip->i_df.if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); - ASSERT(iip->ili_extents_buf == NULL); - -#ifdef XFS_NATIVE_HOST - if (ip->i_d.di_nextents == ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)) { - /* - * There are no delayed allocation - * extents, so just point to the - * real extents array. - */ - xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IEXT, - ip->i_df.if_u1.if_extents, - ip->i_df.if_bytes); - ilf->ilf_dsize = ip->i_df.if_bytes; - } else -#endif - { - ilf->ilf_dsize = - xfs_inode_item_format_extents(ip, lv, vecp, - XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); - ASSERT(iip->ili_format.ilf_dsize <= ip->i_df.if_bytes); - } + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ASSERT(data_bytes <= ip->i_df.if_bytes); + + ilf->ilf_dsize = data_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DEXT; @@ -339,24 +290,17 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_AEXT) && ip->i_d.di_anextents > 0 && ip->i_afp->if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); -#ifdef XFS_NATIVE_HOST - /* - * There are not delayed allocation extents - * for attributes, so just point at the array. 
- */ - xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT, - ip->i_afp->if_u1.if_extents, - ip->i_afp->if_bytes); - ilf->ilf_asize = ip->i_afp->if_bytes; -#else - ASSERT(iip->ili_aextents_buf == NULL); - ilf->ilf_asize = - xfs_inode_item_format_extents(ip, lv, vecp, - XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); -#endif + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ilf->ilf_asize = data_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_AEXT; @@ -571,27 +515,6 @@ xfs_inode_item_unlock( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - /* - * If the inode needed a separate buffer with which to log - * its extents, then free it now. - */ - if (iip->ili_extents_buf != NULL) { - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_DEXT); - ASSERT(ip->i_df.if_bytes > 0); - kmem_free(iip->ili_extents_buf); - iip->ili_extents_buf = NULL; - } - if (iip->ili_aextents_buf != NULL) { - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_anextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_AEXT); - ASSERT(ip->i_afp->if_bytes > 0); - kmem_free(iip->ili_aextents_buf); - iip->ili_aextents_buf = NULL; - } - lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; if (lock_flags) diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index dce4d656768..29b5f2b6533 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -34,10 +34,6 @@ typedef struct xfs_inode_log_item { unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - struct xfs_bmbt_rec *ili_extents_buf; /* array of logged - data exts */ - struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged - attr exts */ xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; -- cgit v1.2.3-70-g09d2 From 2f251293b09065118d78ae4e883e5639cc22f94e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:34:05 +1100 Subject: xfs: remove the inode log format from the inode log item No need to keep the inode log format around all the time, we can easily generate it at iop_format time. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode_item.c | 29 +++++++++++------------------ fs/xfs/xfs_inode_item.h | 1 - 2 files changed, 11 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 45224d28049..686889b4a1e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -370,17 +370,21 @@ xfs_inode_item_format( struct xfs_inode_log_format *ilf; struct xfs_log_iovec *vecp = NULL; - ilf = xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT, - &iip->ili_format, - sizeof(struct xfs_inode_log_format)); - ilf->ilf_size = 1; + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); + ilf->ilf_type = XFS_LI_INODE; + ilf->ilf_ino = ip->i_ino; + ilf->ilf_blkno = ip->i_imap.im_blkno; + ilf->ilf_len = ip->i_imap.im_len; + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); if (ip->i_d.di_version == 1) xfs_inode_item_format_v1_inode(ip); xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, &ip->i_d, xfs_icdinode_size(ip->i_d.di_version)); - ilf->ilf_size++; xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); if (XFS_IFORK_Q(ip)) { @@ -390,14 +394,8 @@ xfs_inode_item_format( ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); } - /* - * Now update the log format that goes out to disk from the in-core - * values. We always write the inode core to make the arithmetic - * games in recovery easier, which isn't a big deal as just about any - * transaction would dirty it anyway. - */ - iip->ili_format.ilf_fields = XFS_ILOG_CORE | - (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); + /* update the format with the exact fields we actually logged */ + ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); } /* @@ -601,11 +599,6 @@ xfs_inode_item_init( iip->ili_inode = ip; xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, &xfs_inode_item_ops); - iip->ili_format.ilf_type = XFS_LI_INODE; - iip->ili_format.ilf_ino = ip->i_ino; - iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; - iip->ili_format.ilf_len = ip->i_imap.im_len; - iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 29b5f2b6533..488d81254e2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -34,7 +34,6 @@ typedef struct xfs_inode_log_item { unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; static inline int xfs_inode_clean(xfs_inode_t *ip) -- cgit v1.2.3-70-g09d2 From ce8e962939ca12218092f8eb3c8cfb196cd8cc51 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:34:07 +1100 Subject: xfs: remove the dquot log format from the dquot log item No need to keep the dquot log format around all the time, we can easily generate it at iop_format time. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_dquot_item.c | 25 +++++++++---------------- fs/xfs/xfs_dquot_item.h | 1 - 2 files changed, 9 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 946d588070b..d4fffa90036 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -61,12 +61,17 @@ xfs_qm_dquot_logitem_format( { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_log_iovec *vecp = NULL; + struct xfs_dq_logformat *qlf; - qlip->qli_format.qlf_size = 2; + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); + qlf->qlf_type = XFS_LI_DQUOT; + qlf->qlf_size = 2; + qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_blkno = qlip->qli_dquot->q_blkno; + qlf->qlf_len = 1; + qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT, - &qlip->qli_format, - sizeof(struct xfs_dq_logformat)); xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &qlip->qli_dquot->q_core, sizeof(struct xfs_disk_dquot)); @@ -256,18 +261,6 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); lp->qli_dquot = dqp; - lp->qli_format.qlf_type = XFS_LI_DQUOT; - lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); - lp->qli_format.qlf_blkno = dqp->q_blkno; - lp->qli_format.qlf_len = 1; - /* - * This is just the offset of this dquot within its buffer - * (which is currently 1 FSB and probably won't change). - * Hence 32 bits for this offset should be just fine. - * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) - * here, and recompute it at recovery time. - */ - lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; } /*------------------ QUOTAOFF LOG ITEMS -------------------*/ diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5acae2ada70..925cbe948c6 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -27,7 +27,6 @@ typedef struct xfs_dq_logitem { xfs_log_item_t qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ - xfs_dq_logformat_t qli_format; /* logged structure */ } xfs_dq_logitem_t; typedef struct xfs_qoff_logitem { -- cgit v1.2.3-70-g09d2 From ffda4e83aa107ff55345dc583efdb24fca486fb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2013 11:34:08 +1100 Subject: xfs: remove the quotaoff log format from the quotaoff log item This one doesn't save a whole lot of memory, but still makes the code simpler. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_dquot_item.c | 22 +++++++--------------- fs/xfs/xfs_dquot_item.h | 2 +- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index d4fffa90036..f33fbaaa4d8 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -286,13 +286,6 @@ xfs_qm_qoff_logitem_size( *nbytes += sizeof(struct xfs_qoff_logitem); } -/* - * This is called to fill in the vector of log iovecs for the - * given quotaoff log item. We use only 1 iovec, and we point that - * at the quotaoff_log_format structure embedded in the quotaoff item. - * It is at this point that we assert that all of the extent - * slots in the quotaoff item have been filled. 
- */ STATIC void xfs_qm_qoff_logitem_format( struct xfs_log_item *lip, @@ -300,13 +293,13 @@ xfs_qm_qoff_logitem_format( { struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); struct xfs_log_iovec *vecp = NULL; + struct xfs_qoff_logformat *qlf; - ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); - qflip->qql_format.qf_size = 1; - - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF, - &qflip->qql_format, - sizeof(struct xfs_qoff_logitem)); + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); + qlf->qf_type = XFS_LI_QUOTAOFF; + qlf->qf_size = 1; + qlf->qf_flags = qflip->qql_flags; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); } /* @@ -446,8 +439,7 @@ xfs_qm_qoff_logitem_init( xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; - qf->qql_format.qf_type = XFS_LI_QUOTAOFF; - qf->qql_format.qf_flags = flags; qf->qql_start_lip = start; + qf->qql_flags = flags; return qf; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 925cbe948c6..502e9464634 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -32,7 +32,7 @@ typedef struct xfs_dq_logitem { typedef struct xfs_qoff_logitem { xfs_log_item_t qql_item; /* common portion */ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - xfs_qoff_logformat_t qql_format; /* logged structure */ + unsigned int qql_flags; } xfs_qoff_logitem_t; -- cgit v1.2.3-70-g09d2 From 717834383c6ad2173323b823b97c521c9fb8fbbb Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 13 Dec 2013 15:51:46 +1100 Subject: xfs: get rid of XFS_IALLOC_INODES macros Get rid of XFS_IALLOC_INODES() marcos, use mp->m_ialloc_inos directly. Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ialloc.c | 12 ++++++------ fs/xfs/xfs_ialloc.h | 1 - fs/xfs/xfs_log_recover.c | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 7a728f9fc0b..2f87a53f74f 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -225,7 +225,7 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), + xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, mp->m_sb.sb_inodesize, length, gen); } else if (xfs_sb_version_hasnlink(&mp->m_sb)) version = 2; @@ -329,7 +329,7 @@ xfs_ialloc_ag_alloc( * Locking will ensure that we don't have two callers in here * at one time. */ - newlen = XFS_IALLOC_INODES(args.mp); + newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) return XFS_ERROR(ENOSPC); @@ -999,7 +999,7 @@ xfs_dialloc( * inode. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1202,7 +1202,7 @@ xfs_difree( * When an inode cluster is free, it becomes eligible for removal */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + (rec.ir_freecount == mp->m_ialloc_inos)) { *delete = 1; *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); @@ -1212,7 +1212,7 @@ xfs_difree( * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. 
*/ - ilen = XFS_IALLOC_INODES(mp); + ilen = mp->m_ialloc_inos; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1311,7 +1311,7 @@ xfs_imap_lookup( /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || - rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + rec.ir_startino + mp->m_ialloc_inos <= agino) return EINVAL; /* for untrusted inodes check it is allocated first */ diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index a8f76a5ff41..4689b025f79 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -28,7 +28,6 @@ struct xfs_btree_cur; /* * Allocation parameters for inode allocation. */ -#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos #define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 07ab52ca8ab..f76de2eadb6 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3208,9 +3208,9 @@ xlog_recover_do_icreate_pass2( } /* existing allocation is fixed value */ - ASSERT(count == XFS_IALLOC_INODES(mp)); + ASSERT(count == mp->m_ialloc_inos); ASSERT(length == XFS_IALLOC_BLOCKS(mp)); - if (count != XFS_IALLOC_INODES(mp) || + if (count != mp->m_ialloc_inos || length != XFS_IALLOC_BLOCKS(mp)) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); return EINVAL; -- cgit v1.2.3-70-g09d2 From 0f49efd805229fc747761213ec820c1ba3ab64db Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 13 Dec 2013 15:51:48 +1100 Subject: xfs: get rid of XFS_INODE_CLUSTER_SIZE macros Get rid of XFS_INODE_CLUSTER_SIZE() macros, use mp->m_inode_cluster_size directly. Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ialloc.c | 10 +++++----- fs/xfs/xfs_ialloc.h | 5 +---- fs/xfs/xfs_inode.c | 10 +++++----- fs/xfs/xfs_itable.c | 4 ++-- fs/xfs/xfs_log_recover.c | 8 ++++---- fs/xfs/xfs_trans_resv.c | 2 +- 6 files changed, 18 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 2f87a53f74f..3ac5eb6bf85 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -52,7 +52,7 @@ xfs_ialloc_cluster_alignment( { if (xfs_sb_version_hasalign(&args->mp->m_sb) && args->mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) return args->mp->m_sb.sb_inoalignmt; return 1; } @@ -181,12 +181,12 @@ xfs_ialloc_inode_init( * For small block sizes, manipulate the inodes in buffers * which are multiples of the blocks size. */ - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) { blks_per_cluster = 1; nbufs = length; ninodes = mp->m_sb.sb_inopblock; } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / + blks_per_cluster = mp->m_inode_cluster_size / mp->m_sb.sb_blocksize; nbufs = length / blks_per_cluster; ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; @@ -1384,7 +1384,7 @@ xfs_imap( return XFS_ERROR(EINVAL); } - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; /* * For bulkstat and handle lookups, we have an untrusted inode number @@ -1405,7 +1405,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. 
*/ - if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { + if (mp->m_inode_cluster_size <= mp->m_sb.sb_blocksize) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 4689b025f79..98ce63e35bd 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -30,11 +30,8 @@ struct xfs_btree_cur; */ #define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks -/* - * Move inodes in clusters of this size. - */ +/* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 -#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size /* * Make an inode pointer out of the buffer/offset. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 001aa893ed5..c1642c9a7e6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2152,13 +2152,13 @@ xfs_ifree_cluster( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) { blks_per_cluster = 1; ninodes = mp->m_sb.sb_inopblock; nbufs = XFS_IALLOC_BLOCKS(mp); } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; + blks_per_cluster = mp->m_inode_cluster_size / + mp->m_sb.sb_blocksize; ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; } @@ -2906,13 +2906,13 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) goto out_put; - mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c237ad15d50..0571012f67c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -243,9 +243,9 @@ xfs_bulkstat( *done = 0; fmterror = 0; ubufp = ubuffer; - nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? + nicluster = mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size ? mp->m_sb.sb_inopblock : - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); + (mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog); nimask = ~(nicluster - 1); nbcluster = nicluster >> mp->m_sb.sb_inopblog; irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index f76de2eadb6..42458ab7a33 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2523,19 +2523,19 @@ xlog_recover_buffer_pass2( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. 
Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + (__uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index 2fd59c0dae6..51c181dde11 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -385,7 +385,7 @@ xfs_calc_ifree_reservation( xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) + xfs_calc_buf_res(1, 0) + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels, 0) + -- cgit v1.2.3-70-g09d2 From 126cd105d4408ff52437a72d681aecdb29cc80e8 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 13 Dec 2013 15:51:48 +1100 Subject: xfs: get rid of XFS_IALLOC_BLOCKS macros Get rid of XFS_IALLOC_BLOCKS() marcos, use mp->m_ialloc_blks directly. Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ialloc.c | 12 ++++++------ fs/xfs/xfs_ialloc.h | 5 ----- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_log_recover.c | 4 ++-- fs/xfs/xfs_trans_resv.c | 8 ++++---- fs/xfs/xfs_trans_space.h | 2 +- 6 files changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 3ac5eb6bf85..eacc59c7cb8 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -333,7 +333,7 @@ xfs_ialloc_ag_alloc( if (args.mp->m_maxicount && args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) return XFS_ERROR(ENOSPC); - args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill @@ -343,7 +343,7 @@ xfs_ialloc_ag_alloc( newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.mp->m_ialloc_blks; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -585,7 +585,7 @@ xfs_ialloc_ag_select( * Is there enough free space for the file plus a block of * inodes? (if we need to allocate some)? 
*/ - ineed = XFS_IALLOC_BLOCKS(mp); + ineed = mp->m_ialloc_blks; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -1228,9 +1228,9 @@ xfs_difree( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, - agno, XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), - XFS_IALLOC_BLOCKS(mp), flist, mp); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), + mp->m_ialloc_blks, flist, mp); } else { *delete = 0; diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 98ce63e35bd..d86f8bdc963 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -25,11 +25,6 @@ struct xfs_mount; struct xfs_trans; struct xfs_btree_cur; -/* - * Allocation parameters for inode allocation. - */ -#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks - /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c1642c9a7e6..0c8c334f0f1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2155,12 +2155,12 @@ xfs_ifree_cluster( if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) { blks_per_cluster = 1; ninodes = mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp); + nbufs = mp->m_ialloc_blks; } else { blks_per_cluster = mp->m_inode_cluster_size / mp->m_sb.sb_blocksize; ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; + nbufs = mp->m_ialloc_blks / blks_per_cluster; } for (j = 0; j < nbufs; j++, inum += ninodes) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 42458ab7a33..22b6f35765c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3209,9 +3209,9 @@ xlog_recover_do_icreate_pass2( /* existing allocation is fixed value */ ASSERT(count == mp->m_ialloc_inos); - ASSERT(length == XFS_IALLOC_BLOCKS(mp)); + ASSERT(length == mp->m_ialloc_blks); if (count != mp->m_ialloc_inos || - length != XFS_IALLOC_BLOCKS(mp)) { + length != mp->m_ialloc_blks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); return EINVAL; } diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index 51c181dde11..2ffd3e331b4 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -174,7 +174,7 @@ xfs_calc_itruncate_reservation( xfs_calc_buf_res(5, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0))); } @@ -282,7 +282,7 @@ xfs_calc_create_resv_modify( * For create we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode blocks allocated: mp->m_ialloc_blks * blocksize * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size */ @@ -292,7 +292,7 @@ xfs_calc_create_resv_alloc( { return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)); @@ -387,7 +387,7 @@ xfs_calc_ifree_reservation( xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) + xfs_calc_buf_res(1, 0) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) 
+ + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)); diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index 7d2c920dfb9..af5dbe06cb6 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -47,7 +47,7 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) + ((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1) /* * Space reservation values for various transactions. -- cgit v1.2.3-70-g09d2 From 904957b75033149509dc0fecc0af34348f87c78c Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 13 Dec 2013 15:51:48 +1100 Subject: xfs: introduce a common helper xfs_icluster_size_fsb Introduce a common routine xfs_icluster_size_fsb() to calculate and return the number of file system blocks per inode cluster. Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ialloc.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index d86f8bdc963..812365d17e6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -28,6 +28,16 @@ struct xfs_btree_cur; /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 +/* Calculate and return the number of filesystem blocks per inode cluster */ +static inline int +xfs_icluster_size_fsb( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) + return 1; + return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; +} + /* * Make an inode pointer out of the buffer/offset. */ -- cgit v1.2.3-70-g09d2 From a2ba07b2d2215bed560aa67c84275304314d9691 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 13 Dec 2013 15:51:48 +1100 Subject: xfs: use xfs_icluster_size_fsb in xfs_bulkstat Use xfs_icluster_size_fsb() in xfs_bulkstat(), make the related variables more meaningful and remove an unused variable nimask from it. Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 0571012f67c..f4633828515 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -209,9 +209,8 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino; /* last inode number returned */ - int nbcluster; /* # of blocks in a cluster */ - int nicluster; /* # of inodes in a cluster */ - int nimask; /* mask for inode clusters */ + int blks_per_cluster; /* # of blocks per cluster */ + int inodes_per_cluster;/* # of inodes per cluster */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int tmp; /* result value from btree calls */ @@ -243,11 +242,8 @@ xfs_bulkstat( *done = 0; fmterror = 0; ubufp = ubuffer; - nicluster = mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size ? 
- mp->m_sb.sb_inopblock : - (mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog); - nimask = ~(nicluster - 1); - nbcluster = nicluster >> mp->m_sb.sb_inopblog; + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); if (!irbuf) return ENOMEM; @@ -390,12 +386,12 @@ xfs_bulkstat( agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += nicluster, - agbno += nbcluster) { - if (xfs_inobt_maskn(chunkidx, nicluster) - & ~r.ir_free) + chunkidx += inodes_per_cluster, + agbno += blks_per_cluster) { + if (xfs_inobt_maskn(chunkidx, + inodes_per_cluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster, + agbno, blks_per_cluster, &xfs_inode_buf_ops); } blk_finish_plug(&plug); -- cgit v1.2.3-70-g09d2 From 6e0c7b8c3ea62d684af267d34fc015253e7cd6e5 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 13 Dec 2013 15:51:49 +1100 Subject: xfs: use xfs_icluster_size_fsb in xfs_ialloc_inode_init Use xfs_icluster_size_fsb() in xfs_ialloc_inode_init(), rename variable ninodes to inodes_per_cluster, the latter is more meaningful. Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ialloc.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index eacc59c7cb8..6ee592cfc0d 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -170,27 +170,20 @@ xfs_ialloc_inode_init( { struct xfs_buf *fbuf; struct xfs_dinode *free; - int blks_per_cluster, nbufs, ninodes; + int nbufs, blks_per_cluster, inodes_per_cluster; int version; int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; /* - * Loop over the new block(s), filling in the inodes. - * For small block sizes, manipulate the inodes in buffers - * which are multiples of the blocks size. + * Loop over the new block(s), filling in the inodes. For small block + * sizes, manipulate the inodes in buffers which are multiples of the + * blocks size. */ - if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) { - blks_per_cluster = 1; - nbufs = length; - ninodes = mp->m_sb.sb_inopblock; - } else { - blks_per_cluster = mp->m_inode_cluster_size / - mp->m_sb.sb_blocksize; - nbufs = length / blks_per_cluster; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = length / blks_per_cluster; /* * Figure out what version number to use in the inodes we create. If @@ -246,7 +239,7 @@ xfs_ialloc_inode_init( /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; uint isize = xfs_dinode_size(version); -- cgit v1.2.3-70-g09d2 From 982e939e4ddc8c00cb478fb4d725d0e3d18971cc Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 13 Dec 2013 15:51:49 +1100 Subject: xfs: use xfs_icluster_size_fsb in xfs_ifree_cluster Use xfs_icluster_size_fsb() in xfs_ifree_cluster(), rename variable ninodes to inodes_per_cluster, the latter is more meaningful. 
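The arithmetic these conversions rely on is small enough to check by hand. Below is a stand-alone sketch of the blocks-per-cluster and inodes-per-cluster calculation used throughout this series; the struct and field names are stand-ins for the real xfs_mount/xfs_sb members and are chosen only for illustration.

    #include <stdio.h>

    /* Stand-in for the handful of xfs_mount/xfs_sb fields the helper reads. */
    struct mount_geom {
        unsigned int blocksize;          /* sb_blocksize, in bytes */
        unsigned int blocklog;           /* sb_blocklog: log2(blocksize) */
        unsigned int inopblog;           /* sb_inopblog: log2(inodes per block) */
        unsigned int inode_cluster_size; /* m_inode_cluster_size, in bytes */
    };

    /* Mirrors xfs_icluster_size_fsb(): filesystem blocks per inode cluster. */
    static int icluster_size_fsb(const struct mount_geom *m)
    {
        if (m->blocksize >= m->inode_cluster_size)
            return 1;
        return m->inode_cluster_size >> m->blocklog;
    }

    int main(void)
    {
        struct mount_geom m = {
            .blocksize = 4096,
            .blocklog = 12,
            .inopblog = 4,               /* 16 x 256-byte inodes per block */
            .inode_cluster_size = 8192,  /* XFS_INODE_BIG_CLUSTER_SIZE */
        };
        int blks_per_cluster = icluster_size_fsb(&m);
        int inodes_per_cluster = blks_per_cluster << m.inopblog;

        printf("blocks/cluster=%d inodes/cluster=%d\n",
               blks_per_cluster, inodes_per_cluster);
        return 0;
    }

With a 4k block size and 256-byte inodes this gives 2 blocks and 32 inodes per 8k cluster, which is the kind of value the converted loops iterate over.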
Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0c8c334f0f1..833028cf205 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2141,8 +2141,8 @@ xfs_ifree_cluster( { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; + int inodes_per_cluster; int nbufs; - int ninodes; int i, j; xfs_daddr_t blkno; xfs_buf_t *bp; @@ -2152,18 +2152,11 @@ xfs_ifree_cluster( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) { - blks_per_cluster = 1; - ninodes = mp->m_sb.sb_inopblock; - nbufs = mp->m_ialloc_blks; - } else { - blks_per_cluster = mp->m_inode_cluster_size / - mp->m_sb.sb_blocksize; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - nbufs = mp->m_ialloc_blks / blks_per_cluster; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; - for (j = 0; j < nbufs; j++, inum += ninodes) { + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2225,7 +2218,7 @@ xfs_ifree_cluster( * transaction stale above, which means there is no point in * even trying to lock them. */ - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, -- cgit v1.2.3-70-g09d2 From f9e5abcfc5b299a988cf8f9d0ad11e03da14806b Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Fri, 13 Dec 2013 15:51:49 +1100 Subject: xfs: use xfs_icluster_size_fsb in xfs_imap Use xfs_icluster_size_fsb() in xfs_imap(). Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ialloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 6ee592cfc0d..5d7f105a1c8 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1377,7 +1377,7 @@ xfs_imap( return XFS_ERROR(EINVAL); } - blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; + blks_per_cluster = xfs_icluster_size_fsb(mp); /* * For bulkstat and handle lookups, we have an untrusted inode number @@ -1398,7 +1398,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ - if (mp->m_inode_cluster_size <= mp->m_sb.sb_blocksize) { + if (blks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); -- cgit v1.2.3-70-g09d2 From 9f12bd119e408388233e7aeb1152f372a8b5dcad Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 20 Sep 2013 19:55:31 +0800 Subject: ceph: drop unconnected inodes Positve dentry and corresponding inode are always accompanied in MDS reply. So no need to keep inode in the cache after dropping all its aliases. 
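For context on the hook being added here: the VFS calls ->drop_inode from iput_final() once the last reference to an inode goes away, and a nonzero return tells it to evict the inode immediately instead of keeping it on the inode cache. A filesystem that always wants that behaviour can use a trivial hook; the kernel-side sketch below shows the general VFS pattern with made-up names, it is not the ceph code itself.

    #include <linux/fs.h>

    static int example_drop_inode(struct inode *inode)
    {
        /* nonzero: do not keep this inode cached after the final iput() */
        return 1;
    }

    static const struct super_operations example_super_ops = {
        .drop_inode = example_drop_inode,
        /* .alloc_inode, .destroy_inode, ... as usual */
    };

generic_delete_inode() in fs/inode.c does the same thing and can be pointed at directly instead of an open-coded helper.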
Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/inode.c | 10 ++++++++++ fs/ceph/super.c | 1 + fs/ceph/super.h | 1 + 3 files changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 278fd289128..d37b2dc01d3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -436,6 +436,16 @@ void ceph_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ceph_i_callback); } +int ceph_drop_inode(struct inode *inode) +{ + /* + * Positve dentry and corresponding inode are always accompanied + * in MDS reply. So no need to keep inode in the cache after + * dropping all its aliases. + */ + return 1; +} + /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6a0951e4304..e58bd4a23bf 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -686,6 +686,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, + .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ef4ac38bb61..8de94b564d6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -691,6 +691,7 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); +extern int ceph_drop_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); -- cgit v1.2.3-70-g09d2 From e8344e668915a7488def414f016dbf7d9fce84b5 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Thu, 12 Sep 2013 13:54:26 +0800 Subject: ceph: Implement writev/pwritev for sync operation. For writev/pwritev sync-operatoin, ceph only do the first iov. I divided the write-sync-operation into two functions. One for direct-write, other for none-direct-sync-write. This is because for none-direct-sync-write we can merge iovs to one. But for direct-write, we can't merge iovs. Signed-off-by: Jianpeng Ma Reviewed-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/file.c | 273 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 193 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3de89829e2a..5cf034e915b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -489,83 +489,79 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) 
*/ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, size_t count) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; - int num_ops = 1; struct page **pages; int num_pages; - u64 len; int written = 0; int flags; int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; + int page_align; int ret; struct timespec mtime = CURRENT_TIME; - bool own_pages = false; + loff_t pos = iocb->ki_pos; + struct iov_iter i; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, pos, - (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + dout("sync_direct_write on file %p %lld~%u\n", file, pos, + (unsigned)count); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + left) >> PAGE_CACHE_SHIFT); + (pos + count) >> PAGE_CACHE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - num_ops++; /* Also include a 'startsync' command. */ - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - len = left; - - snapc = ci->i_snap_realm->cached_context; - vino = ceph_vino(inode); - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, num_ops, - CEPH_OSD_OP_WRITE, flags, snapc, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); + iov_iter_init(&i, iov, nr_segs, count, 0); + + while (iov_iter_count(&i) > 0) { + void __user *data = i.iov->iov_base + i.iov_offset; + u64 len = i.iov->iov_len - i.iov_offset; + + page_align = (unsigned long)data & ~PAGE_MASK; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, + 2,/*include a 'startsync' command*/ + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } - /* write from beginning of first page, regardless of io alignment */ - page_align = file->f_flags & O_DIRECT ? buf_align : io_align; - num_pages = calc_pages_for(page_align, len); - if (file->f_flags & O_DIRECT) { + num_pages = calc_pages_for(page_align, len); pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { ret = PTR_ERR(pages); @@ -577,60 +573,175 @@ more: * may block. 
*/ truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE-1)); - } else { + (pos+len) | (PAGE_CACHE_SIZE-1)); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_put_page_vector(pages, num_pages, false); + +out: + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + iov_iter_advance(&i, (size_t)len); + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + + if (ret != -EOLDSNAPC && written > 0) { + iocb->ki_pos = pos; + ret = written; + } + return ret; +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, size_t count) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + u64 len; + int num_pages; + int written = 0; + int flags; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + loff_t pos = iocb->ki_pos; + struct iov_iter i; + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK; + + iov_iter_init(&i, iov, nr_segs, count, 0); + + while ((len = iov_iter_count(&i)) > 0) { + size_t left; + int n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 1, + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + /* + * write from beginning of first page, + * regardless of io alignment + */ + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; } - ret = ceph_copy_user_to_page_vector(pages, data, pos, len); + + left = len; + for (n = 0; n < num_pages; n++) { + size_t plen = min(left, PAGE_SIZE); + ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); + if (ret != plen) { + ret = -EFAULT; + break; + } + left -= ret; + iov_iter_advance(&i, ret); + } + if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; } - if ((file->f_flags & O_SYNC) == 0) { - /* get a second commit callback */ - req->r_unsafe_callback = ceph_sync_write_unsafe; - 
req->r_inode = inode; - own_pages = true; - } - } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, - false, own_pages); + /* get a second commit callback */ + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + false, true); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, false); - else if (file->f_flags & O_SYNC) - ceph_release_page_vector(pages, num_pages); + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); out: - ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - left -= len; - data += len; - if (left) - goto more; + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + if (ret != -EOLDSNAPC && written > 0) { ret = written; - *ppos = pos; - if (pos > i_size_read(inode)) - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, - NULL); - } else if (ret != -EOLDSNAPC && written > 0) { - ret = written; + iocb->ki_pos = pos; } return ret; } @@ -772,11 +883,13 @@ retry_snap: inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) { + (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { mutex_unlock(&inode->i_mutex); - written = ceph_sync_write(file, iov->iov_base, count, - pos, &iocb->ki_pos); + if (file->f_flags & O_DIRECT) + written = ceph_sync_direct_write(iocb, iov, + nr_segs, count); + else + written = ceph_sync_write(iocb, iov, nr_segs, count); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", -- cgit v1.2.3-70-g09d2 From 8eb4efb091c8d8f70a0e6822288b043f8691ec51 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Thu, 26 Sep 2013 14:42:17 +0800 Subject: ceph: implement readv/preadv for sync operation For readv/preadv sync-operatoin, ceph only do the first iov. Now implement this. Signed-off-by: Jianpeng Ma Reviewed-by: Yan, Zheng --- fs/ceph/file.c | 162 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5cf034e915b..c4419e848a4 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -408,51 +408,92 @@ more: * * If the read spans object boundary, just do multiple reads. 
*/ -static ssize_t ceph_sync_read(struct file *file, char __user *data, - unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, + int *checkeof) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct page **pages; - u64 off = *poff; + u64 off = iocb->ki_pos; int num_pages, ret; + size_t len = i->count; - dout("sync_read on file %p %llu~%u %s\n", file, off, len, + dout("sync_read on file %p %llu~%u %s\n", file, off, + (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - - if (file->f_flags & O_DIRECT) { - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages, true); - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } - if (IS_ERR(pages)) - return PTR_ERR(pages); - /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, off, + off + len); if (ret < 0) - goto done; + return ret; - ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT, - (unsigned long)data & ~PAGE_MASK); + if (file->f_flags & O_DIRECT) { + while (iov_iter_count(i)) { + void __user *data = i->iov[0].iov_base + i->iov_offset; + size_t len = i->iov[0].iov_len - i->iov_offset; + + num_pages = calc_pages_for((unsigned long)data, len); + pages = ceph_get_direct_page_vector(data, + num_pages, true); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = striped_read(inode, off, len, + pages, num_pages, checkeof, + 1, (unsigned long)data & ~PAGE_MASK); + ceph_put_page_vector(pages, num_pages, true); + + if (ret <= 0) + break; + off += ret; + iov_iter_advance(i, ret); + if (ret < len) + break; + } + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof, 0, 0); + if (ret > 0) { + int l, k = 0; + size_t left = len = ret; + + while (left) { + void __user *data = i->iov[0].iov_base + + i->iov_offset; + l = min(i->iov[0].iov_len - i->iov_offset, + left); + + ret = ceph_copy_page_vector_to_user(&pages[k], + data, off, + l); + if (ret > 0) { + iov_iter_advance(i, ret); + left -= ret; + off += ret; + k = calc_pages_for(iocb->ki_pos, + len - left + 1) - 1; + BUG_ON(k >= num_pages && left); + } else + break; + } + } + ceph_release_page_vector(pages, num_pages); + } - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); - if (ret >= 0) - *poff = off + ret; + if (off > iocb->ki_pos) { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } -done: - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, true); - else - ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); return ret; } @@ -758,55 +799,84 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - loff_t *ppos = &iocb->ki_pos; - size_t len = iov->iov_len; + size_t len = iocb->ki_nbytes; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - void __user *base = iov->iov_base; ssize_t ret; int want, got = 0; int checkeof = 0, 
read = 0; - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", - inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) - goto out; - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)len, - ceph_cap_string(got)); + return ret; if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) + (fi->flags & CEPH_F_SYNC)) { + struct iov_iter i; + + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + + if (!read) { + ret = generic_segment_checks(iov, &nr_segs, + &len, VERIFY_WRITE); + if (ret) + goto out; + } + + iov_iter_init(&i, iov, nr_segs, len, read); + /* hmm, this isn't really async... */ - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); - else - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + ret = ceph_sync_read(iocb, &i, &checkeof); + } else { + /* + * We can't modify the content of iov, + * so we only read from beginning. + */ + if (read) { + iocb->ki_pos = pos; + len = iocb->ki_nbytes; + read = 0; + } + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)len, + ceph_cap_string(got)); + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + } out: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + int statret = ceph_do_getattr(inode, + CEPH_STAT_CAP_SIZE); /* hit EOF or hole? */ - if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); + if (statret == 0 && iocb->ki_pos < inode->i_size && + ret < len) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, + inode->i_size); + read += ret; - base += ret; len -= ret; checkeof = 0; goto again; } } + if (ret >= 0) ret += read; -- cgit v1.2.3-70-g09d2 From f36132a75aafd0086aeb0eacf348654138d56b49 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Wed, 27 Nov 2013 22:28:13 +0800 Subject: ceph: Clean up if error occurred in finish_read() Clean up if error occurred rather than going through normal process Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Signed-off-by: Sage Weil --- fs/ceph/addr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ec3ba43b9fa..c346b8479f9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -256,6 +256,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; + if (rc < 0) + goto unlock; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 
0 : bytes; @@ -266,6 +268,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) flush_dcache_page(page); SetPageUptodate(page); ceph_readpage_to_fscache(inode, page); +unlock: unlock_page(page); page_cache_release(page); bytes -= PAGE_CACHE_SIZE; -- cgit v1.2.3-70-g09d2 From aa8b60e077fa2a4383e79b092b65cd5455ea5ab2 Mon Sep 17 00:00:00 2001 From: Libo Chen Date: Wed, 11 Dec 2013 13:49:11 +0800 Subject: fs: ceph: new helper: file_inode(file) Signed-off-by: Libo Chen Signed-off-by: Sage Weil --- fs/ceph/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c4419e848a4..d10510a4733 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1201,7 +1201,7 @@ static long ceph_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct ceph_file_info *fi = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; -- cgit v1.2.3-70-g09d2 From ece35848c1847cdf3dd07954578d3e99238ebbae Mon Sep 17 00:00:00 2001 From: Dongmao Zhang Date: Tue, 10 Dec 2013 10:52:22 -0600 Subject: dlm: set zero linger time on sctp socket The recovery time for a failed node was taking a long time because the failed node could not perform the full shutdown process. Removing the linger time speeds this up. The dlm does not care what happens to messages to or from the failed node. Signed-off-by: Dongmao Zhang Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d90909ec6aa..a5e34dd6a32 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con, struct msghdr *msg, char *buf) { union sctp_notification *sn = (union sctp_notification *)buf; + struct linger linger; switch (sn->sn_header.sn_type) { case SCTP_SEND_FAILED: @@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con, } add_sock(new_con->sock, new_con); + linger.l_onoff = 1; + linger.l_linger = 0; + ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (ret < 0) + log_print("set socket option SO_LINGER failed"); + log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); -- cgit v1.2.3-70-g09d2 From bb8b9d095c5c56cce99576cfef0cf9b989f7120d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 16:02:55 -0500 Subject: kernfs: add @mode to kernfs_create_dir[_ns]() sysfs assumed 0755 for all newly created directories and kernfs inherited it. This assumption is unnecessarily restrictive and inconsistent with kernfs_create_file[_ns](). This patch adds @mode parameter to kernfs_create_dir[_ns]() and update uses in sysfs accordingly. Among others, this will be useful for implementations of the planned ->mkdir() method. This patch doesn't introduce any behavior differences. 
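The mode the sysfs callers pass after this change is the same 0755 that kernfs previously hard-coded, so the "no behavior differences" claim can be verified from the permission-bit arithmetic alone. A small user-space check follows; the *_UGO composites are kernel-only macros, so they are spelled out from the standard bits here.

    #include <stdio.h>
    #include <sys/stat.h>

    /* Kernel's S_IRUGO/S_IXUGO composites, built from the standard bits. */
    #define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)   /* 0444 */
    #define S_IXUGO (S_IXUSR | S_IXGRP | S_IXOTH)   /* 0111 */

    int main(void)
    {
        unsigned int mode = S_IRWXU | S_IRUGO | S_IXUGO;

        printf("%o\n", mode);   /* prints 755 */
        return 0;
    }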
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 9 +++++---- fs/sysfs/dir.c | 3 ++- fs/sysfs/group.c | 3 ++- include/linux/kernfs.h | 13 +++++++------ 4 files changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 6520066c49e..e55bb02f15a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -639,22 +639,23 @@ void kernfs_destroy_root(struct kernfs_root *root) * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory + * @mode: mode of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Returns the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, - const char *name, void *priv, - const void *ns) + const char *name, umode_t mode, + void *priv, const void *ns) { - umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; int rc; /* allocate */ - kn = kernfs_new_node(kernfs_root(parent), name, mode, KERNFS_DIR); + kn = kernfs_new_node(kernfs_root(parent), name, mode | S_IFDIR, + KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aa007401bfc..ee0d761c317 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -73,7 +73,8 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) if (!parent) return -ENOENT; - kn = kernfs_create_dir_ns(parent, kobject_name(kobj), kobj, ns); + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), + S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 4d00d399647..6b579387c67 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -100,7 +100,8 @@ static int internal_create_group(struct kobject *kobj, int update, return -EINVAL; } if (grp->name) { - kn = kernfs_create_dir(kobj->sd, grp->name, kobj); + kn = kernfs_create_dir(kobj->sd, grp->name, + S_IRWXU | S_IRUGO | S_IXUGO, kobj); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index e9c4e3a0396..0ca2aedfd31 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -210,8 +210,8 @@ struct kernfs_root *kernfs_create_root(void *priv); void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, - const char *name, void *priv, - const void *ns); + const char *name, umode_t mode, + void *priv, const void *ns); struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, @@ -260,8 +260,8 @@ static inline struct kernfs_root *kernfs_create_root(void *priv) static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * -kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, void *priv, - const void *ns) +kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, + umode_t mode, void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * @@ -314,9 +314,10 @@ kernfs_find_and_get(struct kernfs_node *kn, const char *name) } static inline struct kernfs_node * -kernfs_create_dir(struct kernfs_node *parent, const char *name, void *priv) +kernfs_create_dir(struct kernfs_node *parent, 
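The name-ownership rule this introduces is easy to lose track of in the later diffs, so a stripped-down illustration of the same pattern may help: names are duplicated by default, a flag records the ones that were not, and only duplicated names are freed. Everything below uses made-up names and user-space calls, with strdup/free standing in for kstrdup/kfree.

    #include <stdlib.h>
    #include <string.h>

    #define NODE_STATIC_NAME 0x1    /* plays the role of KERNFS_STATIC_NAME */

    struct node {
        const char *name;
        unsigned int flags;
    };

    static int node_set_name(struct node *n, const char *name, int name_is_static)
    {
        if (name_is_static) {
            n->name = name;                 /* caller guarantees lifetime */
            n->flags |= NODE_STATIC_NAME;
            return 0;
        }
        n->name = strdup(name);             /* kstrdup() in the kernel */
        return n->name ? 0 : -1;
    }

    static void node_put_name(struct node *n)
    {
        if (!(n->flags & NODE_STATIC_NAME))
            free((void *)n->name);          /* never free a static name */
        n->name = NULL;
    }

This is also why the follow-up kernfs_rename_ns() patch below has to clear the flag when it replaces a static name with a freshly allocated one, rather than unconditionally freeing the old name.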
const char *name, umode_t mode, + void *priv) { - return kernfs_create_dir_ns(parent, name, priv, NULL); + return kernfs_create_dir_ns(parent, name, mode, priv, NULL); } static inline struct kernfs_node * -- cgit v1.2.3-70-g09d2 From d0ae3d4347ee025cf23b850d484fe8593f7dd0f2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 16:02:56 -0500 Subject: kernfs: add REMOVED check to create and rename paths kernfs currently assumes that the caller doesn't try to create a new node under a removed parent, rename a removed node, or move a node under a removed node. While this works fine for sysfs, it'd be nice to have protection against such cases especially given that kernfs is planned to add support for mkdir, rmdir and rename requsts from userland which may make race conditions more likely. This patch updates create and rename paths to check REMOVED and fail the operation with -ENOENT if performed on or towards removed nodes. Note that remove path already has such check. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e55bb02f15a..ba5f372a226 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -430,6 +430,9 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, if (kernfs_type(parent) != KERNFS_DIR) return -EINVAL; + if (parent->flags & KERNFS_REMOVED) + return -ENOENT; + kn->hash = kernfs_name_hash(kn->name, kn->ns); kn->parent = parent; kernfs_get(parent); @@ -863,6 +866,10 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&kernfs_mutex); + error = -ENOENT; + if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) + goto out; + error = 0; if ((kn->parent == new_parent) && (kn->ns == new_ns) && (strcmp(kn->name, new_name) == 0)) -- cgit v1.2.3-70-g09d2 From 2063d608f5110d120db60e896ec2c70c95bb7978 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 16:02:57 -0500 Subject: kernfs: mark static names with KERNFS_STATIC_NAME Because sysfs used struct attribute which are supposed to stay constant, sysfs didn't copy names when creating regular files. The specified string for name was supposed to stay constant. Such distinction isn't inherent for kernfs. kernfs_create_file[_ns]() should be able to take the same @name as kernfs_create_dir[_ns]() As there can be huge number of sysfs attributes, we still want to be able to use static names for sysfs attributes. This patch renames kernfs_create_file_ns_key() to __kernfs_create_file() and adds @name_is_static parameter so that the caller can explicitly indicate that @name can be used without copying. kernfs is updated to use KERNFS_STATIC_NAME to distinguish static and copied names. This patch doesn't introduce any behavior changes. 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 8 ++++---- fs/kernfs/file.c | 23 +++++++++++++++-------- fs/kernfs/kernfs-internal.h | 2 +- fs/sysfs/file.c | 4 ++-- include/linux/kernfs.h | 27 ++++++++++++++------------- 5 files changed, 36 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index ba5f372a226..e1681775abd 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -240,7 +240,7 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - if (kernfs_type(kn) & KERNFS_COPY_NAME) + if (!(kn->flags & KERNFS_STATIC_NAME)) kfree(kn->name); if (kn->iattr) { if (kn->iattr->ia_secdata) @@ -336,13 +336,13 @@ const struct dentry_operations kernfs_dops = { }; struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, - umode_t mode, int type) + umode_t mode, unsigned flags) { char *dup_name = NULL; struct kernfs_node *kn; int ret; - if (type & KERNFS_COPY_NAME) { + if (!(flags & KERNFS_STATIC_NAME)) { name = dup_name = kstrdup(name, GFP_KERNEL); if (!name) return NULL; @@ -362,7 +362,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, kn->name = name; kn->mode = mode; - kn->flags = type | KERNFS_REMOVED; + kn->flags = flags | KERNFS_REMOVED; return kn; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 053cfd9a6a4..316604cc3a1 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -762,7 +762,7 @@ const struct file_operations kernfs_file_fops = { }; /** - * kernfs_create_file_ns_key - create a file + * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file @@ -770,23 +770,30 @@ const struct file_operations kernfs_file_fops = { * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file + * @static_name: don't copy file name * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. 
*/ -struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, - const char *name, - umode_t mode, loff_t size, - const struct kernfs_ops *ops, - void *priv, const void *ns, - struct lock_class_key *key) +struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + bool name_is_static, + struct lock_class_key *key) { struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; + unsigned flags; int rc; + flags = KERNFS_FILE; + if (name_is_static) + flags |= KERNFS_STATIC_NAME; + kn = kernfs_new_node(kernfs_root(parent), name, - (mode & S_IALLUGO) | S_IFREG, KERNFS_FILE); + (mode & S_IALLUGO) | S_IFREG, flags); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index a4ff491fd59..c6ba5bc37a9 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -105,7 +105,7 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, struct kernfs_node *parent); void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, - umode_t mode, int type); + umode_t mode, unsigned flags); /* * file.c diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index fe6388fbd15..810cf6e613e 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -252,8 +252,8 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif - kn = kernfs_create_file_ns_key(parent, attr->name, mode, size, - ops, (void *)attr, ns, key); + kn = __kernfs_create_file(parent, attr->name, mode, size, ops, + (void *)attr, ns, true, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 0ca2aedfd31..321ed84ad4c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -34,7 +34,6 @@ enum kernfs_node_type { }; #define KERNFS_TYPE_MASK 0x000f -#define KERNFS_COPY_NAME (KERNFS_DIR | KERNFS_LINK) #define KERNFS_ACTIVE_REF KERNFS_FILE #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK @@ -44,6 +43,7 @@ enum kernfs_node_flag { KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, + KERNFS_STATIC_NAME = 0x0200, }; /* type-specific structures for kernfs_node union members */ @@ -212,12 +212,13 @@ void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, void *priv, const void *ns); -struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent, - const char *name, - umode_t mode, loff_t size, - const struct kernfs_ops *ops, - void *priv, const void *ns, - struct lock_class_key *key); +struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + bool name_is_static, + struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); @@ -265,10 +266,10 @@ kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * -kernfs_create_file_ns_key(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, - const struct kernfs_ops *ops, void *priv, - const void *ns, struct lock_class_key 
*key) +__kernfs_create_file(struct kernfs_node *parent, const char *name, + umode_t mode, loff_t size, const struct kernfs_ops *ops, + void *priv, const void *ns, bool name_is_static, + struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * @@ -330,8 +331,8 @@ kernfs_create_file_ns(struct kernfs_node *parent, const char *name, #ifdef CONFIG_DEBUG_LOCK_ALLOC key = (struct lock_class_key *)&ops->lockdep_key; #endif - return kernfs_create_file_ns_key(parent, name, mode, size, ops, priv, - ns, key); + return __kernfs_create_file(parent, name, mode, size, ops, priv, ns, + false, key); } static inline struct kernfs_node * -- cgit v1.2.3-70-g09d2 From 47a52e91f485dfd935042dbd2f66df1ac3fdfbb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 16:02:58 -0500 Subject: kernfs: update kernfs_rename_ns() to consider KERNFS_STATIC_NAME kernfs_rename_ns() currently assumes that the target sysfs_dirent has a copied name. This has been okay because sysfs supports rename only for directories which always have copied names; however, there's nothing in kernfs interface which calls for such restriction and currently invoking kernfs_rename_ns() on a regular file leads to oops because it ends up trying to kfree() a static name. This patch updates kernfs_rename_ns() so that it skips kfree() of the old name if it's static. This allows it to be used for all node types. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e1681775abd..d33af95321f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -886,7 +886,11 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, if (!new_name) goto out; - kfree(kn->name); + if (kn->flags & KERNFS_STATIC_NAME) + kn->flags &= ~KERNFS_STATIC_NAME; + else + kfree(kn->name); + kn->name = new_name; } -- cgit v1.2.3-70-g09d2 From 19bbb926203dbcf3a03915e934c36d7681bf6e13 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 16:02:59 -0500 Subject: kernfs: allow negative dentries kernfs doesn't allow negative dentries - kernfs_iop_lookup() returns ERR_PTR(-ENOENT) instead of NULL which short-circuits negative dentry creation and kernfs's d_delete() callback, kernfs_dop_delete(), returns 1 for all removed nodes. This in turn allows kernfs_dop_revalidate() to assume that there's no negative dentry for kernfs. This worked fine for sysfs but kernfs is scheduled to grow mkdir(2) support which depend on negative dentries. This patch updates so that kernfs allows negative dentries. The required changes are almost trivial - kernfs_iop_lookup() now returns NULL instead of ERR_PTR(-ENOENT) when the target kernfs_node doesn't exist, kernfs_dop_delete() is removed and kernfs_dop_revalidate() is updated to check whether the target dentry is negative and request fresh lookup if so. 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index d33af95321f..42c5b9f23b4 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -264,12 +264,6 @@ void kernfs_put(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_put); -static int kernfs_dop_delete(const struct dentry *dentry) -{ - struct kernfs_node *kn = dentry->d_fsdata; - return !(kn && !(kn->flags & KERNFS_REMOVED)); -} - static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; @@ -277,6 +271,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; + /* Always perform fresh lookup for negatives */ + if (!dentry->d_inode) + goto out_bad_unlocked; + kn = dentry->d_fsdata; mutex_lock(&kernfs_mutex); @@ -301,22 +299,14 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) out_valid: return 1; out_bad: - /* - * Remove the dentry from the dcache hashes. - * If this is a deleted dentry we use d_drop instead of d_delete - * so kernfs doesn't need to cope with negative dentries. - * - * If this is a dentry that has simply been renamed we - * use d_drop to remove it from the dcache lookup on its - * old parent. If this dentry persists later when a lookup - * is performed at its new name the dentry will be readded - * to the dcache hashes. - */ mutex_unlock(&kernfs_mutex); - - /* If we have submounts we must allow the vfs caches - * to lie about the state of the filesystem to prevent - * leaks and other nasty things. +out_bad_unlocked: + /* + * @dentry doesn't match the underlying kernfs node, drop the + * dentry and force lookup. If we have submounts we must allow the + * vfs caches to lie about the state of the filesystem to prevent + * leaks and other nasty things, so use check_submounts_and_drop() + * instead of d_drop(). */ if (check_submounts_and_drop(dentry) != 0) goto out_valid; @@ -331,7 +321,6 @@ static void kernfs_dop_release(struct dentry *dentry) const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, - .d_delete = kernfs_dop_delete, .d_release = kernfs_dop_release, }; @@ -682,7 +671,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret = NULL; + struct dentry *ret; struct kernfs_node *parent = dentry->d_parent->d_fsdata; struct kernfs_node *kn; struct inode *inode; @@ -697,7 +686,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, /* no such entry */ if (!kn) { - ret = ERR_PTR(-ENOENT); + ret = NULL; goto out_unlock; } kernfs_get(kn); -- cgit v1.2.3-70-g09d2 From 80b9bbefc345079bddc4959de016ba4074b0c8d6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 16:03:00 -0500 Subject: kernfs: add kernfs_dir_ops Add support for mkdir(2), rmdir(2) and rename(2) syscalls. This is implemented through optional kernfs_dir_ops callback table which can be specified on kernfs_create_root(). An implemented callback is invoked when the matching syscall is invoked. As kernfs keep dcache syncs with internal representation and revalidates dentries on each access, the implementation of these methods is extremely simple. 
Each just discovers the relevant kernfs_node(s) and invokes the requested callback which is allowed to do any kernfs operations and the end result doesn't necessarily have to match the expected semantics of the syscall. This will be used to convert cgroup to use kernfs instead of its own filesystem implementation. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 44 +++++++++++++++++++++++++++++++++++++++++++- fs/sysfs/mount.c | 2 +- include/linux/kernfs.h | 21 +++++++++++++++++++-- 3 files changed, 63 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 42c5b9f23b4..510b5062ef3 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -583,12 +583,13 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_create_root - create a new kernfs hierarchy + * @kdops: optional directory syscall operations for the hierarchy * @priv: opaque data associated with the new directory * * Returns the root of the new hierarchy on success, ERR_PTR() value on * failure. */ -struct kernfs_root *kernfs_create_root(void *priv) +struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; @@ -610,6 +611,7 @@ struct kernfs_root *kernfs_create_root(void *priv) kn->priv = priv; kn->dir.root = root; + root->dir_ops = kdops; root->kn = kn; return root; @@ -706,6 +708,42 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return ret; } +static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; + + if (!kdops || !kdops->mkdir) + return -EPERM; + + return kdops->mkdir(parent, dentry->d_name.name, mode); +} + +static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + + if (!kdops || !kdops->rmdir) + return -EPERM; + + return kdops->rmdir(kn); +} + +static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *new_parent = new_dir->i_private; + struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + + if (!kdops || !kdops->rename) + return -EPERM; + + return kdops->rename(kn, new_parent, new_dentry->d_name.name); +} + const struct inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, @@ -715,6 +753,10 @@ const struct inode_operations kernfs_dir_iops = { .removexattr = kernfs_iop_removexattr, .getxattr = kernfs_iop_getxattr, .listxattr = kernfs_iop_listxattr, + + .mkdir = kernfs_iop_mkdir, + .rmdir = kernfs_iop_rmdir, + .rename = kernfs_iop_rename, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 701a56f341c..6211230814f 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -62,7 +62,7 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL); + sysfs_root = kernfs_create_root(NULL, NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 321ed84ad4c..d2c439db4ef 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -111,12 +111,27 @@ struct kernfs_node { struct kernfs_iattrs *iattr; }; +/* + * kernfs_dir_ops may be 
specified on kernfs_create_root() to support + * directory manipulation syscalls. These optional callbacks are invoked + * on the matching syscalls and can perform any kernfs operations which + * don't necessarily have to be the exact operation requested. + */ +struct kernfs_dir_ops { + int (*mkdir)(struct kernfs_node *parent, const char *name, + umode_t mode); + int (*rmdir)(struct kernfs_node *kn); + int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name); +}; + struct kernfs_root { /* published fields */ struct kernfs_node *kn; /* private fields, do not use outside kernfs proper */ struct ida ino_ida; + struct kernfs_dir_ops *dir_ops; }; struct kernfs_open_file { @@ -206,7 +221,8 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); -struct kernfs_root *kernfs_create_root(void *priv); +struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, + void *priv); void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, @@ -255,7 +271,8 @@ kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } -static inline struct kernfs_root *kernfs_create_root(void *priv) +static inline struct kernfs_root * +kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } -- cgit v1.2.3-70-g09d2 From 30ba7ad54335e4715d3cc9cc8f43cbf1b3535e46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:07 -0800 Subject: xfs: no need to lock the inode in xfs_find_handle Both the inode number and the generation do not change on a live inode. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_ioctl.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 33ad9a77791..518aa56b8f2 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -112,15 +112,11 @@ xfs_find_handle( memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); hsize = sizeof(xfs_fsid_t); } else { - int lock_mode; - - lock_mode = xfs_ilock_map_shared(ip); handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = ip->i_d.di_gen; handle.ha_fid.fid_ino = ip->i_ino; - xfs_iunlock_map_shared(ip, lock_mode); hsize = XFS_HSIZE(handle); } -- cgit v1.2.3-70-g09d2 From 01f4f3277556d4f4f833371db0219b0ca11c5409 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:08 -0800 Subject: xfs: remove xfs_iunlock_map_shared We can just use xfs_iunlock without any loss of clarity. 
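A minimal sketch of the calling convention that remains after this change (the inode and surrounding code are hypothetical, only the two helpers are real):

	struct xfs_inode *ip;	/* hypothetical inode, locked only to read its extent map */
	uint lock_mode;

	lock_mode = xfs_ilock_map_shared(ip);	/* returns the mode actually taken */
	/* ... read-only access to the extent map ... */
	xfs_iunlock(ip, lock_mode);		/* was xfs_iunlock_map_shared(ip, lock_mode) */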
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap_util.c | 2 +- fs/xfs/xfs_file.c | 4 ++-- fs/xfs/xfs_inode.c | 17 ++--------------- fs/xfs/xfs_inode.h | 1 - 4 files changed, 5 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5887e41c032..2f32d7ee141 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -737,7 +737,7 @@ xfs_getbmap( out_free_map: kmem_free(map); out_unlock_ilock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52c91e14372..349bfa28aa3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1294,7 +1294,7 @@ out: offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; @@ -1402,7 +1402,7 @@ out: offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 001aa893ed5..967f90625ea 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -88,8 +88,7 @@ xfs_get_extsz_hint( * have been read in yet, and only lock the inode exclusively if they have not. * * The function returns a value which should be given to the corresponding - * xfs_iunlock_map_shared(). This value is the mode in which the lock was - * actually taken. + * xfs_iunlock() call. */ uint xfs_ilock_map_shared( @@ -109,18 +108,6 @@ xfs_ilock_map_shared( return lock_mode; } -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) -{ - xfs_iunlock(ip, lock_mode); -} - /* * The xfs inode contains 2 locks: a multi-reader lock called the * i_iolock and a multi-reader lock called the i_lock. This routine @@ -590,7 +577,7 @@ xfs_lookup( lock_mode = xfs_ilock_map_shared(dp); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); - xfs_iunlock_map_shared(dp, lock_mode); + xfs_iunlock(dp, lock_mode); if (error) goto out; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 9e6efccbae0..5e2bd17cf2b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -338,7 +338,6 @@ void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); -void xfs_iunlock_map_shared(xfs_inode_t *, uint); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); -- cgit v1.2.3-70-g09d2 From 309ecac8e7c937c5811ef8f0efc14b3d1bd18775 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:09 -0800 Subject: xfs: rename xfs_ilock_map_shared Make it clear that we're only locking against the extent map on the data fork. Also clean the function up a little bit. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_bmap_util.c | 2 +- fs/xfs/xfs_file.c | 6 +++--- fs/xfs/xfs_inode.c | 17 ++++++----------- fs/xfs/xfs_inode.h | 2 +- 5 files changed, 12 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71c8c9d2b88..a26739451b5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1217,7 +1217,7 @@ __xfs_get_blocks( lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { - lockmode = xfs_ilock_map_shared(ip); + lockmode = xfs_ilock_data_map_shared(ip); } ASSERT(offset <= mp->m_super->s_maxbytes); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2f32d7ee141..460aeb87c04 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -632,7 +632,7 @@ xfs_getbmap( */ } - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); /* * Don't let nex be bigger than the number of extents diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 349bfa28aa3..e0012159263 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -912,7 +912,7 @@ xfs_dir_open( * If there are any blocks, read-ahead block 0 as we're almost * certain to have the next operation be a read there. */ - mode = xfs_ilock_map_shared(ip); + mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) xfs_dir3_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); @@ -1215,7 +1215,7 @@ xfs_seek_data( uint lock; int error; - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1319,7 +1319,7 @@ xfs_seek_hole( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 967f90625ea..fdd48378336 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -91,20 +91,15 @@ xfs_get_extsz_hint( * xfs_iunlock() call. 
*/ uint -xfs_ilock_map_shared( - xfs_inode_t *ip) +xfs_ilock_data_map_shared( + struct xfs_inode *ip) { - uint lock_mode; + uint lock_mode = XFS_ILOCK_SHARED; - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - xfs_ilock(ip, lock_mode); - return lock_mode; } @@ -575,7 +570,7 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); - lock_mode = xfs_ilock_map_shared(dp); + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); xfs_iunlock(dp, lock_mode); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 5e2bd17cf2b..fde368624ea 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -337,7 +337,7 @@ int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); -uint xfs_ilock_map_shared(xfs_inode_t *); +uint xfs_ilock_data_map_shared(struct xfs_inode *); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); -- cgit v1.2.3-70-g09d2 From efa70be165497826f674846f681e6e2364af906c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Dec 2013 02:14:39 -0800 Subject: xfs: add xfs_ilock_attr_map_shared Equivalent to xfs_ilock_data_map_shared, except for the attribute fork. Make xfs_getbmap use it if called for the attribute fork instead of xfs_ilock_data_map_shared. Signed-off-by: Christoph Hellwig Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap_util.c | 27 ++++++++++++++++----------- fs/xfs/xfs_inode.c | 34 ++++++++++++++++++++++++---------- fs/xfs/xfs_inode.h | 1 + 3 files changed, 41 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 460aeb87c04..374ba050942 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -617,22 +617,27 @@ xfs_getbmap( return XFS_ERROR(ENOMEM); xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { + if (whichfork == XFS_DATA_FORK) { + if (!(iflags & BMV_IF_DELALLOC) && + (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; + + /* + * Even after flushing the inode, there can still be + * delalloc blocks on the inode beyond EOF due to + * speculative preallocation. These are not removed + * until the release function is called or the inode + * is inactivated. Hence we cannot assert here that + * ip->i_delayed_blks == 0. + */ } - /* - * even after flushing the inode, there can still be delalloc - * blocks on the inode beyond EOF due to speculative - * preallocation. These are not removed until the release - * function is called or the inode is inactivated. Hence we - * cannot assert here that ip->i_delayed_blks == 0. 
- */ - } - lock = xfs_ilock_data_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); + } else { + lock = xfs_ilock_attr_map_shared(ip); + } /* * Don't let nex be bigger than the number of extents diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fdd48378336..e655bb07e8b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -77,17 +77,18 @@ xfs_get_extsz_hint( } /* - * This is a wrapper routine around the xfs_ilock() routine used to centralize - * some grungy code. It is used in places that wish to lock the inode solely - * for reading the extents. The reason these places can't just call - * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode is in b-tree - * format, then we need to lock the inode exclusively until the extents are read - * in. Locking it exclusively all the time would limit our parallelism - * unnecessarily, though. What we do instead is check to see if the extents - * have been read in yet, and only lock the inode exclusively if they have not. + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents. The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. * - * The function returns a value which should be given to the corresponding + * The functions return a value which should be given to the corresponding * xfs_iunlock() call. */ uint @@ -103,6 +104,19 @@ xfs_ilock_data_map_shared( return lock_mode; } +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; +} + /* * The xfs inode contains 2 locks: a multi-reader lock called the * i_iolock and a multi-reader lock called the i_lock. This routine diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fde368624ea..65e2350f449 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -338,6 +338,7 @@ void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); +uint xfs_ilock_attr_map_shared(struct xfs_inode *); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); -- cgit v1.2.3-70-g09d2 From 40194ecc6d78327d98e66de3213db96ca0a31e6f Mon Sep 17 00:00:00 2001 From: Ben Myers Date: Fri, 6 Dec 2013 12:30:11 -0800 Subject: xfs: reinstate the ilock in xfs_readdir Although it was removed in commit 051e7cd44ab8, ilock needs to be taken in xfs_readdir because we might have to read the extent list in from disk. This keeps other threads from reading from or writing to the extent list while it is being read in and is still in a transitional state. 
This has been associated with "Access to block zero" messages on directories with large numbers of extents resulting from excessive filesytem fragmentation, as well as extent list corruption. Unfortunately no test case at this point. Signed-off-by: Ben Myers Reviewed-by: Dave Chinner --- fs/xfs/xfs_dir2_readdir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index c4e50c6ed58..aead369e1c3 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -674,6 +674,7 @@ xfs_readdir( { int rval; /* return value */ int v; /* type-checking value */ + uint lock_mode; trace_xfs_readdir(dp); @@ -683,6 +684,7 @@ xfs_readdir( ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); + lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(dp, ctx); else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) @@ -691,5 +693,7 @@ xfs_readdir( rval = xfs_dir2_block_getdents(dp, ctx); else rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); + xfs_iunlock(dp, lock_mode); + return rval; } -- cgit v1.2.3-70-g09d2 From 4f317369d46956ccd76b5d28cf66b3f8b24f3480 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:12 -0800 Subject: xfs: take the ilock around xfs_bmapi_read in xfs_zero_remaining_bytes Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap_util.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 374ba050942..202a51f7c45 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1173,9 +1173,15 @@ xfs_zero_remaining_bytes( xfs_buf_unlock(bp); for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + uint lock_mode; + offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; + + lock_mode = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + xfs_iunlock(ip, lock_mode); + if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); -- cgit v1.2.3-70-g09d2 From f4df8adc8325127ff015ef9c2a8f005edaaedd07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:13 -0800 Subject: xfs: use xfs_ilock_data_map_shared in xfs_qm_dqtobp We might not have read in the extent list at this point, so make sure we take the ilock exclusively if we have to do so. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_dquot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 6b1e695caf0..7aeb4c895b3 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -469,16 +469,17 @@ xfs_qm_dqtobp( struct xfs_mount *mp = dqp->q_mount; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); struct xfs_trans *tp = (tpp ? *tpp : NULL); + uint lock_mode; dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; - xfs_ilock(quotip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. 
*/ - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); return ESRCH; } @@ -488,7 +489,7 @@ xfs_qm_dqtobp( error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); if (error) return error; -- cgit v1.2.3-70-g09d2 From da51d32d4596a14ee33917b9eca056d4bf41706a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:14 -0800 Subject: xfs: use xfs_ilock_data_map_shared in xfs_qm_dqiterate We might not have read in the extent list at this point, so make sure we take the ilock exclusively if we have to do so. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index dd88f0e27bd..348e4d2ed6e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1222,16 +1222,18 @@ xfs_qm_dqiterate( lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { + uint lock_mode; + nmaps = XFS_DQITER_MAP_SIZE; /* * We aren't changing the inode itself. Just changing * some of its data. No new blocks are added here, and * the inode is never added to the transaction. */ - xfs_ilock(qip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(qip); error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, map, &nmaps, 0); - xfs_iunlock(qip, XFS_ILOCK_SHARED); + xfs_iunlock(qip, lock_mode); if (error) break; -- cgit v1.2.3-70-g09d2 From 683cb941598d1d81283c940c100e0ce40f494105 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:15 -0800 Subject: xfs: use xfs_ilock_attr_map_shared in xfs_attr_get We might not have read in the extent list at this point, so make sure we take the ilock exclusively if we have to do so. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index b86127072ac..01b6a0102fb 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -164,6 +164,7 @@ xfs_attr_get( { int error; struct xfs_name xname; + uint lock_mode; XFS_STATS_INC(xs_attr_get); @@ -174,9 +175,9 @@ xfs_attr_get( if (error) return error; - xfs_ilock(ip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_attr_map_shared(ip); error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_iunlock(ip, lock_mode); return(error); } -- cgit v1.2.3-70-g09d2 From 568d994e9f53657cb6b3e9c95a83c130d36f83c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:16 -0800 Subject: xfs: use xfs_ilock_attr_map_shared in xfs_attr_list_int We might not have read in the extent list at this point, so make sure we take the ilock exclusively if we have to do so. 
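A rough sketch of the pattern this series converts read-only attribute-fork callers to (the caller shown here is hypothetical; only the two helpers are taken from the patches):

	struct xfs_inode *ip;	/* hypothetical inode whose attr fork is being read */
	uint lock_mode;

	lock_mode = xfs_ilock_attr_map_shared(ip);	/* SHARED, or EXCL if the attr
							 * fork extents still have to
							 * be read in from disk */
	/* ... walk the attribute fork extent map ... */
	xfs_iunlock(ip, lock_mode);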
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_attr_list.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 2d174b12815..01db96f60cf 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -507,17 +507,17 @@ xfs_attr_list_int( { int error; xfs_inode_t *dp = context->dp; + uint lock_mode; XFS_STATS_INC(xs_attr_list); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return EIO; - xfs_ilock(dp, XFS_ILOCK_SHARED); - /* * Decide on what work routines to call based on the inode size. */ + lock_mode = xfs_ilock_attr_map_shared(dp); if (!xfs_inode_hasattr(dp)) { error = 0; } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { @@ -527,9 +527,7 @@ xfs_attr_list_int( } else { error = xfs_attr_node_list(context); } - - xfs_iunlock(dp, XFS_ILOCK_SHARED); - + xfs_iunlock(dp, lock_mode); return error; } -- cgit v1.2.3-70-g09d2 From eef334e5776c8ef547ada4cec17549929fe590b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Dec 2013 12:30:17 -0800 Subject: xfs: assert that we hold the ilock for extent map access Make sure that xfs_bmapi_read has the ilock held in some way, and that xfs_bmapi_write, xfs_bmapi_delay, xfs_bunmapi and xfs_iread_extents are called with the ilock held exclusively. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap.c | 4 ++++ fs/xfs/xfs_inode_fork.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8401f11f378..dbf27384b74 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4002,6 +4002,7 @@ xfs_bmapi_read( ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| XFS_BMAPI_IGSTATE))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -4196,6 +4197,7 @@ xfs_bmapi_delay( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && @@ -4489,6 +4491,7 @@ xfs_bmapi_write( ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5040,6 +5043,7 @@ xfs_bunmapi( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(len > 0); ASSERT(nexts >= 0); diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c index cfee14a83cf..e16985e1c2f 100644 --- a/fs/xfs/xfs_inode_fork.c +++ b/fs/xfs/xfs_inode_fork.c @@ -431,6 +431,8 @@ xfs_iread_extents( xfs_ifork_t *ifp; xfs_extnum_t nextents; + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, ip->i_mount); -- cgit v1.2.3-70-g09d2 From 77d84ff87e9d38072abcca665ca22cb1da41cb86 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 9 Dec 2013 00:22:53 +0900 Subject: treewide: Fix typos in printk Correct spelling typo in various part of kernel Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- arch/arm/mach-imx/mach-pca100.c | 2 +- arch/arm64/kvm/handle_exit.c | 2 +- 
arch/mips/ralink/cevt-rt3352.c | 2 +- arch/mips/ralink/timer.c | 2 +- arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c | 2 +- drivers/char/msm_smd_pkt.c | 2 +- drivers/devfreq/exynos/exynos5_bus.c | 2 +- drivers/gpu/drm/exynos/exynos_drm_fimc.c | 2 +- drivers/gpu/drm/exynos/exynos_drm_gsc.c | 4 ++-- drivers/i2c/busses/i2c-exynos5.c | 2 +- drivers/iio/adc/twl6030-gpadc.c | 2 +- drivers/isdn/hisax/hfc4s8s_l1.c | 2 +- drivers/media/platform/ti-vpe/vpdma.c | 4 ++-- drivers/net/wireless/libertas/cmdresp.c | 2 +- drivers/video/udlfb.c | 2 +- fs/btrfs/tests/free-space-tests.c | 2 +- net/nfc/digital_dep.c | 2 +- 17 files changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/arch/arm/mach-imx/mach-pca100.c b/arch/arm/mach-imx/mach-pca100.c index c5f95674e9b..bf3ac51d5ac 100644 --- a/arch/arm/mach-imx/mach-pca100.c +++ b/arch/arm/mach-imx/mach-pca100.c @@ -249,7 +249,7 @@ static int pca100_sdhc2_init(struct device *dev, irq_handler_t detect_irq, "imx-mmc-detect", data); if (ret) printk(KERN_ERR - "pca100: Failed to reuest irq for sd/mmc detection\n"); + "pca100: Failed to request irq for sd/mmc detection\n"); return ret; } diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 8da56067c30..42a0f1bddfe 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -90,7 +90,7 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) if (hsr_ec >= ARRAY_SIZE(arm_exit_handlers) || !arm_exit_handlers[hsr_ec]) { - kvm_err("Unkown exception class: hsr: %#08x\n", + kvm_err("Unknown exception class: hsr: %#08x\n", (unsigned int)kvm_vcpu_get_hsr(vcpu)); BUG(); } diff --git a/arch/mips/ralink/cevt-rt3352.c b/arch/mips/ralink/cevt-rt3352.c index cc17566d193..24bf057a361 100644 --- a/arch/mips/ralink/cevt-rt3352.c +++ b/arch/mips/ralink/cevt-rt3352.c @@ -138,7 +138,7 @@ static void __init ralink_systick_init(struct device_node *np) clockevents_register_device(&systick.dev); - pr_info("%s: runing - mult: %d, shift: %d\n", + pr_info("%s: running - mult: %d, shift: %d\n", np->name, systick.dev.mult, systick.dev.shift); } diff --git a/arch/mips/ralink/timer.c b/arch/mips/ralink/timer.c index 20278570944..e38692a44e6 100644 --- a/arch/mips/ralink/timer.c +++ b/arch/mips/ralink/timer.c @@ -147,7 +147,7 @@ static int rt_timer_probe(struct platform_device *pdev) rt_timer_config(rt, 2); rt_timer_enable(rt); - dev_info(&pdev->dev, "maximum frequncy is %luHz\n", rt->timer_freq); + dev_info(&pdev->dev, "maximum frequency is %luHz\n", rt->timer_freq); return 0; } diff --git a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c index be7b1aa4d54..37f7a89c10f 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c @@ -245,7 +245,7 @@ static irqreturn_t mpc52xx_lpbfifo_irq(int irq, void *dev_id) if (dma && !write) { spin_unlock_irqrestore(&lpbfifo.lock, flags); - pr_err("bogus LPBFIFO IRQ (dma and not writting)\n"); + pr_err("bogus LPBFIFO IRQ (dma and not writing)\n"); return IRQ_HANDLED; } diff --git a/drivers/char/msm_smd_pkt.c b/drivers/char/msm_smd_pkt.c index 8eca55deb3a..ba82a06d968 100644 --- a/drivers/char/msm_smd_pkt.c +++ b/drivers/char/msm_smd_pkt.c @@ -182,7 +182,7 @@ static int smd_pkt_write(struct file *file, const char __user *buf, if (count > MAX_BUF_SIZE) return -EINVAL; - DBG("writting %d bytes\n", count); + DBG("writing %d bytes\n", count); smd_pkt_devp = file->private_data; if (!smd_pkt_devp || !smd_pkt_devp->ch) diff --git 
a/drivers/devfreq/exynos/exynos5_bus.c b/drivers/devfreq/exynos/exynos5_bus.c index a60da3c1c48..6eef1f7397c 100644 --- a/drivers/devfreq/exynos/exynos5_bus.c +++ b/drivers/devfreq/exynos/exynos5_bus.c @@ -152,7 +152,7 @@ static int exynos5_busfreq_int_target(struct device *dev, unsigned long *_freq, if (old_freq == freq) return 0; - dev_dbg(dev, "targetting %lukHz %luuV\n", freq, volt); + dev_dbg(dev, "targeting %lukHz %luuV\n", freq, volt); mutex_lock(&data->lock); diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimc.c b/drivers/gpu/drm/exynos/exynos_drm_fimc.c index 8adfc8f1e08..30d76b2ff9c 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimc.c @@ -345,7 +345,7 @@ static bool fimc_check_ovf(struct fimc_context *ctx) fimc_write(cfg, EXYNOS_CIWDOFST); - dev_err(ippdrv->dev, "occured overflow at %d, status 0x%x.\n", + dev_err(ippdrv->dev, "occurred overflow at %d, status 0x%x.\n", ctx->id, status); return true; } diff --git a/drivers/gpu/drm/exynos/exynos_drm_gsc.c b/drivers/gpu/drm/exynos/exynos_drm_gsc.c index cd6aebd53bd..fa75059a610 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gsc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gsc.c @@ -1301,13 +1301,13 @@ static irqreturn_t gsc_irq_handler(int irq, void *dev_id) status = gsc_read(GSC_IRQ); if (status & GSC_IRQ_STATUS_OR_IRQ) { - dev_err(ippdrv->dev, "occured overflow at %d, status 0x%x.\n", + dev_err(ippdrv->dev, "occurred overflow at %d, status 0x%x.\n", ctx->id, status); return IRQ_NONE; } if (status & GSC_IRQ_STATUS_OR_FRM_DONE) { - dev_dbg(ippdrv->dev, "occured frame done at %d, status 0x%x.\n", + dev_dbg(ippdrv->dev, "occurred frame done at %d, status 0x%x.\n", ctx->id, status); buf_id[EXYNOS_DRM_OPS_SRC] = gsc_get_src_buf_index(ctx); diff --git a/drivers/i2c/busses/i2c-exynos5.c b/drivers/i2c/busses/i2c-exynos5.c index c1ef228095b..044f85b01d0 100644 --- a/drivers/i2c/busses/i2c-exynos5.c +++ b/drivers/i2c/busses/i2c-exynos5.c @@ -571,7 +571,7 @@ static int exynos5_i2c_xfer(struct i2c_adapter *adap, int i = 0, ret = 0, stop = 0; if (i2c->suspended) { - dev_err(i2c->dev, "HS-I2C is not initialzed.\n"); + dev_err(i2c->dev, "HS-I2C is not initialized.\n"); return -EIO; } diff --git a/drivers/iio/adc/twl6030-gpadc.c b/drivers/iio/adc/twl6030-gpadc.c index 53e1c645cee..53a24ebb92c 100644 --- a/drivers/iio/adc/twl6030-gpadc.c +++ b/drivers/iio/adc/twl6030-gpadc.c @@ -969,7 +969,7 @@ static int twl6030_gpadc_suspend(struct device *pdev) ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, TWL6030_GPADCR, TWL6030_REG_TOGGLE1); if (ret) - dev_err(pdev, "error reseting GPADC (%d)!\n", ret); + dev_err(pdev, "error resetting GPADC (%d)!\n", ret); return 0; }; diff --git a/drivers/isdn/hisax/hfc4s8s_l1.c b/drivers/isdn/hisax/hfc4s8s_l1.c index c49c294fc81..414dbf6da89 100644 --- a/drivers/isdn/hisax/hfc4s8s_l1.c +++ b/drivers/isdn/hisax/hfc4s8s_l1.c @@ -1620,7 +1620,7 @@ hfc4s8s_probe(struct pci_dev *pdev, const struct pci_device_id *ent) #else if (!request_region(hw->iobase, 8, hw->card_name)) { printk(KERN_INFO - "HFC-4S/8S: failed to rquest address space at 0x%04x\n", + "HFC-4S/8S: failed to request address space at 0x%04x\n", hw->iobase); goto out; } diff --git a/drivers/media/platform/ti-vpe/vpdma.c b/drivers/media/platform/ti-vpe/vpdma.c index af0a5ffcaa9..fcbe48a09cf 100644 --- a/drivers/media/platform/ti-vpe/vpdma.c +++ b/drivers/media/platform/ti-vpe/vpdma.c @@ -577,8 +577,8 @@ static void dump_dtd(struct vpdma_dtd *dtd) pr_debug("word5: max_width %d, max_height %d\n", dtd_get_max_width(dtd), 
dtd_get_max_height(dtd)); - pr_debug("word6: client specfic attr0 = 0x%08x\n", dtd->client_attr0); - pr_debug("word7: client specfic attr1 = 0x%08x\n", dtd->client_attr1); + pr_debug("word6: client specific attr0 = 0x%08x\n", dtd->client_attr0); + pr_debug("word7: client specific attr1 = 0x%08x\n", dtd->client_attr1); } /* diff --git a/drivers/net/wireless/libertas/cmdresp.c b/drivers/net/wireless/libertas/cmdresp.c index 178b222b3ce..65f18f1e869 100644 --- a/drivers/net/wireless/libertas/cmdresp.c +++ b/drivers/net/wireless/libertas/cmdresp.c @@ -248,7 +248,7 @@ int lbs_process_event(struct lbs_private *priv, u32 event) /* handle unexpected PS SLEEP event */ if (priv->psstate == PS_STATE_FULL_POWER) { lbs_deb_cmd( - "EVENT: in FULL POWER mode, ignoreing PS_SLEEP\n"); + "EVENT: in FULL POWER mode, ignoring PS_SLEEP\n"); break; } priv->psstate = PS_STATE_PRE_SLEEP; diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index 025f14e30ee..77b890e4d29 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c @@ -1624,7 +1624,7 @@ static int dlfb_usb_probe(struct usb_interface *interface, } if (pixel_limit) { - pr_warn("DL chip limit of %d overriden" + pr_warn("DL chip limit of %d overridden" " by module param to %d\n", dev->sku_pixel_limit, pixel_limit); dev->sku_pixel_limit = pixel_limit; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 6fc82010dc1..790f118c327 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache) ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); if (ret) { - test_msg("Error removing middle peice %d\n", ret); + test_msg("Error removing middle piece %d\n", ret); return ret; } diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 07bbc24fb4c..8b362e802d2 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -563,7 +563,7 @@ static void digital_tg_recv_psl_req(struct nfc_digital_dev *ddev, void *arg, rf_tech = NFC_DIGITAL_RF_TECH_424F; break; default: - pr_err("Unsuported dsi value %d\n", dsi); + pr_err("Unsupported dsi value %d\n", dsi); goto exit; } -- cgit v1.2.3-70-g09d2 From 9a7f143ab529352ebef13d3f0f4a09f13efa9435 Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Fri, 15 Nov 2013 10:42:51 +0900 Subject: f2fs: introduce __find_rev_next(_zero)_bit When f2fs_set_bit is used, in a byte MSB and LSB is reversed, in that case we can use __find_rev_next_bit or __find_rev_next_zero_bit. Signed-off-by: Changman Lee [Jaegeuk Kim: change the function names] Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa284d39719..aa1d30d7671 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,6 +20,154 @@ #include "node.h" #include +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. 
+ */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } +#endif + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * Example: + * LSB <--> MSB + * f2fs_set_bit(0, bitmap) => 0000 0001 + * f2fs_set_bit(7, bitmap) => 1000 0000 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~0UL << quot; + submask = (unsigned char)(0xff << rest) >> rest; + submask <<= quot; + mask &= submask; + tmp &= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG-1)) { + tmp = *(p++); + if (tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~(~0UL << quot); + submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); + submask <<= quot; + mask += submask; + tmp |= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (~tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG - 1)) { + tmp = *(p++); + if (~tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp |= ~0UL << size; + if (tmp == ~0UL) /* Are any bits zero? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffz(tmp); +} + /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. -- cgit v1.2.3-70-g09d2 From e81c93cf8c7bd413898798cf8c67f18b1fef3360 Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Fri, 15 Nov 2013 13:21:16 +0900 Subject: f2fs: improve searching speed of __next_free_blkoff To find a zero bit using the result of OR operation between ckpt_valid_map and cur_valid_map is more fast than find a zero bit in each bitmap. 
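A hypothetical 8-block illustration of the idea (bit 0 shown leftmost, matching f2fs's reversed in-byte order); the bit patterns are made up, only the OR and the reverse zero-bit search come from the patch:

	ckpt_valid_map: 1100 0110
	cur_valid_map:  1010 0010
	target_map:     1110 0110   (ckpt_valid_map | cur_valid_map)

	__find_rev_next_zero_bit(target_map, 8, 0) returns 3, the first block that is
	free in both views, found with a single word-at-a-time scan instead of two
	f2fs_test_bit() calls per block offset.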
Signed-off-by: Changman Lee [Jaegeuk Kim: adjust changed function name] Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aa1d30d7671..67f1e5bbdac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -607,13 +607,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg, block_t start) { struct seg_entry *se = get_seg_entry(sbi, seg->segno); - block_t ofs; - for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { - if (!f2fs_test_bit(ofs, se->ckpt_valid_map) - && !f2fs_test_bit(ofs, se->cur_valid_map)) - break; - } - seg->next_blkoff = ofs; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long target_map[entries]; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + + seg->next_blkoff = pos; } /* -- cgit v1.2.3-70-g09d2 From 7fd9e544fbb10c6ae4b4953f6063560c8eeae6e8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Nov 2013 13:55:58 +0900 Subject: f2fs: add a slab cache entry for small discards This patch adds a slab cache entry for small discards. Each entry consists of: struct discard_entry { struct list_head list; /* list head */ block_t blkaddr; /* block address to be discarded */ int len; /* # of consecutive blocks of the discard */ }; Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 ++++++++++++++ fs/f2fs/segment.c | 20 ++++++++++++++++++++ fs/f2fs/super.c | 7 ++++++- 3 files changed, 40 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 89dc7508faf..c73e3dff6f1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -97,6 +97,13 @@ struct dir_inode_entry { struct inode *inode; /* vfs inode pointer */ }; +/* for the list of blockaddresses to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t blkaddr; /* block address to be discarded */ + int len; /* # of consecutive blocks of the discard */ +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -308,6 +315,11 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; + + /* for small discard management */ + struct list_head discard_list; /* 4KB discard list */ + int nr_discards; /* # of discards in the list */ + int max_discards; /* max. discards to be issued */ }; /* @@ -1079,6 +1091,8 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *, void flush_sit_entries(struct f2fs_sb_info *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); +int __init create_segment_manager_caches(void); +void destroy_segment_manager_caches(void); /* * checkpoint.c diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 67f1e5bbdac..823526ec524 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -22,6 +22,8 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) +static struct kmem_cache *discard_entry_slab; + /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since * MSB and LSB are reversed in a byte by f2fs_set_bit. 
@@ -1798,6 +1800,10 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; + INIT_LIST_HEAD(&sm_info->discard_list); + sm_info->nr_discards = 0; + sm_info->max_discards = 0; + err = build_sit_info(sbi); if (err) return err; @@ -1913,3 +1919,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) sbi->sm_info = NULL; kfree(sm_info); } + +int __init create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + sizeof(struct discard_entry), NULL); + if (!discard_entry_slab) + return -ENOMEM; + return 0; +} + +void destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(discard_entry_slab); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bafff72de8e..e9aa3f79f42 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1078,9 +1078,12 @@ static int __init init_f2fs_fs(void) err = create_node_manager_caches(); if (err) goto free_inodecache; - err = create_gc_caches(); + err = create_segment_manager_caches(); if (err) goto free_node_manager_caches; + err = create_gc_caches(); + if (err) + goto free_segment_manager_caches; err = create_checkpoint_caches(); if (err) goto free_gc_caches; @@ -1102,6 +1105,8 @@ free_checkpoint_caches: destroy_checkpoint_caches(); free_gc_caches: destroy_gc_caches(); +free_segment_manager_caches: + destroy_segment_manager_caches(); free_node_manager_caches: destroy_node_manager_caches(); free_inodecache: -- cgit v1.2.3-70-g09d2 From b29555505d81e496fdbd125190c8043f3c09a83c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Nov 2013 14:49:56 +0900 Subject: f2fs: add key functions for small discards This patch adds key functions to activate the small discard feature. Note that this procedure is conducted during the checkpoint only. In flush_sit_entries(), when a new dirty sit entry is flushed, f2fs calls add_discard_addrs() which searches candidates to be discarded. The candidates should be marked *invalidated* and also previous checkpoint recognizes it as *valid*. At the end of a checkpoint procedure, f2fs throws discards based on the discard entry list. 
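A hypothetical per-segment illustration of the candidate selection done by add_discard_addrs() (bit 0 shown leftmost); the bit patterns are made up, the formula is the one used by the patch:

	ckpt_valid_map:        1111 0011   (valid at the last checkpoint)
	cur_valid_map:         1001 0011   (valid now)
	(cur ^ ckpt) & ckpt:   0110 0000   -> blocks 1-2 were freed since the last
	                                      checkpoint and become discard candidates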
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 823526ec524..505a8894cfa 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -266,6 +266,47 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static void add_discard_addrs(struct f2fs_sb_info *sbi, + unsigned int segno, struct seg_entry *se) +{ + struct list_head *head = &SM_I(sbi)->discard_list; + struct discard_entry *new; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + int max_blocks = sbi->blocks_per_seg; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long dmap[entries]; + unsigned int start = 0, end = -1; + int i; + + if (!test_opt(sbi, DISCARD)) + return; + + /* zero block will be discarded through the prefree list */ + if (!se->valid_blocks || se->valid_blocks == max_blocks) + return; + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + start = __find_rev_next_bit(dmap, max_blocks, end + 1); + if (start >= max_blocks) + break; + + end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, segno) + start; + new->len = end - start; + + list_add_tail(&new->list, head); + SM_I(sbi)->nr_discards += end - start; + } +} + /* * Should call clear_prefree_segments after checkpoint is done. */ @@ -288,6 +329,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { + struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *this, *next; + struct discard_entry *entry; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); @@ -318,6 +362,18 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) GFP_NOFS, 0); } mutex_unlock(&dirty_i->seglist_lock); + + /* send small discards */ + list_for_each_safe(this, next, head) { + entry = list_entry(this, struct discard_entry, list); + blkdev_issue_discard(sbi->sb->s_bdev, + entry->blkaddr << sbi->log_sectors_per_block, + (1 << sbi->log_sectors_per_block) * entry->len, + GFP_NOFS, 0); + list_del(&entry->list); + SM_I(sbi)->nr_discards -= entry->len; + kmem_cache_free(discard_entry_slab, entry); + } } static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -1469,6 +1525,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + /* add discard candidates */ + if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) + add_discard_addrs(sbi, segno, se); + if (flushed) goto to_sit_page; -- cgit v1.2.3-70-g09d2 From 7ac8c3b051ed3a3e71df72a7ffc44e14cc5c2bae Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Nov 2013 15:00:38 +0900 Subject: f2fs: add a sysfs entry to control max_discards If frequent small discards are issued to the device, the performance would be degraded significantly. So, this patch adds a sysfs entry to control the number of discards to be issued during a checkpoint procedure. 
By default, f2fs does not issue any small discards, which means max_discards is zero. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e9aa3f79f42..a022412c66e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -175,6 +175,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -183,6 +184,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), NULL, }; -- cgit v1.2.3-70-g09d2 From 3720887910864467a61cd0d64bad3965009cdef8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Nov 2013 16:55:17 +0900 Subject: f2fs: introduce f2fs_issue_discard() to clean up Change log from v1: o fix 32bit drops reported by Dan Carpenter This patch adds f2fs_issue_discard() to clean up blkdev_issue_discard() flows. Dan carpenter reported: "block_t is a 32 bit type and sector_t is a 64 bit type. The upper 32 bits of the sector_t are not used because the shift will wrap." Bug-Reported-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 505a8894cfa..c51fa4bee60 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -266,6 +266,14 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static void f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = ((sector_t)blkstart) << sbi->log_sectors_per_block; + sector_t len = ((sector_t)blklen) << sbi->log_sectors_per_block; + blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); +} + static void add_discard_addrs(struct f2fs_sb_info *sbi, unsigned int segno, struct seg_entry *se) { @@ -354,22 +362,15 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) if (!test_opt(sbi, DISCARD)) continue; - blkdev_issue_discard(sbi->sb->s_bdev, - START_BLOCK(sbi, start) << - sbi->log_sectors_per_block, - (1 << (sbi->log_sectors_per_block + - sbi->log_blocks_per_seg)) * (end - start), - GFP_NOFS, 0); + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + (end - start) << sbi->log_blocks_per_seg); } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_safe(this, next, head) { entry = list_entry(this, struct discard_entry, list); - blkdev_issue_discard(sbi->sb->s_bdev, - entry->blkaddr << sbi->log_sectors_per_block, - (1 << sbi->log_sectors_per_block) * entry->len, - GFP_NOFS, 0); + f2fs_issue_discard(sbi, entry->blkaddr, entry->len); list_del(&entry->list); SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); -- cgit v1.2.3-70-g09d2 From 1661d07c2d5e6486cab1c189cfb65ff60abf9b92 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Nov 2013 17:01:00 +0900 Subject: f2fs: add a tracepoint for f2fs_issue_discard This patch adds a tracepoint for f2fs_issue_discard. 
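As a side note on the 32-bit truncation that the f2fs_issue_discard() helper above avoids: shifting a block address while it is still held in a 32-bit type drops the upper bits of the resulting sector number, which is exactly the wrap Dan Carpenter reported. An illustrative userspace sketch (made-up values, not the kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t blkaddr = 0x40000000u;          /* an illustrative block number */
        unsigned log_sectors_per_block = 3;      /* 4KB block = 8 x 512B sectors  */

        uint32_t wrong = blkaddr << log_sectors_per_block;          /* wraps     */
        uint64_t right = (uint64_t)blkaddr << log_sectors_per_block;

        printf("32-bit shift: 0x%08x\n", wrong);                    /* 0x00000000 */
        printf("64-bit shift: 0x%016llx\n", (unsigned long long)right);
        return 0;
}

Casting to the 64-bit sector type before the shift, as the helper does, keeps the full value.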
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + include/trace/events/f2fs.h | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c51fa4bee60..cfc0eb4f24d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,6 +272,7 @@ static void f2fs_issue_discard(struct f2fs_sb_info *sbi, sector_t start = ((sector_t)blkstart) << sbi->log_sectors_per_block; sector_t len = ((sector_t)blklen) << sbi->log_sectors_per_block; blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); } static void add_discard_addrs(struct f2fs_sb_info *sbi, diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index e0dc355fa31..47ee70de7fe 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -727,6 +727,29 @@ TRACE_EVENT(f2fs_write_checkpoint, __entry->msg) ); +TRACE_EVENT(f2fs_issue_discard, + + TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen), + + TP_ARGS(sb, blkstart, blklen), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + __field(block_t, blklen) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blkstart = blkstart; + __entry->blklen = blklen; + ), + + TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", + show_dev(__entry), + (unsigned long long)__entry->blkstart, + (unsigned long long)__entry->blklen) +); #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ -- cgit v1.2.3-70-g09d2 From 75c3c8bc88d573d6823fcf2576f5c47e4fc41710 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 16 Nov 2013 14:15:59 +0800 Subject: f2fs: use f2fs_put_page to release page for uniform style We should use f2fs_put_page to release page for uniform style of f2fs code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa3438c571f..076a60cb8db 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -714,8 +714,7 @@ static int f2fs_write_end(struct file *file, update_inode_page(inode); } - unlock_page(page); - page_cache_release(page); + f2fs_put_page(page, 1); return copied; } -- cgit v1.2.3-70-g09d2 From 7d5e510944ce60ef0c6c2300a58547679df76db7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 18 Nov 2013 17:13:35 +0900 Subject: f2fs: clean up the do_submit_bio flow This patch introduces PAGE_TYPE_OF_BIO() and cleans up do_submit_bio() with it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 39 +++++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c73e3dff6f1..38963b4b500 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -350,6 +350,7 @@ enum count_type { * with waiting the bio's completion * ... Only can be used with META. */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { DATA, NODE, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cfc0eb4f24d..d5043bdaa1a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -835,32 +835,35 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) { int rw = sync ? WRITE_SYNC : WRITE; - enum page_type btype = type > META ? 
META : type; + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct bio *bio = sbi->bio[btype]; + struct bio_private *p; + + if (!bio) + return; + + sbi->bio[btype] = NULL; if (type >= META_FLUSH) rw = WRITE_FLUSH_FUA; - if (btype == META) rw |= REQ_META; - if (sbi->bio[btype]) { - struct bio_private *p = sbi->bio[btype]->bi_private; - p->sbi = sbi; - sbi->bio[btype]->bi_end_io = f2fs_end_io_write; + p = bio->bi_private; + p->sbi = sbi; + bio->bi_end_io = f2fs_end_io_write; - trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]); + trace_f2fs_do_submit_bio(sbi->sb, btype, sync, bio); - if (type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - p->is_sync = true; - p->wait = &wait; - submit_bio(rw, sbi->bio[btype]); - wait_for_completion(&wait); - } else { - p->is_sync = false; - submit_bio(rw, sbi->bio[btype]); - } - sbi->bio[btype] = NULL; + if (type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + p->is_sync = true; + p->wait = &wait; + submit_bio(rw, bio); + wait_for_completion(&wait); + } else { + p->is_sync = false; + submit_bio(rw, bio); } } -- cgit v1.2.3-70-g09d2 From 971767caf632190f77a40b4011c19948232eed75 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 18 Nov 2013 17:16:17 +0900 Subject: f2fs: use sbi->write_mutex for write bios This patch removes an unnecessary semaphore (i.e., sbi->bio_sem). There is no reason to use the semaphore when f2fs submits read and write IOs. Instead, let's use a write mutex and cover the sbi->bio[] by the lock. Change log from v1: o split write_mutex suggested by Chao Yu Chao described, "All DATA/NODE/META bio buffers in superblock is protected by 'sbi->write_mutex', but each bio buffer area is independent, So we should split write_mutex to three for DATA/NODE/META." Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 13 +++++++++---- fs/f2fs/super.c | 6 +++++- 4 files changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 076a60cb8db..5920639ca37 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -383,8 +383,6 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, trace_f2fs_readpage(page, blk_addr, type); - down_read(&sbi->bio_sem); - /* Allocate a new bio */ bio = f2fs_bio_alloc(bdev, 1); @@ -394,13 +392,11 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { bio_put(bio); - up_read(&sbi->bio_sem); f2fs_put_page(page, 1); return -EFAULT; } submit_bio(type, bio); - up_read(&sbi->bio_sem); return 0; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 38963b4b500..c2de5491972 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -374,7 +374,7 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ - struct rw_semaphore bio_sem; /* IO semaphore */ + struct mutex write_mutex[NR_PAGE_TYPE]; /* mutex for writing IOs */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d5043bdaa1a..68b8a2bdf6d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -869,9 +869,14 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) { - down_write(&sbi->bio_sem); + enum page_type btype = PAGE_TYPE_OF_BIO(type); + + if 
(!sbi->bio[btype]) + return; + + mutex_lock(&sbi->write_mutex[btype]); do_submit_bio(sbi, type, sync); - up_write(&sbi->bio_sem); + mutex_unlock(&sbi->write_mutex[btype]); } static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, @@ -882,7 +887,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, verify_block_addr(sbi, blk_addr); - down_write(&sbi->bio_sem); + mutex_lock(&sbi->write_mutex[type]); inc_page_count(sbi, F2FS_WRITEBACK); @@ -917,7 +922,7 @@ retry: sbi->last_block_in_bio[type] = blk_addr; - up_write(&sbi->bio_sem); + mutex_unlock(&sbi->write_mutex[type]); trace_f2fs_submit_write_page(page, blk_addr, type); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a022412c66e..e194578b359 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -820,6 +820,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct buffer_head *raw_super_buf; struct inode *root; long err = -EINVAL; + int i; /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); @@ -876,7 +877,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->node_write); sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->bio_sem); + + for (i = 0; i < NR_PAGE_TYPE; i++) + mutex_init(&sbi->write_mutex[i]); + init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); -- cgit v1.2.3-70-g09d2 From c11abd1a8075e428ecf5303359513b48193b29cd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 19 Nov 2013 10:41:54 +0900 Subject: f2fs: disable the extent cache ops on high fragmented files The f2fs manages an extent cache to search a number of consecutive data blocks very quickly. However it conducts unnecessary cache operations if the file is highly fragmented with no valid extent cache. In such the case, we don't need to handle the extent cache, but just can disable the cache facility. Nevertheless, this patch gives one more chance to enable the extent cache. For example, 1. create a file 2. write data sequentially which produces a large valid extent cache 3. update some data, resulting in a fragmented extent 4. if the fragmented extent is too small, then drop extent cache 5. close the file 6. open the file again 7. give another chance to make a new extent cache 8. write data sequentially again which creates another big extent cache. ... 
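A toy model of the drop policy described above (purely illustrative, not the f2fs implementation): once the cached extent shrinks below F2FS_MIN_EXTENT_LEN blocks it is emptied and a per-inode flag makes later lookups and updates skip the cache entirely; per the commit message, reopening the file is what gives the cache another chance.

#include <stdbool.h>
#include <stdio.h>

#define MIN_EXTENT_LEN 16        /* mirrors F2FS_MIN_EXTENT_LEN in the patch */

struct toy_extent {
        unsigned int fofs;       /* start file offset, in blocks */
        unsigned int len;        /* cached run length, in blocks */
        bool no_extent;          /* mirrors FI_NO_EXTENT         */
};

/* called after an update fragments the cached run down to 'new_len' blocks */
static void shrink_extent(struct toy_extent *ext, unsigned int new_len)
{
        ext->len = new_len;
        if (ext->len < MIN_EXTENT_LEN) {
                ext->len = 0;
                ext->no_extent = true;   /* stop maintaining the cache */
        }
}

int main(void)
{
        struct toy_extent ext = { .fofs = 0, .len = 1024, .no_extent = false };

        shrink_extent(&ext, 64);         /* still useful, cache kept      */
        shrink_extent(&ext, 8);          /* too fragmented, cache dropped */

        printf("len=%u no_extent=%d\n", ext.len, ext.no_extent);
        return 0;
}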
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 22 ++++++++++++++++++---- fs/f2fs/f2fs.h | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5920639ca37..2e54522a806 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -71,6 +71,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, pgoff_t start_fofs, end_fofs; block_t start_blkaddr; + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return 0; + read_lock(&fi->ext.ext_lock); if (fi->ext.len == 0) { read_unlock(&fi->ext.ext_lock); @@ -109,6 +112,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs, start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; + int need_update = true; f2fs_bug_on(blk_addr == NEW_ADDR); fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + @@ -117,6 +121,9 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + write_lock(&fi->ext.ext_lock); start_fofs = fi->ext.fofs; @@ -163,14 +170,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) fofs - start_fofs + 1; fi->ext.len -= fofs - start_fofs + 1; } - goto end_update; + } else { + need_update = false; } - write_unlock(&fi->ext.ext_lock); - return; + /* Finally, if the extent is very fragmented, let's drop the cache. */ + if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { + fi->ext.len = 0; + set_inode_flag(fi, FI_NO_EXTENT); + need_update = true; + } end_update: write_unlock(&fi->ext.ext_lock); - sync_inode_page(dn); + if (need_update) + sync_inode_page(dn); + return; } struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c2de5491972..e9038bbeee9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -169,6 +169,8 @@ enum { #define F2FS_LINK_MAX 32000 /* maximum link count per file */ /* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */ + struct extent_info { rwlock_t ext_lock; /* rwlock for consistency */ unsigned int fofs; /* start offset in a file */ @@ -889,6 +891,7 @@ enum { FI_NO_ALLOC, /* should not allocate any blocks */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ + FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ }; -- cgit v1.2.3-70-g09d2 From 1ff7bd3bb5f7f57bc7418ee6ed12f3ae217e4e9c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 19 Nov 2013 12:47:22 +0900 Subject: f2fs: introduce a bio array for per-page write bios The f2fs has three bio types, NODE, DATA, and META, and manages some data structures per each bio types. The codes are a little bit messy, thus, this patch introduces a bio array which groups individual data structures as follows. struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct mutex io_mutex; /* mutex for bio */ }; struct f2fs_sb_info { ... struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ ... }; The code changes from this new data structure are trivial. 
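The merging rule this structure supports is the same one used throughout these patches: keep appending pages to the open bio while the block addresses stay contiguous, otherwise submit it and start a new one. A toy model of that rule (illustrative only, not kernel code):

#include <stdio.h>

struct toy_io {
        int open;                    /* is a bio currently being built? */
        unsigned long last_block;    /* last block added to it          */
        unsigned long nr_pages;      /* pages merged so far             */
};

static void toy_submit(struct toy_io *io)
{
        if (io->open)
                printf("submit bio with %lu page(s)\n", io->nr_pages);
        io->open = 0;
        io->nr_pages = 0;
}

static void toy_add_page(struct toy_io *io, unsigned long blk)
{
        if (io->open && io->last_block + 1 != blk)
                toy_submit(io);          /* not contiguous: flush first */
        io->open = 1;
        io->last_block = blk;
        io->nr_pages++;
}

int main(void)
{
        struct toy_io io = { 0 };
        unsigned long blocks[] = { 100, 101, 102, 200, 201 };

        for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
                toy_add_page(&io, blocks[i]);
        toy_submit(&io);                 /* prints a 3-page bio, then a 2-page bio */
        return 0;
}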
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 +++++++++--- fs/f2fs/segment.c | 44 ++++++++++++++++++++++---------------------- fs/f2fs/super.c | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e9038bbeee9..05f8fe1399a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -361,6 +361,12 @@ enum page_type { META_FLUSH, }; +struct f2fs_bio_info { + struct bio *bio; /* bios to merge */ + sector_t last_block_in_bio; /* last block number */ + struct mutex io_mutex; /* mutex for bio */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -374,9 +380,9 @@ struct f2fs_sb_info { /* for segment-related operations */ struct f2fs_sm_info *sm_info; /* segment manager */ - struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ - sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ - struct mutex write_mutex[NR_PAGE_TYPE]; /* mutex for writing IOs */ + + /* for bio operations */ + struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 68b8a2bdf6d..38c1a89f54c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -836,65 +836,65 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, { int rw = sync ? WRITE_SYNC : WRITE; enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct bio *bio = sbi->bio[btype]; + struct f2fs_bio_info *io = &sbi->write_io[btype]; struct bio_private *p; - if (!bio) + if (!io->bio) return; - sbi->bio[btype] = NULL; - if (type >= META_FLUSH) rw = WRITE_FLUSH_FUA; if (btype == META) rw |= REQ_META; - p = bio->bi_private; + p = io->bio->bi_private; p->sbi = sbi; - bio->bi_end_io = f2fs_end_io_write; + io->bio->bi_end_io = f2fs_end_io_write; - trace_f2fs_do_submit_bio(sbi->sb, btype, sync, bio); + trace_f2fs_do_submit_bio(sbi->sb, btype, sync, io->bio); if (type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); p->is_sync = true; p->wait = &wait; - submit_bio(rw, bio); + submit_bio(rw, io->bio); wait_for_completion(&wait); } else { p->is_sync = false; - submit_bio(rw, bio); + submit_bio(rw, io->bio); } + io->bio = NULL; } void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) { - enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[PAGE_TYPE_OF_BIO(type)]; - if (!sbi->bio[btype]) + if (!io->bio) return; - mutex_lock(&sbi->write_mutex[btype]); + mutex_lock(&io->io_mutex); do_submit_bio(sbi, type, sync); - mutex_unlock(&sbi->write_mutex[btype]); + mutex_unlock(&io->io_mutex); } static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, block_t blk_addr, enum page_type type) { struct block_device *bdev = sbi->sb->s_bdev; + struct f2fs_bio_info *io = &sbi->write_io[type]; int bio_blocks; verify_block_addr(sbi, blk_addr); - mutex_lock(&sbi->write_mutex[type]); + mutex_lock(&io->io_mutex); inc_page_count(sbi, F2FS_WRITEBACK); - if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) + if (io->bio && io->last_block_in_bio != blk_addr - 1) do_submit_bio(sbi, type, false); alloc_new: - if (sbi->bio[type] == NULL) { + if (io->bio == NULL) { struct bio_private *priv; retry: priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); @@ -904,9 +904,9 @@ retry: } bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks); - sbi->bio[type]->bi_sector = 
SECTOR_FROM_BLOCK(sbi, blk_addr); - sbi->bio[type]->bi_private = priv; + io->bio = f2fs_bio_alloc(bdev, bio_blocks); + io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + io->bio->bi_private = priv; /* * The end_io will be assigned at the sumbission phase. * Until then, let bio_add_page() merge consecutive IOs as much @@ -914,15 +914,15 @@ retry: */ } - if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { do_submit_bio(sbi, type, false); goto alloc_new; } - sbi->last_block_in_bio[type] = blk_addr; + io->last_block_in_bio = blk_addr; - mutex_unlock(&sbi->write_mutex[type]); + mutex_unlock(&io->io_mutex); trace_f2fs_submit_write_page(page, blk_addr, type); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e194578b359..8c677e958de 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -879,7 +879,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->stat_lock); for (i = 0; i < NR_PAGE_TYPE; i++) - mutex_init(&sbi->write_mutex[i]); + mutex_init(&sbi->write_io[i].io_mutex); init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); -- cgit v1.2.3-70-g09d2 From 58e674d6ab8b23ff1e933a8b714b302786351bd7 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 19 Nov 2013 18:03:18 +0800 Subject: f2fs: convert remove_inode_page to void Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 05f8fe1399a..707896456a4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1049,7 +1049,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); +void remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4ac4150d421..e7b6826beac 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -803,29 +803,25 @@ int truncate_xattr_node(struct inode *inode, struct page *page) * Caller should grab and release a mutex by calling mutex_lock_op() and * mutex_unlock_op(). 
*/ -int remove_inode_page(struct inode *inode) +void remove_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; nid_t ino = inode->i_ino; struct dnode_of_data dn; - int err; page = get_node_page(sbi, ino); if (IS_ERR(page)) - return PTR_ERR(page); + return; - err = truncate_xattr_node(inode, page); - if (err) { + if (truncate_xattr_node(inode, page)) { f2fs_put_page(page, 1); - return err; + return; } - /* 0 is possible, after f2fs_new_inode() is failed */ f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); - return 0; } struct page *new_inode_page(struct inode *inode, const struct qstr *name) -- cgit v1.2.3-70-g09d2 From da19b0dc5080db4e9138d4f5bc47d5fa9ce81c98 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 19 Nov 2013 18:03:27 +0800 Subject: f2fs: convert dev_valid_block_count to void Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 707896456a4..154f07604c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -606,7 +606,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, return true; } -static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { @@ -616,7 +616,6 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - return 0; } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) -- cgit v1.2.3-70-g09d2 From ef86d70994b57cf8095d436da76c6684c83ef0b2 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 19 Nov 2013 18:03:38 +0800 Subject: f2fs: convert inc/dec_valid_node_count to inc/dec one count Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 32 +++++++++++++++----------------- fs/f2fs/node.c | 6 +++--- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 154f07604c1..b665ce492e1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -706,50 +706,48 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) } static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { block_t valid_block_count; unsigned int valid_node_count; spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + (block_t)count; - sbi->alloc_valid_block_count += (block_t)count; - valid_node_count = sbi->total_valid_node_count + count; - + valid_block_count = sbi->total_valid_block_count + 1; if (valid_block_count > sbi->user_block_count) { spin_unlock(&sbi->stat_lock); return false; } + valid_node_count = sbi->total_valid_node_count + 1; if (valid_node_count > sbi->total_node_count) { spin_unlock(&sbi->stat_lock); return false; } if (inode) - inode->i_blocks += count; - sbi->total_valid_node_count = valid_node_count; - sbi->total_valid_block_count = valid_block_count; + inode->i_blocks++; + + sbi->alloc_valid_block_count++; + sbi->total_valid_node_count++; + sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); return true; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { spin_lock(&sbi->stat_lock); 
- f2fs_bug_on(sbi->total_valid_block_count < count); - f2fs_bug_on(sbi->total_valid_node_count < count); - f2fs_bug_on(inode->i_blocks < count); + f2fs_bug_on(!sbi->total_valid_block_count); + f2fs_bug_on(!sbi->total_valid_node_count); + f2fs_bug_on(!inode->i_blocks); - inode->i_blocks -= count; - sbi->total_valid_node_count -= count; - sbi->total_valid_block_count -= (block_t)count; + inode->i_blocks--; + sbi->total_valid_node_count--; + sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e7b6826beac..b843a5b3d5e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -502,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn) /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode, 1); + dec_valid_node_count(sbi, dn->inode); set_node_addr(sbi, &ni, NULL_ADDR); if (dn->nid == dn->inode->i_ino) { @@ -851,7 +851,7 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (!inc_valid_node_count(sbi, dn->inode, 1)) { + if (!inc_valid_node_count(sbi, dn->inode)) { err = -ENOSPC; goto fail; } @@ -1560,7 +1560,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) new_ni = old_ni; new_ni.ino = ino; - if (!inc_valid_node_count(sbi, NULL, 1)) + if (!inc_valid_node_count(sbi, NULL)) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR); inc_valid_inode_count(sbi); -- cgit v1.2.3-70-g09d2 From 502c6e0bcd95d45f734548e2fcf8a0de494a9095 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 19 Nov 2013 18:03:58 +0800 Subject: f2fs: simplify write_orphan_inodes for better readable Simplify write_orphan_inodes for better readable. Because we hold the orphan_inode_mutex, so it's safe to use list_for_each_entry instead of list_for_each_safe. 
Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5716e5eb4e8..247879f0e73 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -300,12 +300,13 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { - struct list_head *head, *this, *next; + struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; struct page *page = NULL; unsigned int nentries = 0; unsigned short index = 1; unsigned short orphan_blocks; + struct orphan_inode_entry *orphan = NULL; orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); @@ -314,12 +315,17 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) head = &sbi->orphan_inode_list; /* loop for each orphan inode entry and write them in Jornal block */ - list_for_each_safe(this, next, head) { - struct orphan_inode_entry *orphan; + list_for_each_entry(orphan, head, list) { + if (!page) { + page = grab_meta_page(sbi, start_blk); + orphan_blk = + (struct f2fs_orphan_block *)page_address(page); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + } - orphan = list_entry(this, struct orphan_inode_entry, list); + orphan_blk->ino[nentries] = cpu_to_le32(orphan->ino); - if (nentries == F2FS_ORPHANS_PER_BLOCK) { + if (nentries++ == F2FS_ORPHANS_PER_BLOCK) { /* * an orphan block is full of 1020 entries, * then we need to flush current orphan blocks @@ -335,24 +341,16 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) nentries = 0; page = NULL; } - if (page) - goto page_exist; + } - page = grab_meta_page(sbi, start_blk); - orphan_blk = (struct f2fs_orphan_block *)page_address(page); - memset(orphan_blk, 0, sizeof(*orphan_blk)); -page_exist: - orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + if (page) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); } - if (!page) - goto end; - - orphan_blk->blk_addr = cpu_to_le16(index); - orphan_blk->blk_count = cpu_to_le16(orphan_blocks); - orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); -end: + mutex_unlock(&sbi->orphan_inode_mutex); } -- cgit v1.2.3-70-g09d2 From ce3b7d80edad7bc5ff347b9ff02f1484265b1f05 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 19 Nov 2013 18:03:47 +0800 Subject: f2fs: move the list_head initialization into the lock protection region Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 247879f0e73..eae8dc3e1c0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -511,8 +511,8 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + + struct list_head *this, *head; if (!S_ISDIR(inode->i_mode)) return; @@ -523,6 +523,7 @@ void remove_dirty_dir_inode(struct inode *inode) return; } + head = &sbi->dir_inode_list; list_for_each(this, head) { struct dir_inode_entry 
*entry; entry = list_entry(this, struct dir_inode_entry, list); @@ -544,11 +545,13 @@ void remove_dirty_dir_inode(struct inode *inode) struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + + struct list_head *this, *head; struct inode *inode = NULL; spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; list_for_each(this, head) { struct dir_inode_entry *entry; entry = list_entry(this, struct dir_inode_entry, list); @@ -563,11 +566,13 @@ struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) { - struct list_head *head = &sbi->dir_inode_list; + struct list_head *head; struct dir_inode_entry *entry; struct inode *inode; retry: spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; if (list_empty(head)) { spin_unlock(&sbi->dir_inode_lock); return; -- cgit v1.2.3-70-g09d2 From 924b720b589f91311657216c97edbb3337449270 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 20 Nov 2013 14:46:39 +0800 Subject: f2fs: add a new function to support for merging contiguous read For better read performance, we add a new function to support for merging contiguous read as the one for write. v1-->v2: o add declarations here as Gu Zheng suggested. o use new structure f2fs_bio_info introduced by Jaegeuk Kim. Signed-off-by: Chao Yu Acked-by: Gu Zheng --- fs/f2fs/data.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 1 + 3 files changed, 54 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2e54522a806..ce3cbd92585 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -414,6 +414,56 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, return 0; } +void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, int rw) +{ + struct f2fs_bio_info *io = &sbi->read_io; + + if (!io->bio) + return; + + mutex_lock(&io->io_mutex); + if (io->bio) { + submit_bio(rw, io->bio); + io->bio = NULL; + } + mutex_unlock(&io->io_mutex); +} + +void submit_read_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, int rw) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct f2fs_bio_info *io = &sbi->read_io; + int bio_blocks; + + verify_block_addr(sbi, blk_addr); + + mutex_lock(&io->io_mutex); + + if (io->bio && io->last_block_in_bio != blk_addr - 1) { + submit_bio(rw, io->bio); + io->bio = NULL; + } +alloc_new: + if (io->bio == NULL) { + bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + io->bio = f2fs_bio_alloc(bdev, bio_blocks); + io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + io->bio->bi_end_io = read_end_io; + } + + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + submit_bio(rw, io->bio); + io->bio = NULL; + goto alloc_new; + } + + io->last_block_in_bio = blk_addr; + + mutex_unlock(&io->io_mutex); +} + /* * This function should be used by the data read flow only where it * does not check the "create" flag that indicates block allocation. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b665ce492e1..72718ef0d56 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -382,6 +382,7 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ + struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ /* for checkpoint */ @@ -1132,6 +1133,8 @@ struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); +void f2fs_submit_read_bio(struct f2fs_sb_info *, int); +void submit_read_page(struct f2fs_sb_info *, struct page *, block_t, int); int do_write_data_page(struct page *); /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8c677e958de..9981b28744e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -878,6 +878,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); + mutex_init(&sbi->read_io.io_mutex); for (i = 0; i < NR_PAGE_TYPE; i++) mutex_init(&sbi->write_io[i].io_mutex); -- cgit v1.2.3-70-g09d2 From 7107e0a9b131f46785b853388fb263306721a986 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 21 Nov 2013 13:54:23 +0900 Subject: f2fs: merge read IOs at ra_nat_pages() Change log from v1: o add mark_page_accessed() not to reclaim the nat pages. This patch changes the policy of submitting read bios at ra_nat_pages. Previously, f2fs submits small read bios with block plugging. But, with this patch, f2fs itself merges read bios first and then submits a large bio, which can reduce the bio handling overheads. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b843a5b3d5e..b212599b0da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -89,13 +89,10 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) { struct address_space *mapping = sbi->meta_inode->i_mapping; struct f2fs_nm_info *nm_i = NM_I(sbi); - struct blk_plug plug; struct page *page; pgoff_t index; int i; - blk_start_plug(&plug); - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { if (nid >= nm_i->max_nid) nid = 0; @@ -105,15 +102,15 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) if (!page) continue; if (PageUptodate(page)) { + mark_page_accessed(page); f2fs_put_page(page, 1); continue; } - if (f2fs_readpage(sbi, page, index, READ)) - continue; - + submit_read_page(sbi, page, index, READ_SYNC); + mark_page_accessed(page); f2fs_put_page(page, 0); } - blk_finish_plug(&plug); + f2fs_submit_read_bio(sbi, READ_SYNC); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) -- cgit v1.2.3-70-g09d2 From 61ae45c8803262a96f3af200489344f339992291 Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Thu, 21 Nov 2013 20:04:21 +0900 Subject: f2fs: simplify IS_DATASEG and IS_NODESEG macro It is not efficient comparing each segment type to find node or data. 
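The simplified range checks above only work because of how the CURSEG_* enumeration is laid out: the three data segment types come first and the three node segment types follow. The sketch below restates that assumption (taken from fs/f2fs/segment.h as it looked at the time; treat the exact comments as approximate):

/* assumed ordering that makes the range comparisons valid */
enum {
        CURSEG_HOT_DATA = 0,    /* directory entry blocks        */
        CURSEG_WARM_DATA,       /* regular data blocks           */
        CURSEG_COLD_DATA,       /* multimedia or GC-moved data   */
        CURSEG_HOT_NODE,        /* direct nodes of directories   */
        CURSEG_WARM_NODE,       /* direct nodes of regular files */
        CURSEG_COLD_NODE,       /* indirect node blocks          */
        NO_CHECK_TYPE
};

#define IS_DATASEG(t)   ((t) <= CURSEG_COLD_DATA)
#define IS_NODESEG(t)   ((t) >= CURSEG_HOT_NODE)

int main(void)
{
        /* exercises both macros; returns 0 when they classify correctly */
        return !(IS_DATASEG(CURSEG_WARM_DATA) && IS_NODESEG(CURSEG_COLD_NODE));
}

One consequence of the range form is that any value above CURSEG_COLD_NODE (such as NO_CHECK_TYPE) now also satisfies IS_NODESEG(); callers are assumed never to pass such a value.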
Signed-off-by: Changman Lee [Jaegeuk Kim: remove unnecessary white spaces] Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 269f690b4e2..38f6196493f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -20,13 +20,8 @@ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) -#define IS_DATASEG(t) \ - ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ - (t == CURSEG_WARM_DATA)) - -#define IS_NODESEG(t) \ - ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ - (t == CURSEG_WARM_NODE)) +#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ -- cgit v1.2.3-70-g09d2 From 87b8872d5b4a8f9f61123ab913aff4f6047d8b53 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 20 Nov 2013 16:40:10 +0800 Subject: f2fs: adds a tracepoint for submit_read_page This patch adds a tracepoint for submit_read_page. Signed-off-by: Chao Yu [Jaegeuk Kim: integrate tracepoints of f2fs_submit_read(_write)_page] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 + fs/f2fs/segment.c | 2 +- include/trace/events/f2fs.h | 25 +++++++++++++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ce3cbd92585..cdb342eeb42 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -462,6 +462,7 @@ alloc_new: io->last_block_in_bio = blk_addr; mutex_unlock(&io->io_mutex); + trace_f2fs_submit_read_page(page, rw, META, blk_addr); } /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 38c1a89f54c..d42426dc370 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -923,7 +923,7 @@ retry: io->last_block_in_bio = blk_addr; mutex_unlock(&io->io_mutex); - trace_f2fs_submit_write_page(page, blk_addr, type); + trace_f2fs_submit_write_page(page, WRITE, type, blk_addr); } void f2fs_wait_on_page_writeback(struct page *page, diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 47ee70de7fe..73cc5f07d1b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -674,15 +674,16 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_ARGS(page, type) ); -TRACE_EVENT(f2fs_submit_write_page, +DECLARE_EVENT_CLASS(f2fs_io_page, - TP_PROTO(struct page *page, block_t blk_addr, int type), + TP_PROTO(struct page *page, int rw, int type, block_t blk_addr), - TP_ARGS(page, blk_addr, type), + TP_ARGS(page, rw, type, blk_addr), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) + __field(int, rw) __field(int, type) __field(pgoff_t, index) __field(block_t, block) @@ -691,18 +692,34 @@ TRACE_EVENT(f2fs_submit_write_page, TP_fast_assign( __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; + __entry->rw = rw; __entry->type = type; __entry->index = page->index; __entry->block = blk_addr; ), - TP_printk("dev = (%d,%d), ino = %lu, %s, index = %lu, blkaddr = 0x%llx", + TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, blkaddr = 0x%llx", show_dev_ino(__entry), + show_bio_type(__entry->rw), show_block_type(__entry->type), (unsigned long)__entry->index, (unsigned long long)__entry->block) ); +DEFINE_EVENT(f2fs_io_page, f2fs_submit_write_page, + + TP_PROTO(struct page *page, int rw, int type, block_t blk_addr), + + TP_ARGS(page, rw, type, blk_addr) +); + 
+DEFINE_EVENT(f2fs_io_page, f2fs_submit_read_page, + + TP_PROTO(struct page *page, int rw, int type, block_t blk_addr), + + TP_ARGS(page, rw, type, blk_addr) +); + TRACE_EVENT(f2fs_write_checkpoint, TP_PROTO(struct super_block *sb, bool is_umount, char *msg), -- cgit v1.2.3-70-g09d2 From d4d288bc72c020d335868ce217695c4d5dfd74d0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 24 Nov 2013 12:36:42 +0900 Subject: f2fs: adds a tracepoint for f2fs_submit_read_bio This patch adds a tracepoint for f2fs_submit_read_bio. Signed-off-by: Chao Yu [Jaegeuk Kim: integrate tracepoints of f2fs_submit_read(_write)_bio] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ fs/f2fs/segment.c | 5 +++-- include/trace/events/f2fs.h | 49 +++++++++++++++++++++++++++++---------------- 3 files changed, 37 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdb342eeb42..711722018b8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -421,6 +421,8 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, int rw) if (!io->bio) return; + trace_f2fs_submit_read_bio(sbi->sb, rw, META, io->bio); + mutex_lock(&io->io_mutex); if (io->bio) { submit_bio(rw, io->bio); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d42426dc370..a1acaa025bd 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -844,6 +844,9 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) rw = WRITE_FLUSH_FUA; + + trace_f2fs_submit_write_bio(sbi->sb, rw, btype, io->bio); + if (btype == META) rw |= REQ_META; @@ -851,8 +854,6 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, p->sbi = sbi; io->bio->bi_end_io = f2fs_end_io_write; - trace_f2fs_do_submit_bio(sbi->sb, btype, sync, io->bio); - if (type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); p->is_sync = true; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 73cc5f07d1b..1f59f5db55e 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -18,13 +18,14 @@ #define show_bio_type(type) \ __print_symbolic(type, \ - { READ, "READ" }, \ - { READA, "READAHEAD" }, \ - { READ_SYNC, "READ_SYNC" }, \ - { WRITE, "WRITE" }, \ - { WRITE_SYNC, "WRITE_SYNC" }, \ - { WRITE_FLUSH, "WRITE_FLUSH" }, \ - { WRITE_FUA, "WRITE_FUA" }) + { READ, "READ" }, \ + { READA, "READAHEAD" }, \ + { READ_SYNC, "READ_SYNC" }, \ + { WRITE, "WRITE" }, \ + { WRITE_SYNC, "WRITE_SYNC" }, \ + { WRITE_FLUSH, "WRITE_FLUSH" }, \ + { WRITE_FUA, "WRITE_FUA" }, \ + { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" }) #define show_data_type(type) \ __print_symbolic(type, \ @@ -598,36 +599,50 @@ TRACE_EVENT(f2fs_reserve_new_block, __entry->ofs_in_node) ); -TRACE_EVENT(f2fs_do_submit_bio, +DECLARE_EVENT_CLASS(f2fs__submit_bio, - TP_PROTO(struct super_block *sb, int btype, bool sync, struct bio *bio), + TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio), - TP_ARGS(sb, btype, sync, bio), + TP_ARGS(sb, rw, type, bio), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, btype) - __field(bool, sync) + __field(int, rw) + __field(int, type) __field(sector_t, sector) __field(unsigned int, size) ), TP_fast_assign( __entry->dev = sb->s_dev; - __entry->btype = btype; - __entry->sync = sync; + __entry->rw = rw; + __entry->type = type; __entry->sector = bio->bi_sector; __entry->size = bio->bi_size; ), - TP_printk("dev = (%d,%d), type = %s, io = %s, sector = %lld, size = %u", + TP_printk("dev = (%d,%d), %s, %s, sector = %lld, size = %u", show_dev(__entry), - show_block_type(__entry->btype), - __entry->sync ? 
"sync" : "no sync", + show_bio_type(__entry->rw), + show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) ); +DEFINE_EVENT(f2fs__submit_bio, f2fs_submit_write_bio, + + TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio), + + TP_ARGS(sb, rw, type, bio) +); + +DEFINE_EVENT(f2fs__submit_bio, f2fs_submit_read_bio, + + TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio), + + TP_ARGS(sb, rw, type, bio) +); + DECLARE_EVENT_CLASS(f2fs__page, TP_PROTO(struct page *page, int type), -- cgit v1.2.3-70-g09d2 From 74de593af77b109f202c47e090c9e134c8882869 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Nov 2013 09:09:59 +0800 Subject: f2fs: read contiguous sit entry pages by merging for mount performance Previously we read sit entries page one by one, this method lost the chance of reading contiguous page together. So we read pages as contiguous as possible for better mount performance. change log: o merge judgements/use 'Continue' or 'Break' instead of 'Goto' as Gu Zheng suggested. o add mark_page_accessed() before release page to delay VM reclaiming. o remove '*order' for simplification of function as Jaegeuk Kim suggested. Signed-off-by: Chao Yu [Jaegeuk Kim: fix a bug on the block address calculation] Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 101 ++++++++++++++++++++++++++++++++++++++++-------------- fs/f2fs/segment.h | 2 ++ 2 files changed, 77 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a1acaa025bd..6dd1dc16a9d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -1706,41 +1707,89 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } +static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) +{ + struct address_space *mapping = sbi->meta_inode->i_mapping; + struct page *page; + block_t blk_addr, prev_blk_addr = 0; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + int blkno = start; + + for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { + + blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); + + if (blkno != start && prev_blk_addr + 1 != blk_addr) + break; + prev_blk_addr = blk_addr; +repeat: + page = grab_cache_page(mapping, blk_addr); + if (!page) { + cond_resched(); + goto repeat; + } + if (PageUptodate(page)) { + mark_page_accessed(page); + f2fs_put_page(page, 1); + continue; + } + + submit_read_page(sbi, page, blk_addr, READ_SYNC); + + mark_page_accessed(page); + f2fs_put_page(page, 0); + } + + f2fs_submit_read_bio(sbi, READ_SYNC); + return blkno - start; +} + static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned int start; - - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; - struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; - struct page *page; - int i; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; + do { + readed = ra_sit_pages(sbi, 
start_blk, nrpages); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < TOTAL_SEGS(sbi); start++) { + struct seg_entry *se = &sit_i->sentries[start]; + struct f2fs_sit_block *sit_blk; + struct f2fs_sit_entry sit; + struct page *page; + + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < sits_in_cursum(sum); i++) { + if (le32_to_cpu(segno_in_journal(sum, i)) == start) { + sit = sit_in_journal(sum, i); + mutex_unlock(&curseg->curseg_mutex); + goto got_it; + } } - } - mutex_unlock(&curseg->curseg_mutex); - page = get_current_sit_page(sbi, start); - sit_blk = (struct f2fs_sit_block *)page_address(page); - sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; - f2fs_put_page(page, 1); + mutex_unlock(&curseg->curseg_mutex); + + page = get_current_sit_page(sbi, start); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); got_it: - check_block_count(sbi, start, &sit); - seg_info_from_raw_sit(se, &sit); - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + if (sbi->segs_per_sec > 1) { + struct sec_entry *e = get_sec_entry(sbi, start); + e->valid_blocks += se->valid_blocks; + } } - } + start_blk += readed; + } while (start_blk < sit_blk_cnt); } static void init_free_segmap(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 38f6196493f..b84dd239666 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -78,6 +78,8 @@ (segno / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(sit_i, segno) \ (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) -- cgit v1.2.3-70-g09d2 From 0daaad97dcdc5cf91d368894a0954704e9a2cf2d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 24 Nov 2013 13:50:35 +0900 Subject: f2fs: avoid lock debugging overhead If CONFIG_F2FS_CHECK_FS is unset, we don't need to add any debugging overhead. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 72718ef0d56..417280aa054 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,8 +22,10 @@ #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(condition) BUG_ON(condition) +#define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else #define f2fs_bug_on(condition) +#define f2fs_down_write(x, y) down_write(x) #endif /* @@ -556,7 +558,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex); + f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) -- cgit v1.2.3-70-g09d2 From b600965c43f9690eb481d0c19948e109b685bde7 Mon Sep 17 00:00:00 2001 From: Huajun Li Date: Sun, 10 Nov 2013 23:13:18 +0800 Subject: f2fs: add a new function: f2fs_reserve_block() Add the function f2fs_reserve_block() to easily reserve new blocks, and use it to clean up more codes. 
Signed-off-by: Huajun Li Signed-off-by: Haicheng Li Signed-off-by: Weihong Xu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 50 +++++++++++++++++++++++--------------------------- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 38 ++++++-------------------------------- 3 files changed, 30 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 711722018b8..2d02cf36d80 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -64,6 +64,22 @@ int reserve_new_block(struct dnode_of_data *dn) return 0; } +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + err = get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + if (dn->data_blkaddr == NULL_ADDR) + err = reserve_new_block(dn); + + if (need_put) + f2fs_put_dnode(dn); + return err; +} + static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { @@ -314,19 +330,10 @@ struct page *get_new_data_page(struct inode *inode, int err; set_new_dnode(&dn, inode, npage, npage, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); + err = f2fs_reserve_block(&dn, index); if (err) return ERR_PTR(err); - if (dn.data_blkaddr == NULL_ADDR) { - if (reserve_new_block(&dn)) { - if (!npage) - f2fs_put_dnode(&dn); - return ERR_PTR(-ENOSPC); - } - } - if (!npage) - f2fs_put_dnode(&dn); repeat: page = grab_cache_page(mapping, index); if (!page) @@ -707,21 +714,15 @@ repeat: *pagep = page; f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (err) - goto err; - - if (dn.data_blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); - - f2fs_put_dnode(&dn); - if (err) - goto err; - + err = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); + if (err) { + f2fs_put_page(page, 1); + return err; + } + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -754,11 +755,6 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; - -err: - f2fs_unlock_op(sbi); - f2fs_put_page(page, 1); - return err; } static int f2fs_write_end(struct file *file, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 417280aa054..c8eb37e3b01 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1130,6 +1130,7 @@ void destroy_checkpoint_caches(void); * data.c */ int reserve_new_block(struct dnode_of_data *); +int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); void update_extent_cache(block_t, struct dnode_of_data *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7d714f4972d..1cd8e44b637 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -33,7 +33,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - block_t old_blk_addr; struct dnode_of_data dn; int err; @@ -44,24 +43,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, /* block allocation */ f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); - if (err) { - f2fs_unlock_op(sbi); - goto out; - } - - old_blk_addr = dn.data_blkaddr; - - if (old_blk_addr == NULL_ADDR) { - err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - goto out; - } - } - f2fs_put_dnode(&dn); + err = f2fs_reserve_block(&dn, page->index); f2fs_unlock_op(sbi); + if 
(err) + goto out; file_update_time(vma->vm_file); lock_page(page); @@ -532,22 +517,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (ret) { - f2fs_unlock_op(sbi); + ret = f2fs_reserve_block(&dn, index); + f2fs_unlock_op(sbi); + if (ret) break; - } - if (dn.data_blkaddr == NULL_ADDR) { - ret = reserve_new_block(&dn); - if (ret) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - break; - } - } - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); if (pg_start == pg_end) new_size = offset + len; -- cgit v1.2.3-70-g09d2 From a709f4a2f22c0ebaed1d99aee63ab44ffc2ba3d0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 24 Nov 2013 14:42:23 +0900 Subject: f2fs: add detailed information of bio types in the tracepoints This patch inserts information of bio types in more detail. So, we can now see REQ_META and REQ_PRIO too. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- include/trace/events/f2fs.h | 22 +++++++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6dd1dc16a9d..33ab378df5b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -846,11 +846,11 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) rw = WRITE_FLUSH_FUA; - trace_f2fs_submit_write_bio(sbi->sb, rw, btype, io->bio); - if (btype == META) rw |= REQ_META; + trace_f2fs_submit_write_bio(sbi->sb, rw, btype, io->bio); + p = io->bio->bi_private; p->sbi = sbi; io->bio->bi_end_io = f2fs_end_io_write; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1f59f5db55e..204fcc3201b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -16,8 +16,13 @@ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }) -#define show_bio_type(type) \ - __print_symbolic(type, \ +#define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) +#define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) + +#define show_bio_type(type) show_bio_base(type), show_bio_extra(type) + +#define show_bio_base(type) \ + __print_symbolic(F2FS_BIO_MASK(type), \ { READ, "READ" }, \ { READA, "READAHEAD" }, \ { READ_SYNC, "READ_SYNC" }, \ @@ -27,6 +32,13 @@ { WRITE_FUA, "WRITE_FUA" }, \ { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" }) +#define show_bio_extra(type) \ + __print_symbolic(F2FS_BIO_EXTRA_MASK(type), \ + { REQ_META, "(M)" }, \ + { REQ_PRIO, "(P)" }, \ + { REQ_META | REQ_PRIO, "(MP)" }, \ + { 0, " \b" }) + #define show_data_type(type) \ __print_symbolic(type, \ { CURSEG_HOT_DATA, "Hot DATA" }, \ @@ -447,7 +459,7 @@ TRACE_EVENT_CONDITION(f2fs_readpage, ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "blkaddr = 0x%llx, bio_type = %s", + "blkaddr = 0x%llx, bio_type = %s%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->blkaddr, @@ -621,7 +633,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, __entry->size = bio->bi_size; ), - TP_printk("dev = (%d,%d), %s, %s, sector = %lld, size = %u", + TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", show_dev(__entry), show_bio_type(__entry->rw), show_block_type(__entry->type), @@ -713,7 +725,7 @@ DECLARE_EVENT_CLASS(f2fs_io_page, __entry->block = blk_addr; ), - TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, blkaddr = 0x%llx", + TP_printk("dev = (%d,%d), ino = %lu, %s%s, %s, index = %lu, blkaddr = 0x%llx", show_dev_ino(__entry), show_bio_type(__entry->rw), 
show_block_type(__entry->type), -- cgit v1.2.3-70-g09d2 From 03232305ff3cf44761f7ea271f7c9af5105392b9 Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Sun, 24 Nov 2013 15:13:08 +0900 Subject: f2fs: send REQ_META or REQ_PRIO when reading meta area Let's send REQ_META or REQ_PRIO when reading meta area such as NAT/SIT etc. Signed-off-by: Changman Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/node.c | 4 ++-- fs/f2fs/segment.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index eae8dc3e1c0..7fe69ff2bfe 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -61,7 +61,7 @@ repeat: if (PageUptodate(page)) goto out; - if (f2fs_readpage(sbi, page, index, READ_SYNC)) + if (f2fs_readpage(sbi, page, index, READ_SYNC | REQ_META | REQ_PRIO)) goto repeat; lock_page(page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b212599b0da..0fe9a9720b3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -106,11 +106,11 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) f2fs_put_page(page, 1); continue; } - submit_read_page(sbi, page, index, READ_SYNC); + submit_read_page(sbi, page, index, READ_SYNC | REQ_META); mark_page_accessed(page); f2fs_put_page(page, 0); } - f2fs_submit_read_bio(sbi, READ_SYNC); + f2fs_submit_read_bio(sbi, READ_SYNC | REQ_META); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 33ab378df5b..1e8371392dc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1734,13 +1734,13 @@ repeat: continue; } - submit_read_page(sbi, page, blk_addr, READ_SYNC); + submit_read_page(sbi, page, blk_addr, READ_SYNC | REQ_META); mark_page_accessed(page); f2fs_put_page(page, 0); } - f2fs_submit_read_bio(sbi, READ_SYNC); + f2fs_submit_read_bio(sbi, READ_SYNC | REQ_META); return blkno - start; } -- cgit v1.2.3-70-g09d2 From 1001b3479ce96e37aed5e4fcdc3c60126e034d08 Mon Sep 17 00:00:00 2001 From: Huajun Li Date: Sun, 10 Nov 2013 23:13:16 +0800 Subject: f2fs: add flags and helpers to support inline data Add new inode flags F2FS_INLINE_DATA and FI_INLINE_DATA to indicate whether the inode has inline data. Inline data makes use of inode block's data indices region to save small file. Currently there are 923 data indices in an inode block. Since inline xattr has made use of the last 50 indices to save its data, there are 873 indices left which can be used for inline data. When FI_INLINE_DATA is set, the layout of inode block's indices region is like below: +-----------------+ | | Reserved. reserve_new_block() will make use of | i_addr[0] | i_addr[0] when we need to reserve a new data block | | to convert inline data into regular one's. |-----------------| | | Used by inline data. A file whose size is less than | i_addr[1~872] | 3488 bytes(~3.4k) and doesn't reserve extra | | blocks by fallocate() can be saved here. 
|-----------------| | | | i_addr[873~922] | Reserved for inline xattr | | +-----------------+ Signed-off-by: Haicheng Li Signed-off-by: Huajun Li Signed-off-by: Weihong Xu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 ++++++++++++++ include/linux/f2fs_fs.h | 8 ++++++++ 2 files changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c8eb37e3b01..72456ec3a3e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -39,6 +39,7 @@ #define F2FS_MOUNT_POSIX_ACL 0x00000020 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 +#define F2FS_MOUNT_INLINE_DATA 0x00000100 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -899,6 +900,7 @@ enum { FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -936,6 +938,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi, { if (ri->i_inline & F2FS_INLINE_XATTR) set_inode_flag(fi, FI_INLINE_XATTR); + if (ri->i_inline & F2FS_INLINE_DATA) + set_inode_flag(fi, FI_INLINE_DATA); } static inline void set_raw_inline(struct f2fs_inode_info *fi, @@ -945,6 +949,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, if (is_inode_flag_set(fi, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(fi, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; } static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) @@ -970,6 +976,13 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +static inline void *inline_data_addr(struct page *page) +{ + struct f2fs_inode *ri; + ri = (struct f2fs_inode *)page_address(page); + return (void *)&(ri->i_addr[1]); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -1265,4 +1278,5 @@ extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; + #endif diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index bb942f6d570..aea5eedbb1c 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -153,6 +153,14 @@ struct f2fs_extent { #define NODE_DIND_BLOCK (DEF_ADDRS_PER_INODE + 5) #define F2FS_INLINE_XATTR 0x01 /* file inline xattr flag */ +#define F2FS_INLINE_DATA 0x02 /* file inline data flag */ + + +#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ + F2FS_INLINE_XATTR_ADDRS - 1)) + +#define INLINE_DATA_OFFSET (PAGE_CACHE_SIZE - sizeof(struct node_footer) \ + - sizeof(__le32)*(DEF_ADDRS_PER_INODE + 5 - 1)) struct f2fs_inode { __le16 i_mode; /* file mode */ -- cgit v1.2.3-70-g09d2 From 8274de77b7072d983fe4b452b981b3e520f12698 Mon Sep 17 00:00:00 2001 From: Huajun Li Date: Sun, 10 Nov 2013 23:13:17 +0800 Subject: f2fs: add a new mount option: inline_data Add a mount option: inline_data. If the mount option is set, data of New created small files can be stored in their inode. 
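A usage sketch, not part of this patch (device and mount point are placeholders): with the inline_data option added here, inline data would be enabled at mount time with something like

    # mount -t f2fs -o inline_data /dev/sdXN /mnt/f2fs

Using the numbers given in the previous patch (923 data indices per inode block, the last 50 used by inline xattr, i_addr[0] reserved), MAX_INLINE_DATA works out to sizeof(__le32) * (923 - 50 - 1) = 3488 bytes, i.e. the ~3.4k limit quoted there.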
Signed-off-by: Huajun Li Signed-off-by: Haicheng Li Signed-off-by: Weihong Xu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9981b28744e..43f11cae408 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -50,6 +50,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_inline_data, Opt_err, }; @@ -65,6 +66,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_inline_data, "inline_data"}, {Opt_err, NULL}, }; @@ -313,6 +315,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); break; + case Opt_inline_data: + set_opt(sbi, INLINE_DATA); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -510,7 +515,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) seq_puts(seq, ",disable_ext_identify"); - + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; -- cgit v1.2.3-70-g09d2 From 0e80220ac554ea55fd867dede91f0054a13cf85c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 26 Nov 2013 16:36:20 +0900 Subject: f2fs: remove unnecessary return value Let's remove the unnecessary return value. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 72456ec3a3e..1f1bc58d50e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -773,13 +773,12 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); } -static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); f2fs_bug_on(!sbi->total_valid_inode_count); sbi->total_valid_inode_count--; spin_unlock(&sbi->stat_lock); - return 0; } static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) -- cgit v1.2.3-70-g09d2 From 36795567942a033ef9e22d8eba86396ffb9aa80c Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 26 Nov 2013 16:44:16 +0800 Subject: f2fs: fix a potential out of range issue Fix a potential out of range issue introduced by commit: 22fb72225a f2fs: simplify write_orphan_inodes for better readable Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7fe69ff2bfe..3e62987e333 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -323,9 +323,9 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) memset(orphan_blk, 0, sizeof(*orphan_blk)); } - orphan_blk->ino[nentries] = cpu_to_le32(orphan->ino); + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); - if (nentries++ == F2FS_ORPHANS_PER_BLOCK) { + if (nentries == F2FS_ORPHANS_PER_BLOCK) { /* * an orphan block is full of 1020 entries, * then we need to flush current orphan blocks -- cgit v1.2.3-70-g09d2 From f9a4e6df52edf8ce1040d1b8d340d31234a1bce3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 28 Nov 2013 12:44:05 +0900 Subject: f2fs: bug fix on bit overflow from 32bits to 64bits This patch fixes 
some bit overflows by the shift operations. Dan Carpenter reported potential bugs on bit overflows as follows. fs/f2fs/segment.c:910 submit_write_page() warn: should 'blk_addr << ((sbi)->log_blocksize - 9)' be a 64 bit type? fs/f2fs/checkpoint.c:429 get_valid_checkpoint() warn: should '1 << ()' be a 64 bit type? fs/f2fs/data.c:408 f2fs_readpage() warn: should 'blk_addr << ((sbi)->log_blocksize - 9)' be a 64 bit type? fs/f2fs/data.c:457 submit_read_page() warn: should 'blk_addr << ((sbi)->log_blocksize - 9)' be a 64 bit type? fs/f2fs/data.c:525 get_data_block_ro() warn: should 'i << blkbits' be a 64 bit type? Bug-Reported-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/data.c | 2 +- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3e62987e333..21e72153496 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -426,7 +426,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); /* The second checkpoint pack should start at the next segment */ - cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); if (cp1 && cp2) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2d02cf36d80..24f752de6a9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -525,7 +525,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, != (dn.data_blkaddr + i)) || maxblocks == i) break; map_bh(bh_result, inode->i_sb, dn.data_blkaddr); - bh_result->b_size = (i << blkbits); + bh_result->b_size = (((size_t)i) << blkbits); } f2fs_put_dnode(&dn); trace_f2fs_get_data_block(inode, iblock, bh_result, 0); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1e8371392dc..03878634a0f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -270,8 +270,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) static void f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { - sector_t start = ((sector_t)blkstart) << sbi->log_sectors_per_block; - sector_t len = ((sector_t)blklen) << sbi->log_sectors_per_block; + sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); + sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b84dd239666..07887e1cc70 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -86,9 +86,9 @@ #define TOTAL_SECS(sbi) (sbi->total_sections) #define SECTOR_FROM_BLOCK(sbi, blk_addr) \ - (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) #define SECTOR_TO_BLOCK(sbi, sectors) \ - (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + (sectors >> (sbi)->log_sectors_per_block) #define MAX_BIO_BLOCKS(max_hw_blocks) \ (min((int)max_hw_blocks, BIO_MAX_PAGES)) -- cgit v1.2.3-70-g09d2 From 031fa8cc9ba45c14f440b9cf71d09950fbe5eb9b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 28 Nov 2013 12:55:13 +0900 Subject: f2fs: remove unnecessary condition checks This patch removes the unnecessary condition checks on: fs/f2fs/gc.c:667 do_garbage_collect() warn: 'sum_page' isn't an ERR_PTR fs/f2fs/f2fs.h:795 f2fs_put_page() 
warn: 'page' isn't an ERR_PTR Reported-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f1bc58d50e..9a76c83937b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -792,7 +792,7 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) static inline void f2fs_put_page(struct page *page, int unlock) { - if (!page || IS_ERR(page)) + if (!page) return; if (unlock) { diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b7ad1ec7e4c..5fa54c1ca33 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -664,8 +664,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); - if (IS_ERR(sum_page)) - return; blk_start_plug(&plug); -- cgit v1.2.3-70-g09d2 From a66c7b2fcfbc9ef4e972f6bc2b63d72d00f23122 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Nov 2013 16:52:50 +0800 Subject: f2fs: remove unneeded code in punch_hole Because FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE in fallocate, so we could remove the useless 'keep size' branch code which will never be excuted in punch_hole. Signed-off-by: Chao Yu Signed-off-by: Fan Li [Jaegeuk Kim: remove an unnecessary parameter togather] Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1cd8e44b637..2b47adcd852 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -444,7 +444,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return 0; } -static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) +static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; @@ -484,12 +484,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) } } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) <= (offset + len)) { - i_size_write(inode, offset); - mark_inode_dirty(inode); - } - return ret; } @@ -552,7 +546,7 @@ static long f2fs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - ret = punch_hole(inode, offset, len, mode); + ret = punch_hole(inode, offset, len); else ret = expand_inode_data(inode, offset, len, mode); -- cgit v1.2.3-70-g09d2 From 6947eea957e4c4c873f92a4ee5da7a6ef0012718 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Nov 2013 16:37:00 +0800 Subject: f2fs: avoid to calculate incorrect max orphan number Because we will write node summaries when do_checkpoint with umount flag, our number of max orphan blocks should minus NR_CURSEG_NODE_TYPE additional. Signed-off-by: Chao Yu Signed-off-by: Shu Tan Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 21e72153496..c7e0572fded 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -190,12 +190,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; /* - * considering 512 blocks in a segment 5 blocks are needed for cp + * considering 512 blocks in a segment 8 blocks are needed for cp * and log segment summaries. 
Remaining blocks are used to keep * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*507 orphan entries + * for cp pack we can have max 1020*504 orphan entries */ - max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK; + max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) + * F2FS_ORPHANS_PER_BLOCK; mutex_lock(&sbi->orphan_inode_mutex); if (sbi->n_orphans >= max_orphans) err = -ENOSPC; -- cgit v1.2.3-70-g09d2 From aac44046a2cdf70474fee8931bc8a591fd16c8dc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 28 Nov 2013 15:41:39 +0800 Subject: f2fs: correct type of wait in struct bio_private The void *wait in bio_private is used for waiting completion of checkpoint bio. So we don't need to use its type as void, but declare it as completion type. Signed-off-by: Chao Yu [Jaegeuk Kim: add description] Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 07887e1cc70..7fea2ee8a5d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -96,7 +96,7 @@ struct bio_private { struct f2fs_sb_info *sbi; bool is_sync; - void *wait; + struct completion *wait; }; /* -- cgit v1.2.3-70-g09d2 From 01d2d1aa0648192fd1d49f7d74d7e8b85b1c585a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 28 Nov 2013 15:43:07 +0800 Subject: f2fs: use true and false for boolean variable The inode_page_locked should be a boolean variable. struct dnode_of_data { struct inode *inode; /* vfs inode pointer */ struct page *inode_page; /* its inode page, NULL is possible */ struct page *node_page; /* cached direct node page */ nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ ==> bool inode_page_locked; /* inode page is locked or not */ block_t data_blkaddr; /* block address of the node block */ }; Signed-off-by: Chao Yu [Jaegeuk Kim: add description] Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0fe9a9720b3..d0ab00334b0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -791,7 +791,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) set_new_dnode(&dn, inode, page, npage, nid); if (page) - dn.inode_page_locked = 1; + dn.inode_page_locked = true; truncate_node(&dn); return 0; } -- cgit v1.2.3-70-g09d2 From 1069bbf7b963d31d5532c36d43a02b0447e5bcfa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 28 Nov 2013 15:43:43 +0800 Subject: f2fs: check return value of f2fs_readpage in find_data_page We should return error if we do not get an updated page in find_date_page when f2fs_readpage failed. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 24f752de6a9..c9a76f8c102 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -240,6 +240,9 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) err = f2fs_readpage(sbi, page, dn.data_blkaddr, sync ? 
READ_SYNC : READA); + if (err) + return ERR_PTR(err); + if (sync) { wait_on_page_locked(page); if (!PageUptodate(page)) { -- cgit v1.2.3-70-g09d2 From 8f99a946f360c4083d6e323e10a5928e1ce385a4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 28 Nov 2013 15:43:43 +0800 Subject: f2fs: convert recover_orphan_inodes to void The recover_orphan_inodes() returns no error all the time, so we don't need to check its errors. Signed-off-by: Chao Yu [Jaegeuk Kim: add description] Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++--- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c7e0572fded..40eea42f85f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -271,12 +271,12 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) iput(inode); } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +void recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blkaddr, i, j; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) - return 0; + return; sbi->por_doing = true; start_blk = __start_cp_addr(sbi) + 1; @@ -296,7 +296,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); sbi->por_doing = false; - return 0; + return; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a76c83937b..ca33cda78e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1126,7 +1126,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); +void recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 43f11cae408..dd55074a304 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -952,9 +952,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* if there are nt orphan nodes free them */ - err = -EINVAL; - if (recover_orphan_inodes(sbi)) - goto free_node_inode; + recover_orphan_inodes(sbi); /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -963,8 +961,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(root); goto free_node_inode; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + err = -EINVAL; goto free_root_inode; + } sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { -- cgit v1.2.3-70-g09d2 From 187b5b8b3dfcfc73126f2743c89cc47df3bf07be Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 30 Nov 2013 10:10:31 +0900 Subject: f2fs: remove the own bi_private allocation Previously f2fs allocates its own bi_private data structure all the time even though we don't use it. But, can we remove this bi_private allocation? This patch removes such the additional bi_private allocation. 1. Retrieve f2fs_sb_info from its page->mapping->host->i_sb. - This removes the usecases of bi_private in end_io. 2. Use bi_private only when we really need it. 
- The bi_private is used only when the checkpoint procedure is conducted. - When conducting the checkpoint, f2fs submits a META_FLUSH bio to wait its bio completion. - Since we have no dependancies to remove bi_private now, let's just use bi_private pointer as the completion pointer. Reviewed-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 43 ++++++++++++++++--------------------------- fs/f2fs/segment.h | 7 ------- 2 files changed, 16 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 03878634a0f..0db40271f0d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -791,7 +791,7 @@ static void f2fs_end_io_write(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_private *p = bio->bi_private; + struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb); do { struct page *page = bvec->bv_page; @@ -802,21 +802,21 @@ static void f2fs_end_io_write(struct bio *bio, int err) SetPageError(page); if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); - p->sbi->sb->s_flags |= MS_RDONLY; + + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; } end_page_writeback(page); - dec_page_count(p->sbi, F2FS_WRITEBACK); + dec_page_count(sbi, F2FS_WRITEBACK); } while (bvec >= bio->bi_io_vec); - if (p->is_sync) - complete(p->wait); + if (bio->bi_private) + complete(bio->bi_private); - if (!get_pages(p->sbi, F2FS_WRITEBACK) && - !list_empty(&p->sbi->cp_wait.task_list)) - wake_up(&p->sbi->cp_wait); + if (!get_pages(sbi, F2FS_WRITEBACK) && + !list_empty(&sbi->cp_wait.task_list)) + wake_up(&sbi->cp_wait); - kfree(p); bio_put(bio); } @@ -838,7 +838,6 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, int rw = sync ? WRITE_SYNC : WRITE; enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = &sbi->write_io[btype]; - struct bio_private *p; if (!io->bio) return; @@ -851,18 +850,16 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, trace_f2fs_submit_write_bio(sbi->sb, rw, btype, io->bio); - p = io->bio->bi_private; - p->sbi = sbi; - io->bio->bi_end_io = f2fs_end_io_write; - + /* + * META_FLUSH is only from the checkpoint procedure, and we should wait + * this metadata bio for FS consistency. + */ if (type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); - p->is_sync = true; - p->wait = &wait; + io->bio->bi_private = &wait; submit_bio(rw, io->bio); wait_for_completion(&wait); } else { - p->is_sync = false; submit_bio(rw, io->bio); } io->bio = NULL; @@ -897,18 +894,10 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, do_submit_bio(sbi, type, false); alloc_new: if (io->bio == NULL) { - struct bio_private *priv; -retry: - priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); - if (!priv) { - cond_resched(); - goto retry; - } - bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); io->bio = f2fs_bio_alloc(bdev, bio_blocks); io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - io->bio->bi_private = priv; + io->bio->bi_end_io = f2fs_end_io_write; /* * The end_io will be assigned at the sumbission phase. 
* Until then, let bio_add_page() merge consecutive IOs as much diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7fea2ee8a5d..26812fc0fa1 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -92,13 +92,6 @@ #define MAX_BIO_BLOCKS(max_hw_blocks) \ (min((int)max_hw_blocks, BIO_MAX_PAGES)) -/* during checkpoint, bio_private is used to synchronize the last bio */ -struct bio_private { - struct f2fs_sb_info *sbi; - bool is_sync; - struct completion *wait; -}; - /* * indicate a block allocation direction: RIGHT and LEFT. * RIGHT means allocating new sections towards the end of volume. -- cgit v1.2.3-70-g09d2 From 93dfe2ac516250755f7d5edd438b0ce67c0e3aa6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 30 Nov 2013 12:51:14 +0900 Subject: f2fs: refactor bio-related operations This patch integrates redundant bio operations on read and write IOs. 1. Move bio-related codes to the top of data.c. 2. Replace f2fs_submit_bio with f2fs_submit_merged_bio, which handles read bios additionally. 3. Introduce __submit_merged_bio to submit the merged bio. 4. Change f2fs_readpage to f2fs_submit_page_bio. 5. Introduce f2fs_submit_page_mbio to integrate previous submit_read_page and submit_write_page. Reviewed-by: Gu Zheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 14 +- fs/f2fs/data.c | 316 +++++++++++++++++++++++++++++--------------- fs/f2fs/f2fs.h | 12 +- fs/f2fs/gc.c | 2 +- fs/f2fs/node.c | 14 +- fs/f2fs/recovery.c | 4 +- fs/f2fs/segment.c | 164 +++-------------------- include/trace/events/f2fs.h | 30 ++--- 8 files changed, 257 insertions(+), 299 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 40eea42f85f..38f4a224508 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -61,7 +61,8 @@ repeat: if (PageUptodate(page)) goto out; - if (f2fs_readpage(sbi, page, index, READ_SYNC | REQ_META | REQ_PRIO)) + if (f2fs_submit_page_bio(sbi, page, index, + READ_SYNC | REQ_META | REQ_PRIO)) goto repeat; lock_page(page); @@ -157,7 +158,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, } if (nwritten) - f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); + f2fs_submit_merged_bio(sbi, type, nr_to_write == LONG_MAX, + WRITE); return nwritten; } @@ -590,7 +592,7 @@ retry: * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, true, WRITE); } goto retry; } @@ -796,9 +798,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); - f2fs_submit_bio(sbi, DATA, true); - f2fs_submit_bio(sbi, NODE, true); - f2fs_submit_bio(sbi, META, true); + f2fs_submit_merged_bio(sbi, DATA, true, WRITE); + f2fs_submit_merged_bio(sbi, NODE, true, WRITE); + f2fs_submit_merged_bio(sbi, META, true, WRITE); /* * update checkpoint pack index diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c9a76f8c102..4e2fc09f0e4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -24,6 +24,204 @@ #include "segment.h" #include +/* + * Low-level block read/write IO operations. 
+ */ +static struct bio *__bio_alloc(struct block_device *bdev, int npages) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + bio->bi_bdev = bdev; + bio->bi_private = NULL; + return bio; +} + +static void f2fs_read_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +static void f2fs_write_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb); + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (!uptodate) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + } + end_page_writeback(page); + dec_page_count(sbi, F2FS_WRITEBACK); + } while (bvec >= bio->bi_io_vec); + + if (bio->bi_private) + complete(bio->bi_private); + + if (!get_pages(sbi, F2FS_WRITEBACK) && + !list_empty(&sbi->cp_wait.task_list)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +static void __submit_merged_bio(struct f2fs_sb_info *sbi, + struct f2fs_bio_info *io, + enum page_type type, bool sync, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + + if (!io->bio) + return; + + if (btype == META) + rw |= REQ_META; + + if (is_read_io(rw)) { + if (sync) + rw |= READ_SYNC; + submit_bio(rw, io->bio); + trace_f2fs_submit_read_bio(sbi->sb, rw, type, io->bio); + io->bio = NULL; + return; + } + + if (sync) + rw |= WRITE_SYNC; + if (type >= META_FLUSH) + rw |= WRITE_FLUSH_FUA; + + /* + * META_FLUSH is only from the checkpoint procedure, and we should wait + * this metadata bio for FS consistency. + */ + if (type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + io->bio->bi_private = &wait; + submit_bio(rw, io->bio); + wait_for_completion(&wait); + } else { + submit_bio(rw, io->bio); + } + trace_f2fs_submit_write_bio(sbi->sb, rw, btype, io->bio); + io->bio = NULL; +} + +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + enum page_type type, bool sync, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io; + + io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + + mutex_lock(&io->io_mutex); + __submit_merged_bio(sbi, io, type, sync, rw); + mutex_unlock(&io->io_mutex); +} + +/* + * Fill the locked page with data located in the block address. + * Return unlocked page. + */ +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, int rw) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio; + + trace_f2fs_submit_page_bio(page, blk_addr, rw); + + /* Allocate a new bio */ + bio = __bio_alloc(bdev, 1); + + /* Initialize the bio */ + bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_end_io = is_read_io(rw) ? 
f2fs_read_end_io : f2fs_write_end_io; + + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio_put(bio); + f2fs_put_page(page, 1); + return -EFAULT; + } + + submit_bio(rw, bio); + return 0; +} + +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, enum page_type type, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct block_device *bdev = sbi->sb->s_bdev; + struct f2fs_bio_info *io; + int bio_blocks; + + io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + + verify_block_addr(sbi, blk_addr); + + mutex_lock(&io->io_mutex); + + if (!is_read_io(rw)) + inc_page_count(sbi, F2FS_WRITEBACK); + + if (io->bio && io->last_block_in_bio != blk_addr - 1) + __submit_merged_bio(sbi, io, type, true, rw); +alloc_new: + if (io->bio == NULL) { + bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + io->bio = __bio_alloc(bdev, bio_blocks); + io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + io->bio->bi_end_io = is_read_io(rw) ? f2fs_read_end_io : + f2fs_write_end_io; + /* + * The end_io will be assigned at the sumbission phase. + * Until then, let bio_add_page() merge consecutive IOs as much + * as possible. + */ + } + + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + __submit_merged_bio(sbi, io, type, true, rw); + goto alloc_new; + } + + io->last_block_in_bio = blk_addr; + + mutex_unlock(&io->io_mutex); + trace_f2fs_submit_page_mbio(page, rw, type, blk_addr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -238,7 +436,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, sync ? READ_SYNC : READA); if (err) return ERR_PTR(err); @@ -299,7 +497,7 @@ repeat: return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) return ERR_PTR(err); @@ -349,7 +547,8 @@ repeat: zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) return ERR_PTR(err); lock_page(page); @@ -373,110 +572,6 @@ repeat: return page; } -static void read_end_io(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); -} - -/* - * Fill the locked page with data located in the block address. - * Return unlocked page. 
- */ -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int type) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct bio *bio; - - trace_f2fs_readpage(page, blk_addr, type); - - /* Allocate a new bio */ - bio = f2fs_bio_alloc(bdev, 1); - - /* Initialize the bio */ - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - bio->bi_end_io = read_end_io; - - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - bio_put(bio); - f2fs_put_page(page, 1); - return -EFAULT; - } - - submit_bio(type, bio); - return 0; -} - -void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, int rw) -{ - struct f2fs_bio_info *io = &sbi->read_io; - - if (!io->bio) - return; - - trace_f2fs_submit_read_bio(sbi->sb, rw, META, io->bio); - - mutex_lock(&io->io_mutex); - if (io->bio) { - submit_bio(rw, io->bio); - io->bio = NULL; - } - mutex_unlock(&io->io_mutex); -} - -void submit_read_page(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int rw) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct f2fs_bio_info *io = &sbi->read_io; - int bio_blocks; - - verify_block_addr(sbi, blk_addr); - - mutex_lock(&io->io_mutex); - - if (io->bio && io->last_block_in_bio != blk_addr - 1) { - submit_bio(rw, io->bio); - io->bio = NULL; - } -alloc_new: - if (io->bio == NULL) { - bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - io->bio = f2fs_bio_alloc(bdev, bio_blocks); - io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - io->bio->bi_end_io = read_end_io; - } - - if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - submit_bio(rw, io->bio); - io->bio = NULL; - goto alloc_new; - } - - io->last_block_in_bio = blk_addr; - - mutex_unlock(&io->io_mutex); - trace_f2fs_submit_read_page(page, rw, META, blk_addr); -} - /* * This function should be used by the data read flow only where it * does not check the "create" flag that indicates block allocation. 
@@ -638,7 +733,7 @@ write: goto redirty_out; if (wbc->for_reclaim) - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, true, WRITE); clear_cold_data(page); out: @@ -690,7 +785,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); + f2fs_submit_merged_bio(sbi, DATA, wbc->sync_mode == WB_SYNC_ALL, WRITE); remove_dirty_dir_inode(inode); @@ -741,7 +836,8 @@ repeat: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) return err; lock_page(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ca33cda78e0..10eca022e1e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -364,6 +364,7 @@ enum page_type { META_FLUSH, }; +#define is_read_io(rw) (((rw) & 1) == READ) struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ @@ -1093,9 +1094,6 @@ void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -struct bio *f2fs_bio_alloc(struct block_device *, int); -void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, block_t, block_t *); @@ -1106,6 +1104,7 @@ void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void rewrite_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); +void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, @@ -1141,15 +1140,16 @@ void destroy_checkpoint_caches(void); /* * data.c */ +void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, bool, int); +int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int); +void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t, + enum page_type, int); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); void update_extent_cache(block_t, struct dnode_of_data *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); -void f2fs_submit_read_bio(struct f2fs_sb_info *, int); -void submit_read_page(struct f2fs_sb_info *, struct page *, block_t, int); int do_write_data_page(struct page *); /* diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5fa54c1ca33..2886aef35d5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -631,7 +631,7 @@ next_iput: goto next_step; if (gc_type == FG_GC) { - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, true, WRITE); /* * In the case of FG_GC, it'd be better to reclaim this victim diff --git a/fs/f2fs/node.c 
b/fs/f2fs/node.c index d0ab00334b0..0e1a3df18e5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -106,11 +106,11 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) f2fs_put_page(page, 1); continue; } - submit_read_page(sbi, page, index, READ_SYNC | REQ_META); + f2fs_submit_page_mbio(sbi, page, index, META, READ); mark_page_accessed(page); f2fs_put_page(page, 0); } - f2fs_submit_read_bio(sbi, READ_SYNC | REQ_META); + f2fs_submit_merged_bio(sbi, META, true, READ); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) @@ -891,7 +891,7 @@ fail: * LOCKED_PAGE: f2fs_put_page(page, 1) * error: nothing */ -static int read_node_page(struct page *page, int type) +static int read_node_page(struct page *page, int rw) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); struct node_info ni; @@ -906,7 +906,7 @@ static int read_node_page(struct page *page, int type) if (PageUptodate(page)) return LOCKED_PAGE; - return f2fs_readpage(sbi, page, ni.blk_addr, type); + return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw); } /* @@ -1136,8 +1136,8 @@ continue_unlock: } if (wrote) - f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); - + f2fs_submit_merged_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL, + WRITE); return nwritten; } @@ -1592,7 +1592,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, */ ClearPageUptodate(page); - if (f2fs_readpage(sbi, page, addr, READ_SYNC)) + if (f2fs_submit_page_bio(sbi, page, addr, READ_SYNC)) goto out; lock_page(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fdc81161f25..c209b865292 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -143,7 +143,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) goto out; @@ -386,7 +386,7 @@ static int recover_data(struct f2fs_sb_info *sbi, while (1) { struct fsync_inode_entry *entry; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0db40271f0d..ca9adf5914c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -787,146 +787,6 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -static void f2fs_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb); - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (!uptodate) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; - } - end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); - - if (bio->bi_private) - complete(bio->bi_private); - - if (!get_pages(sbi, F2FS_WRITEBACK) && - !list_empty(&sbi->cp_wait.task_list)) - wake_up(&sbi->cp_wait); - - bio_put(bio); -} - -struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) -{ - struct bio *bio; - - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - bio->bi_bdev = bdev; - bio->bi_private = NULL; - - return bio; -} - 
-static void do_submit_bio(struct f2fs_sb_info *sbi, - enum page_type type, bool sync) -{ - int rw = sync ? WRITE_SYNC : WRITE; - enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - - if (!io->bio) - return; - - if (type >= META_FLUSH) - rw = WRITE_FLUSH_FUA; - - if (btype == META) - rw |= REQ_META; - - trace_f2fs_submit_write_bio(sbi->sb, rw, btype, io->bio); - - /* - * META_FLUSH is only from the checkpoint procedure, and we should wait - * this metadata bio for FS consistency. - */ - if (type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - io->bio->bi_private = &wait; - submit_bio(rw, io->bio); - wait_for_completion(&wait); - } else { - submit_bio(rw, io->bio); - } - io->bio = NULL; -} - -void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) -{ - struct f2fs_bio_info *io = &sbi->write_io[PAGE_TYPE_OF_BIO(type)]; - - if (!io->bio) - return; - - mutex_lock(&io->io_mutex); - do_submit_bio(sbi, type, sync); - mutex_unlock(&io->io_mutex); -} - -static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, enum page_type type) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct f2fs_bio_info *io = &sbi->write_io[type]; - int bio_blocks; - - verify_block_addr(sbi, blk_addr); - - mutex_lock(&io->io_mutex); - - inc_page_count(sbi, F2FS_WRITEBACK); - - if (io->bio && io->last_block_in_bio != blk_addr - 1) - do_submit_bio(sbi, type, false); -alloc_new: - if (io->bio == NULL) { - bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - io->bio = f2fs_bio_alloc(bdev, bio_blocks); - io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - io->bio->bi_end_io = f2fs_end_io_write; - /* - * The end_io will be assigned at the sumbission phase. - * Until then, let bio_add_page() merge consecutive IOs as much - * as possible. 
- */ - } - - if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - do_submit_bio(sbi, type, false); - goto alloc_new; - } - - io->last_block_in_bio = blk_addr; - - mutex_unlock(&io->io_mutex); - trace_f2fs_submit_write_page(page, WRITE, type, blk_addr); -} - -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool sync) -{ - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - if (PageWriteback(page)) { - f2fs_submit_bio(sbi, type, sync); - wait_on_page_writeback(page); - } -} - static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1040,7 +900,7 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); /* writeout dirty page into bdev */ - submit_write_page(sbi, page, *new_blkaddr, p_type); + f2fs_submit_page_mbio(sbi, page, *new_blkaddr, p_type, WRITE); mutex_unlock(&curseg->curseg_mutex); } @@ -1048,7 +908,7 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { set_page_writeback(page); - submit_write_page(sbi, page, page->index, META); + f2fs_submit_page_mbio(sbi, page, page->index, META, WRITE); } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1078,7 +938,7 @@ void write_data_page(struct inode *inode, struct page *page, void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, block_t old_blk_addr) { - submit_write_page(sbi, page, old_blk_addr, DATA); + f2fs_submit_page_mbio(sbi, page, old_blk_addr, DATA, WRITE); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -1165,8 +1025,8 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, /* rewrite node page */ set_page_writeback(page); - submit_write_page(sbi, page, new_blkaddr, NODE); - f2fs_submit_bio(sbi, NODE, true); + f2fs_submit_page_mbio(sbi, page, new_blkaddr, NODE, WRITE); + f2fs_submit_merged_bio(sbi, NODE, true, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); locate_dirty_segment(sbi, old_cursegno); @@ -1176,6 +1036,16 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, mutex_unlock(&curseg->curseg_mutex); } +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool sync) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + if (PageWriteback(page)) { + f2fs_submit_merged_bio(sbi, type, sync, WRITE); + wait_on_page_writeback(page); + } +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1723,13 +1593,13 @@ repeat: continue; } - submit_read_page(sbi, page, blk_addr, READ_SYNC | REQ_META); + f2fs_submit_page_mbio(sbi, page, blk_addr, META, READ); mark_page_accessed(page); f2fs_put_page(page, 0); } - f2fs_submit_read_bio(sbi, READ_SYNC | REQ_META); + f2fs_submit_merged_bio(sbi, META, true, READ); return blkno - start; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 204fcc3201b..3b9f28dfc84 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -434,7 +434,7 @@ TRACE_EVENT(f2fs_truncate_partial_nodes, __entry->err) ); -TRACE_EVENT_CONDITION(f2fs_readpage, +TRACE_EVENT_CONDITION(f2fs_submit_page_bio, TP_PROTO(struct page *page, sector_t blkaddr, int type), @@ -641,18 +641,22 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, __entry->size) ); -DEFINE_EVENT(f2fs__submit_bio, f2fs_submit_write_bio, +DEFINE_EVENT_CONDITION(f2fs__submit_bio, 
f2fs_submit_write_bio, TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio), - TP_ARGS(sb, rw, type, bio) + TP_ARGS(sb, rw, type, bio), + + TP_CONDITION(bio) ); -DEFINE_EVENT(f2fs__submit_bio, f2fs_submit_read_bio, +DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio), - TP_ARGS(sb, rw, type, bio) + TP_ARGS(sb, rw, type, bio), + + TP_CONDITION(bio) ); DECLARE_EVENT_CLASS(f2fs__page, @@ -701,7 +705,7 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_ARGS(page, type) ); -DECLARE_EVENT_CLASS(f2fs_io_page, +TRACE_EVENT(f2fs_submit_page_mbio, TP_PROTO(struct page *page, int rw, int type, block_t blk_addr), @@ -733,20 +737,6 @@ DECLARE_EVENT_CLASS(f2fs_io_page, (unsigned long long)__entry->block) ); -DEFINE_EVENT(f2fs_io_page, f2fs_submit_write_page, - - TP_PROTO(struct page *page, int rw, int type, block_t blk_addr), - - TP_ARGS(page, rw, type, blk_addr) -); - -DEFINE_EVENT(f2fs_io_page, f2fs_submit_read_page, - - TP_PROTO(struct page *page, int rw, int type, block_t blk_addr), - - TP_ARGS(page, rw, type, blk_addr) -); - TRACE_EVENT(f2fs_write_checkpoint, TP_PROTO(struct super_block *sb, bool is_umount, char *msg), -- cgit v1.2.3-70-g09d2 From 9af0ff1c527ebb267d9a3b6aa8af93c5843d4390 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Nov 2013 15:48:54 +0800 Subject: f2fs: readahead contiguous pages for restore_node_summary If cp has no CP_UMOUNT_FLAG, we will read all pages in whole node segment one by one, it makes low performance. So let's merge contiguous pages and readahead for better performance. Signed-off-by: Chao Yu [Jaegeuk Kim: adjust the new bio operations] Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 89 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0e1a3df18e5..0855168af7d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1565,47 +1565,84 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) return 0; } +/* + * ra_sum_pages() merge contiguous pages into one bio and submit. + * these pre-readed pages are linked in pages list. 
+ */ +static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, + int start, int nrpages) +{ + struct page *page; + int page_idx = start; + + for (; page_idx < start + nrpages; page_idx++) { + /* alloc temporal page for read node summary info*/ + page = alloc_page(GFP_NOFS | __GFP_ZERO); + if (!page) { + struct page *tmp; + list_for_each_entry_safe(page, tmp, pages, lru) { + list_del(&page->lru); + unlock_page(page); + __free_pages(page, 0); + } + return -ENOMEM; + } + + lock_page(page); + page->index = page_idx; + list_add_tail(&page->lru, pages); + } + + list_for_each_entry(page, pages, lru) + f2fs_submit_page_mbio(sbi, page, page->index, META, READ); + + f2fs_submit_merged_bio(sbi, META, true, READ); + return 0; +} + int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; struct f2fs_summary *sum_entry; - struct page *page; + struct page *page, *tmp; block_t addr; - int i, last_offset; - - /* alloc temporal page for read node */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (!page) - return -ENOMEM; - lock_page(page); + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int i, last_offset, nrpages, err = 0; + LIST_HEAD(page_list); /* scan the node segment */ last_offset = sbi->blocks_per_seg; addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i++, sum_entry++) { - /* - * In order to read next node page, - * we must clear PageUptodate flag. - */ - ClearPageUptodate(page); + for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + nrpages = min(last_offset - i, bio_blocks); - if (f2fs_submit_page_bio(sbi, page, addr, READ_SYNC)) - goto out; + /* read ahead node pages */ + err = ra_sum_pages(sbi, &page_list, addr, nrpages); + if (err) + return err; - lock_page(page); - rn = F2FS_NODE(page); - sum_entry->nid = rn->footer.nid; - sum_entry->version = 0; - sum_entry->ofs_in_node = 0; - addr++; + list_for_each_entry_safe(page, tmp, &page_list, lru) { + + lock_page(page); + if(PageUptodate(page)) { + rn = F2FS_NODE(page); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + } else { + err = -EIO; + } + + list_del(&page->lru); + unlock_page(page); + __free_pages(page, 0); + } } - unlock_page(page); -out: - __free_pages(page, 0); - return 0; + return err; } static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) -- cgit v1.2.3-70-g09d2 From c524723ebf5794083819c25320802452fe06da18 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Tue, 3 Dec 2013 20:11:46 +0800 Subject: f2fs: remove debufs dir if debugfs_create_file() failed When debugfs_create_file() failed in f2fs_create_root_stats(), debugfs_root should be remove. 
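A small usage note, assuming debugfs is mounted at its conventional location: when f2fs_create_root_stats() in the diff below succeeds, the statistics file it creates can be read from user space with

    # cat /sys/kernel/debug/f2fs/status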
Signed-off-by: Younger Liu Cc: Younger Liu Cc: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a84b0a8e685..b51fb3c2a8e 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -340,14 +340,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) void __init f2fs_create_root_stats(void) { + struct dentry *file; + debugfs_root = debugfs_create_dir("f2fs", NULL); - if (debugfs_root) - debugfs_create_file("status", S_IRUGO, debugfs_root, - NULL, &stat_fops); + if (!debugfs_root) + goto bail; + + file = debugfs_create_file("status", S_IRUGO, debugfs_root, + NULL, &stat_fops); + if (!file) + goto free_debugfs_dir; + + return; + +free_debugfs_dir: + debugfs_remove(debugfs_root); + +bail: + debugfs_root = NULL; + return; } void f2fs_destroy_root_stats(void) { + if (!debugfs_root) + return; + debugfs_remove_recursive(debugfs_root); debugfs_root = NULL; } -- cgit v1.2.3-70-g09d2 From 40e1ebe97d58b141332e895facf9e69b8b430ae1 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Tue, 3 Dec 2013 21:09:29 +0800 Subject: f2fs: replace the debugfs_root with f2fs_debugfs_root This minor change for the naming conventions of debugfs_root to avoid any possible conflicts to the other filesystem. Signed-off-by: Younger Liu Cc: Younger Liu Cc: Jaegeuk Kim [Jaegeuk Kim: change the patch name] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b51fb3c2a8e..61adbcbe9b8 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -24,7 +24,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *debugfs_root; +static struct dentry *f2fs_debugfs_root; static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) @@ -342,11 +342,11 @@ void __init f2fs_create_root_stats(void) { struct dentry *file; - debugfs_root = debugfs_create_dir("f2fs", NULL); - if (!debugfs_root) + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + if (!f2fs_debugfs_root) goto bail; - file = debugfs_create_file("status", S_IRUGO, debugfs_root, + file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) goto free_debugfs_dir; @@ -354,18 +354,18 @@ void __init f2fs_create_root_stats(void) return; free_debugfs_dir: - debugfs_remove(debugfs_root); + debugfs_remove(f2fs_debugfs_root); bail: - debugfs_root = NULL; + f2fs_debugfs_root = NULL; return; } void f2fs_destroy_root_stats(void) { - if (!debugfs_root) + if (!f2fs_debugfs_root) return; - debugfs_remove_recursive(debugfs_root); - debugfs_root = NULL; + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; } -- cgit v1.2.3-70-g09d2 From a0acdfe05a954363861a65eb537573ab417cb7ed Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Dec 2013 09:54:00 +0800 Subject: f2fs: use inner macro GFP_F2FS_ZERO for simplification Use inner macro GFP_F2FS_ZERO to instead of GFP_NOFS | __GFP_ZERO for simplification of code. 
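For reference, the GFP_F2FS_ZERO macro itself is not part of this diff; given the description above it is expected to be defined in fs/f2fs/f2fs.h roughly as

    #define GFP_F2FS_ZERO	(GFP_NOFS | __GFP_ZERO)

so the substitution is purely cosmetic and leaves the allocation flags unchanged.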
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- fs/f2fs/recovery.c | 2 +- fs/f2fs/super.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0855168af7d..099f06f84e2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1577,7 +1577,7 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, for (; page_idx < start + nrpages; page_idx++) { /* alloc temporal page for read node summary info*/ - page = alloc_page(GFP_NOFS | __GFP_ZERO); + page = alloc_page(GFP_F2FS_ZERO); if (!page) { struct page *tmp; + list_for_each_entry_safe(page, tmp, pages, lru) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index c209b865292..7dda1f28f6c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -377,7 +377,7 @@ static int recover_data(struct f2fs_sb_info *sbi, blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); + page = alloc_page(GFP_F2FS_ZERO); if (!page) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dd55074a304..22b07c33662 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -332,7 +332,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); + fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; -- cgit v1.2.3-70-g09d2 From b9987a277f1ec9dba203d04c3a20d967c01a1fba Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Dec 2013 09:54:56 +0800 Subject: f2fs: avoid unneeded page release for correct _count of page In find_fsync_dnodes() and recover_data(), our flow is like this: ->f2fs_submit_page_bio() -> f2fs_put_page() -> page_cache_release() ---- page->_count drops to zero. ->__free_pages() -> put_page_testzero() ---- page->_count would be decremented again. We will get a segmentation fault in put_page_testzero() when CONFIG_DEBUG_VM is on, or hand MM a bad page with a wrong _count value. So let's avoid the extra page release. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 7dda1f28f6c..d0754657587 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -145,7 +145,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) - goto out; + return err; lock_page(page); @@ -191,9 +191,10 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } + unlock_page(page); -out: __free_pages(page, 0); + return err; } @@ -388,7 +389,7 @@ static int recover_data(struct f2fs_sb_info *sbi, err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) - goto out; + return err; lock_page(page); @@ -412,8 +413,8 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } + unlock_page(page); -out: __free_pages(page, 0); if (!err) -- cgit v1.2.3-70-g09d2 From cfb271d485d0ec31eb92b51f4fbe54bf6542e8e6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Dec 2013 17:15:22 +0800 Subject: f2fs: add unlikely() macro for compiler optimization Some of our branch conditions will rarely be true, so we can mark them with 'unlikely' to let the compiler optimize the code; this drops unneeded 'jump' assembly code and improves performance.
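For reference, unlikely() is the conventional kernel wrapper around GCC's __builtin_expect() (definition assumed from include/linux/compiler.h; this patch only annotates call sites):

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

/* typical use: an error path that is almost never taken */
if (unlikely(nid >= nm_i->max_nid))
	return -EINVAL;

The hint lets the compiler lay out the common case as straight-line code and move the cold branch out of the hot path.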
change log: o add *unlikely* as many as possible across the whole source files at once suggested by Jaegeuk Kim. Suggested-by: Jaegeuk Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 26 ++++++++++++++++---------- fs/f2fs/data.c | 4 ++-- fs/f2fs/dir.c | 4 ++-- fs/f2fs/f2fs.h | 8 ++++---- fs/f2fs/node.c | 16 ++++++++-------- fs/f2fs/segment.h | 2 +- 6 files changed, 33 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 38f4a224508..6b2106685b7 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -82,13 +82,12 @@ static int f2fs_write_meta_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); /* Should not write any meta pages, if any IO error was occurred */ - if (wbc->for_reclaim || sbi->por_doing || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { - dec_page_count(sbi, F2FS_DIRTY_META); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } + if (unlikely(sbi->por_doing || + is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto redirty_out; + + if (wbc->for_reclaim) + goto redirty_out; wait_on_page_writeback(page); @@ -96,6 +95,12 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; + +redirty_out: + dec_page_count(sbi, F2FS_DIRTY_META); + wbc->pages_skipped++; + set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; } static int f2fs_write_meta_pages(struct address_space *mapping, @@ -137,7 +142,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) + if (unlikely(nr_pages == 0)) break; for (i = 0; i < nr_pages; i++) { @@ -150,7 +155,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, unlock_page(page); break; } - if (nwritten++ >= nr_to_write) + nwritten++; + if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); @@ -200,7 +206,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; mutex_lock(&sbi->orphan_inode_mutex); - if (sbi->n_orphans >= max_orphans) + if (unlikely(sbi->n_orphans >= max_orphans)) err = -ENOSPC; else sbi->n_orphans++; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4e2fc09f0e4..2ce5a9ef508 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -251,7 +251,7 @@ int reserve_new_block(struct dnode_of_data *dn) if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) return -EPERM; - if (!inc_valid_block_count(sbi, dn->inode, 1)) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); @@ -711,7 +711,7 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (sbi->por_doing) { + if (unlikely(sbi->por_doing)) { err = AOP_WRITEPAGE_ACTIVATE; goto redirty_out; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 594fc1bb64e..0cc26ba07c3 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -190,7 +190,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; - if (namelen > F2FS_NAME_LEN) + if (unlikely(namelen > F2FS_NAME_LEN)) return NULL; if (npages == 0) @@ -461,7 +461,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in } start: - if (current_depth 
== MAX_DIR_HASH_DEPTH) + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 10eca022e1e..dca18b3bcc6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -574,7 +574,7 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { WARN_ON((nid >= NM_I(sbi)->max_nid)); - if (nid >= NM_I(sbi)->max_nid) + if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; } @@ -600,7 +600,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + (block_t)count; - if (valid_block_count > sbi->user_block_count) { + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } @@ -719,13 +719,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (valid_block_count > sbi->user_block_count) { + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } valid_node_count = sbi->total_valid_node_count + 1; - if (valid_node_count > sbi->total_node_count) { + if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); return false; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 099f06f84e2..2e41636be47 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -94,7 +94,7 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) int i; for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (nid >= nm_i->max_nid) + if (unlikely(nid >= nm_i->max_nid)) nid = 0; index = current_nat_addr(sbi, nid); @@ -1160,7 +1160,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) struct page *page = pvec.pages[i]; /* until radix tree lookup accepts end_index */ - if (page->index > end) + if (unlikely(page->index > end)) continue; if (ino && ino_of_node(page) == ino) { @@ -1190,7 +1190,7 @@ static int f2fs_write_node_page(struct page *page, block_t new_addr; struct node_info ni; - if (sbi->por_doing) + if (unlikely(sbi->por_doing)) goto redirty_out; wait_on_page_writeback(page); @@ -1326,7 +1326,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) return -1; /* 0 nid should not be used */ - if (nid == 0) + if (unlikely(nid == 0)) return 0; if (build) { @@ -1379,7 +1379,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i, for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - if (start_nid >= nm_i->max_nid) + if (unlikely(start_nid >= nm_i->max_nid)) break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); @@ -1413,7 +1413,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - if (nid >= nm_i->max_nid) + if (unlikely(nid >= nm_i->max_nid)) nid = 0; if (i++ == FREE_NID_PAGES) @@ -1447,7 +1447,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; struct list_head *this; retry: - if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) + if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) return false; spin_lock(&nm_i->free_nid_list_lock); @@ -1557,7 +1557,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) new_ni = old_ni; new_ni.ino = ino; - if (!inc_valid_node_count(sbi, NULL)) + if 
(unlikely(!inc_valid_node_count(sbi, NULL))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR); inc_valid_inode_count(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 26812fc0fa1..ea563760f4b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -457,7 +457,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - if (sbi->por_doing) + if (unlikely(sbi->por_doing)) return false; return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + -- cgit v1.2.3-70-g09d2 From 6bacf52fb58aeb3e89d9a62970b85a5570aa8ace Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 Dec 2013 15:00:58 +0900 Subject: f2fs: add unlikely() macro for compiler more aggressively This patch adds unlikely() macro into the most of codes. The basic rule is to add that when: - checking unusual errors, - checking page mappings, - and the other unlikely conditions. Change log from v1: - Don't add unlikely for the NULL test and error test: advised by Andi Kleen. Cc: Chao Yu Cc: Andi Kleen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 +++++----- fs/f2fs/data.c | 29 ++++++++++++++--------------- fs/f2fs/file.c | 9 ++++----- fs/f2fs/gc.c | 3 +-- fs/f2fs/node.c | 28 ++++++++++++++-------------- fs/f2fs/recovery.c | 2 +- fs/f2fs/super.c | 14 +++++++------- fs/f2fs/xattr.c | 2 +- 8 files changed, 47 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6b2106685b7..cf505eb843e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -66,7 +66,7 @@ repeat: goto repeat; lock_page(page); - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -473,7 +473,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) list_for_each(this, head) { struct dir_inode_entry *entry; entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) + if (unlikely(entry->inode == inode)) return -EEXIST; } list_add_tail(&new->list, head); @@ -783,7 +783,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { clear_prefree_segments(sbi); F2FS_RESET_SB_DIRT(sbi); } @@ -840,11 +840,11 @@ int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", sizeof(struct orphan_inode_entry), NULL); - if (unlikely(!orphan_entry_slab)) + if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", sizeof(struct dir_inode_entry), NULL); - if (unlikely(!inode_entry_slab)) { + if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2ce5a9ef508..5607393198d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -49,11 +49,11 @@ static void f2fs_read_end_io(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (uptodate) { - SetPageUptodate(page); - } else { + if (unlikely(!uptodate)) { ClearPageUptodate(page); SetPageError(page); + } else { + SetPageUptodate(page); } unlock_page(page); } while (bvec >= bio->bi_io_vec); @@ -73,7 +73,7 @@ static void f2fs_write_end_io(struct bio 
*bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (!uptodate) { + if (unlikely(!uptodate)) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); @@ -249,7 +249,7 @@ int reserve_new_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; @@ -424,7 +424,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return ERR_PTR(-ENOENT); /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (dn.data_blkaddr == NEW_ADDR) + if (unlikely(dn.data_blkaddr == NEW_ADDR)) return ERR_PTR(-EINVAL); page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -443,7 +443,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) if (sync) { wait_on_page_locked(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 0); return ERR_PTR(-EIO); } @@ -477,7 +477,7 @@ repeat: } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) { + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); } @@ -502,11 +502,11 @@ repeat: return ERR_PTR(err); lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -534,7 +534,6 @@ struct page *get_new_data_page(struct inode *inode, err = f2fs_reserve_block(&dn, index); if (err) return ERR_PTR(err); - repeat: page = grab_cache_page(mapping, index); if (!page) @@ -552,11 +551,11 @@ repeat: if (err) return ERR_PTR(err); lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -841,11 +840,11 @@ repeat: if (err) return err; lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return -EIO; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2b47adcd852..5accc964368 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -50,9 +50,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, file_update_time(vma->vm_file); lock_page(page); - if (page->mapping != inode->i_mapping || + if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || - !PageUptodate(page)) { + !PageUptodate(page))) { unlock_page(page); err = -EFAULT; goto out; @@ -120,7 +120,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) .for_reclaim = 0, }; - if (f2fs_readonly(inode->i_sb)) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); @@ -241,7 +241,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) return; lock_page(page); - if (page->mapping != inode->i_mapping) { + if (unlikely(page->mapping != inode->i_mapping)) { f2fs_put_page(page, 1); return; } @@ -516,7 +516,6 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (ret) break; - if (pg_start == 
pg_end) new_size = offset + len; else if (index == pg_start && off_start) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2886aef35d5..29ceb9d71c8 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -119,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi) kfree(gc_th); sbi->gc_thread = NULL; } - out: return err; } @@ -695,7 +694,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&ilist); gc_more: - if (!(sbi->sb->s_flags & MS_ACTIVE)) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2e41636be47..6c6ef772cf0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -750,7 +750,7 @@ skip_partial: if (offset[1] == 0 && rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (page->mapping != node_mapping) { + if (unlikely(page->mapping != node_mapping)) { f2fs_put_page(page, 1); goto restart; } @@ -841,14 +841,14 @@ struct page *new_node_page(struct dnode_of_data *dn, struct page *page; int err; - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); page = grab_cache_page(mapping, dn->nid); if (!page) return ERR_PTR(-ENOMEM); - if (!inc_valid_node_count(sbi, dn->inode)) { + if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { err = -ENOSPC; goto fail; } @@ -898,7 +898,7 @@ static int read_node_page(struct page *page, int rw) get_node_info(sbi, page->index, &ni); - if (ni.blk_addr == NULL_ADDR) { + if (unlikely(ni.blk_addr == NULL_ADDR)) { f2fs_put_page(page, 1); return -ENOENT; } @@ -953,11 +953,11 @@ repeat: goto got_it; lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -1010,12 +1010,12 @@ repeat: blk_finish_plug(&plug); lock_page(page); - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } page_hit: - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -1173,9 +1173,9 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) cond_resched(); } - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + if (unlikely(test_and_clear_bit(AS_ENOSPC, &mapping->flags))) ret2 = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) + if (unlikely(test_and_clear_bit(AS_EIO, &mapping->flags))) ret2 = -EIO; if (!ret) ret = ret2; @@ -1202,7 +1202,7 @@ static int f2fs_write_node_page(struct page *page, get_node_info(sbi, nid, &ni); /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) { + if (unlikely(ni.blk_addr == NULL_ADDR)) { dec_page_count(sbi, F2FS_DIRTY_NODES); unlock_page(page); return 0; @@ -1627,14 +1627,14 @@ int restore_node_summary(struct f2fs_sb_info *sbi, list_for_each_entry_safe(page, tmp, &page_list, lru) { lock_page(page); - if(PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { + err = -EIO; + } else { rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; sum_entry++; - } else { - err = -EIO; } list_del(&page->lru); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d0754657587..a3f4542160c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -430,7 +430,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) fsync_entry_slab = 
f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry), NULL); - if (unlikely(!fsync_entry_slab)) + if (!fsync_entry_slab) return -ENOMEM; INIT_LIST_HEAD(&inode_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 22b07c33662..68df44afd14 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -626,7 +626,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (ino < F2FS_ROOT_INO(sbi)) + if (unlikely(ino < F2FS_ROOT_INO(sbi))) return ERR_PTR(-ESTALE); /* @@ -637,7 +637,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { + if (unlikely(generation && inode->i_generation != generation)) { /* we didn't find the right inode.. */ iput(inode); return ERR_PTR(-ESTALE); @@ -740,10 +740,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); - if (fsmeta >= total) + if (unlikely(fsmeta >= total)) return 1; - if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; } @@ -808,7 +808,7 @@ retry: brelse(*raw_super_buf); f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " "in %dth superblock", block + 1); - if(block == 0) { + if (block == 0) { block++; goto retry; } else { @@ -834,7 +834,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; /* set a block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } @@ -1066,7 +1066,7 @@ static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), NULL); - if (f2fs_inode_cachep == NULL) + if (!f2fs_inode_cachep) return -ENOMEM; return 0; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aa7a3f139fe..b0fb8a27f3d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -522,7 +522,7 @@ static int __f2fs_setxattr(struct inode *inode, int name_index, if (found) free = free + ENTRY_SIZE(here); - if (free < newsize) { + if (unlikely(free < newsize)) { error = -ENOSPC; goto exit; } -- cgit v1.2.3-70-g09d2 From 63a0b7cb33d85aeb0df39b984c08e234db4925d1 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Mon, 9 Dec 2013 16:09:00 +0800 Subject: f2fs: merge pages with the same sync_mode flag Previously, f2fs submitted most write requests using WRITE_SYNC, but f2fs_write_data_pages submitted the last write requests with whatever sync_mode flag the caller passed. This causes a performance problem, since contiguous pages with different sync flags can't be merged by the cfq IO scheduler (thanks to Yu Chao for pointing it out), and synchronous requests often take more time. This patch makes the following modifications to DATA writebacks: 1. every page is written back using the sync mode the caller passes. 2. only pages with the same sync mode can be merged into one bio request. These changes are restricted to DATA pages. Other types of writebacks remain synchronous. In my test with tiotest, f2fs sequential write performance improves by about 7%-10%, and this patch has no obvious impact on other performance tests.
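The heart of the change is the merge test in f2fs_submit_page_mbio(): a page may join the pending bio only if it is contiguous with it and carries the same rw flags; otherwise the pending bio is flushed first with its own flags (condensed from the data.c hunk below):

/* condensed from the hunk below */
if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
		io->rw_flag != rw))
	/* gap or different sync mode: submit what has been merged so far */
	__submit_merged_bio(sbi, io, type, false, io->rw_flag);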
Signed-off-by: Fan Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++++++++-------- fs/f2fs/f2fs.h | 8 +++++--- fs/f2fs/gc.c | 6 +++++- fs/f2fs/segment.c | 27 +++++++++++++++++---------- 4 files changed, 37 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5607393198d..fb5e5c2627e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -194,8 +194,9 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, if (!is_read_io(rw)) inc_page_count(sbi, F2FS_WRITEBACK); - if (io->bio && io->last_block_in_bio != blk_addr - 1) - __submit_merged_bio(sbi, io, type, true, rw); + if (io->bio && (io->last_block_in_bio != blk_addr - 1 || + io->rw_flag != rw)) + __submit_merged_bio(sbi, io, type, false, io->rw_flag); alloc_new: if (io->bio == NULL) { bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); @@ -203,6 +204,7 @@ alloc_new: io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); io->bio->bi_end_io = is_read_io(rw) ? f2fs_read_end_io : f2fs_write_end_io; + io->rw_flag = rw; /* * The end_io will be assigned at the sumbission phase. * Until then, let bio_add_page() merge consecutive IOs as much @@ -212,7 +214,7 @@ alloc_new: if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - __submit_merged_bio(sbi, io, type, true, rw); + __submit_merged_bio(sbi, io, type, false, rw); goto alloc_new; } @@ -641,7 +643,7 @@ static int f2fs_read_data_pages(struct file *file, return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); } -int do_write_data_page(struct page *page) +int do_write_data_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; block_t old_blk_addr, new_blk_addr; @@ -669,10 +671,10 @@ int do_write_data_page(struct page *page) !is_cold_data(page) && need_inplace_update(inode))) { rewrite_data_page(F2FS_SB(inode->i_sb), page, - old_blk_addr); + old_blk_addr, wbc); } else { write_data_page(inode, page, &dn, - old_blk_addr, &new_blk_addr); + old_blk_addr, &new_blk_addr, wbc); update_extent_cache(new_blk_addr, &dn); } out_writepage: @@ -719,10 +721,10 @@ write: if (S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); - err = do_write_data_page(page); + err = do_write_data_page(page, wbc); } else { f2fs_lock_op(sbi); - err = do_write_data_page(page); + err = do_write_data_page(page, wbc); f2fs_unlock_op(sbi); need_balance_fs = true; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dca18b3bcc6..b1df239ba42 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -368,6 +368,7 @@ enum page_type { struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ + int rw_flag; /* rw flag for all pages */ struct mutex io_mutex; /* mutex for bio */ }; @@ -1098,8 +1099,9 @@ void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, block_t, block_t *); void write_data_page(struct inode *, struct page *, struct dnode_of_data*, - block_t, block_t *); -void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); + block_t, block_t *, struct writeback_control *); +void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t, + struct writeback_control *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void rewrite_node_page(struct f2fs_sb_info *, struct page *, @@ -1150,7 +1152,7 @@ void update_extent_cache(block_t, struct dnode_of_data *); 
struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct page *); +int do_write_data_page(struct page *, struct writeback_control *); /* * gc.c diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 29ceb9d71c8..c68fba5ffad 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -520,6 +520,10 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { + struct writeback_control wbc = { + .sync_mode = 1, + }; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -536,7 +540,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) inode_dec_dirty_dents(inode); } set_cold_data(page); - do_write_data_page(page); + do_write_data_page(page, &wbc); clear_cold_data(page); } out: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ca9adf5914c..e5dc4111486 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -856,12 +856,13 @@ static int __get_segment_type(struct page *page, enum page_type p_type) static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, enum page_type p_type) + struct f2fs_summary *sum, enum page_type p_type, + struct writeback_control *wbc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int old_cursegno; - int type; + int type, rw = WRITE; type = __get_segment_type(page, p_type); curseg = CURSEG_I(sbi, type); @@ -900,7 +901,9 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(sbi, page, *new_blkaddr, p_type, WRITE); + if (wbc->sync_mode == WB_SYNC_ALL) + rw |= WRITE_SYNC; + f2fs_submit_page_mbio(sbi, page, *new_blkaddr, p_type, rw); mutex_unlock(&curseg->curseg_mutex); } @@ -915,13 +918,16 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page, unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) { struct f2fs_summary sum; + struct writeback_control wbc = { + .sync_mode = 1, + }; set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); + do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE, &wbc); } void write_data_page(struct inode *inode, struct page *page, struct dnode_of_data *dn, block_t old_blkaddr, - block_t *new_blkaddr) + block_t *new_blkaddr, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_summary sum; @@ -932,13 +938,14 @@ void write_data_page(struct inode *inode, struct page *page, set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(sbi, page, old_blkaddr, - new_blkaddr, &sum, DATA); + new_blkaddr, &sum, DATA, wbc); } void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blk_addr) + block_t old_blk_addr, struct writeback_control *wbc) { - f2fs_submit_page_mbio(sbi, page, old_blk_addr, DATA, WRITE); + int rw = wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE; + f2fs_submit_page_mbio(sbi, page, old_blk_addr, DATA, rw); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -1025,7 +1032,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, /* rewrite node page */ set_page_writeback(page); - f2fs_submit_page_mbio(sbi, page, new_blkaddr, NODE, WRITE); + f2fs_submit_page_mbio(sbi, page, new_blkaddr, NODE, WRITE_SYNC); f2fs_submit_merged_bio(sbi, NODE, true, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); @@ -1593,7 +1600,7 @@ repeat: continue; } - f2fs_submit_page_mbio(sbi, page, blk_addr, META, READ); + f2fs_submit_page_mbio(sbi, page, blk_addr, META, READ_SYNC); mark_page_accessed(page); f2fs_put_page(page, 0); -- cgit v1.2.3-70-g09d2 From 458e6197c37de53f7be0a837644daabb900c3036 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Dec 2013 13:54:01 +0900 Subject: f2fs: refactor bio->rw handling This patch introduces f2fs_io_info to mitigate the complex parameter list. struct f2fs_io_info { enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS */ int rw_flag; /* contains REQ_META/REQ_PRIO */ } 1. f2fs_write_data_pages - DATA - WRITE_SYNC is set when wbc->WB_SYNC_ALL. 2. sync_node_pages - NODE - WRITE_SYNC all the time 3. sync_meta_pages - META - WRITE_SYNC all the time - REQ_META | REQ_PRIO all the time ** f2fs_submit_merged_bio() handles META_FLUSH. 4. ra_nat_pages, ra_sit_pages, ra_sum_pages - META - READ_SYNC Cc: Fan Li Cc: Changman Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 ++++--- fs/f2fs/data.c | 85 ++++++++++++++++++++++++++-------------------------- fs/f2fs/f2fs.h | 22 +++++++++----- fs/f2fs/gc.c | 10 ++++--- fs/f2fs/node.c | 22 ++++++++++---- fs/f2fs/segment.c | 70 ++++++++++++++++++++++++++----------------- fs/f2fs/super.c | 7 ++++- 7 files changed, 132 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cf505eb843e..f8c074961e0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -164,8 +164,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, } if (nwritten) - f2fs_submit_merged_bio(sbi, type, nr_to_write == LONG_MAX, - WRITE); + f2fs_submit_merged_bio(sbi, type, WRITE); return nwritten; } @@ -598,7 +597,7 @@ retry: * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. 
*/ - f2fs_submit_merged_bio(sbi, DATA, true, WRITE); + f2fs_submit_merged_bio(sbi, DATA, WRITE); } goto retry; } @@ -804,9 +803,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); - f2fs_submit_merged_bio(sbi, DATA, true, WRITE); - f2fs_submit_merged_bio(sbi, NODE, true, WRITE); - f2fs_submit_merged_bio(sbi, META, true, WRITE); + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); /* * update checkpoint pack index diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fb5e5c2627e..ebc91778e81 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -93,37 +93,28 @@ static void f2fs_write_end_io(struct bio *bio, int err) bio_put(bio); } -static void __submit_merged_bio(struct f2fs_sb_info *sbi, - struct f2fs_bio_info *io, - enum page_type type, bool sync, int rw) +static void __submit_merged_bio(struct f2fs_bio_info *io) { - enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_io_info *fio = &io->fio; + int rw; if (!io->bio) return; - if (btype == META) - rw |= REQ_META; + rw = fio->rw | fio->rw_flag; if (is_read_io(rw)) { - if (sync) - rw |= READ_SYNC; submit_bio(rw, io->bio); - trace_f2fs_submit_read_bio(sbi->sb, rw, type, io->bio); + trace_f2fs_submit_read_bio(io->sbi->sb, rw, fio->type, io->bio); io->bio = NULL; return; } - if (sync) - rw |= WRITE_SYNC; - if (type >= META_FLUSH) - rw |= WRITE_FLUSH_FUA; - /* * META_FLUSH is only from the checkpoint procedure, and we should wait * this metadata bio for FS consistency. */ - if (type == META_FLUSH) { + if (fio->type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); io->bio->bi_private = &wait; submit_bio(rw, io->bio); @@ -131,12 +122,12 @@ static void __submit_merged_bio(struct f2fs_sb_info *sbi, } else { submit_bio(rw, io->bio); } - trace_f2fs_submit_write_bio(sbi->sb, rw, btype, io->bio); + trace_f2fs_submit_write_bio(io->sbi->sb, rw, fio->type, io->bio); io->bio = NULL; } void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - enum page_type type, bool sync, int rw) + enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -144,7 +135,13 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; mutex_lock(&io->io_mutex); - __submit_merged_bio(sbi, io, type, sync, rw); + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->fio.rw = WRITE_FLUSH_FUA; + } + __submit_merged_bio(io); mutex_unlock(&io->io_mutex); } @@ -178,33 +175,33 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, } void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, enum page_type type, int rw) + block_t blk_addr, struct f2fs_io_info *fio) { - enum page_type btype = PAGE_TYPE_OF_BIO(type); + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct block_device *bdev = sbi->sb->s_bdev; struct f2fs_bio_info *io; int bio_blocks; - io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + io = is_read_io(fio->rw) ? 
&sbi->read_io : &sbi->write_io[btype]; verify_block_addr(sbi, blk_addr); mutex_lock(&io->io_mutex); - if (!is_read_io(rw)) + if (!is_read_io(fio->rw)) inc_page_count(sbi, F2FS_WRITEBACK); if (io->bio && (io->last_block_in_bio != blk_addr - 1 || - io->rw_flag != rw)) - __submit_merged_bio(sbi, io, type, false, io->rw_flag); + io->fio.rw != fio->rw)) + __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); io->bio = __bio_alloc(bdev, bio_blocks); io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - io->bio->bi_end_io = is_read_io(rw) ? f2fs_read_end_io : + io->bio->bi_end_io = is_read_io(fio->rw) ? f2fs_read_end_io : f2fs_write_end_io; - io->rw_flag = rw; + io->fio = *fio; /* * The end_io will be assigned at the sumbission phase. * Until then, let bio_add_page() merge consecutive IOs as much @@ -214,14 +211,14 @@ alloc_new: if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - __submit_merged_bio(sbi, io, type, false, rw); + __submit_merged_bio(io); goto alloc_new; } io->last_block_in_bio = blk_addr; mutex_unlock(&io->io_mutex); - trace_f2fs_submit_page_mbio(page, rw, type, blk_addr); + trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); } /* @@ -643,10 +640,10 @@ static int f2fs_read_data_pages(struct file *file, return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); } -int do_write_data_page(struct page *page, struct writeback_control *wbc) +int do_write_data_page(struct page *page, struct f2fs_io_info *fio) { struct inode *inode = page->mapping->host; - block_t old_blk_addr, new_blk_addr; + block_t old_blkaddr, new_blkaddr; struct dnode_of_data dn; int err = 0; @@ -655,10 +652,10 @@ int do_write_data_page(struct page *page, struct writeback_control *wbc) if (err) return err; - old_blk_addr = dn.data_blkaddr; + old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (old_blk_addr == NULL_ADDR) + if (old_blkaddr == NULL_ADDR) goto out_writepage; set_page_writeback(page); @@ -667,15 +664,13 @@ int do_write_data_page(struct page *page, struct writeback_control *wbc) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(old_blk_addr != NEW_ADDR && + if (unlikely(old_blkaddr != NEW_ADDR && !is_cold_data(page) && need_inplace_update(inode))) { - rewrite_data_page(F2FS_SB(inode->i_sb), page, - old_blk_addr, wbc); + rewrite_data_page(page, old_blkaddr, fio); } else { - write_data_page(inode, page, &dn, - old_blk_addr, &new_blk_addr, wbc); - update_extent_cache(new_blk_addr, &dn); + write_data_page(page, &dn, &new_blkaddr, fio); + update_extent_cache(new_blkaddr, &dn); } out_writepage: f2fs_put_dnode(&dn); @@ -693,6 +688,11 @@ static int f2fs_write_data_page(struct page *page, unsigned offset; bool need_balance_fs = false; int err = 0; + struct f2fs_io_info fio = { + .type = DATA, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC: WRITE, + .rw_flag = 0, + }; if (page->index < end_index) goto write; @@ -721,10 +721,10 @@ write: if (S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); - err = do_write_data_page(page, wbc); + err = do_write_data_page(page, &fio); } else { f2fs_lock_op(sbi); - err = do_write_data_page(page, wbc); + err = do_write_data_page(page, &fio); f2fs_unlock_op(sbi); need_balance_fs = true; } @@ -734,7 +734,7 @@ write: goto redirty_out; if (wbc->for_reclaim) - f2fs_submit_merged_bio(sbi, DATA, true, WRITE); + f2fs_submit_merged_bio(sbi, DATA, WRITE); clear_cold_data(page); out: @@ -786,7 +786,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_merged_bio(sbi, DATA, wbc->sync_mode == WB_SYNC_ALL, WRITE); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_dir_inode(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b1df239ba42..022ce324b16 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -364,11 +364,18 @@ enum page_type { META_FLUSH, }; +struct f2fs_io_info { + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + int rw; /* contains R/RS/W/WS */ + int rw_flag; /* contains REQ_META/REQ_PRIO */ +}; + #define is_read_io(rw) (((rw) & 1) == READ) struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ - int rw_flag; /* rw flag for all pages */ + struct f2fs_io_info fio; /* store buffered io info. */ struct mutex io_mutex; /* mutex for bio */ }; @@ -1098,10 +1105,9 @@ struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, block_t, block_t *); -void write_data_page(struct inode *, struct page *, struct dnode_of_data*, - block_t, block_t *, struct writeback_control *); -void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t, - struct writeback_control *); +void write_data_page(struct page *, struct dnode_of_data *, block_t *, + struct f2fs_io_info *); +void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void rewrite_node_page(struct f2fs_sb_info *, struct page *, @@ -1142,17 +1148,17 @@ void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, bool, int); +void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int); void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t, - enum page_type, int); + struct f2fs_io_info *); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); void update_extent_cache(block_t, struct dnode_of_data *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct page *, struct writeback_control *); +int do_write_data_page(struct page *, struct f2fs_io_info *); /* * gc.c diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c68fba5ffad..69c18e39901 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -520,8 
+520,10 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { - struct writeback_control wbc = { - .sync_mode = 1, + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + .rw_flag = 0, }; if (gc_type == BG_GC) { @@ -540,7 +542,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) inode_dec_dirty_dents(inode); } set_cold_data(page); - do_write_data_page(page, &wbc); + do_write_data_page(page, &fio); clear_cold_data(page); } out: @@ -634,7 +636,7 @@ next_iput: goto next_step; if (gc_type == FG_GC) { - f2fs_submit_merged_bio(sbi, DATA, true, WRITE); + f2fs_submit_merged_bio(sbi, DATA, WRITE); /* * In the case of FG_GC, it'd be better to reclaim this victim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6c6ef772cf0..3565caf9700 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -92,6 +92,12 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) struct page *page; pgoff_t index; int i; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC, + .rw_flag = REQ_META | REQ_PRIO + }; + for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { if (unlikely(nid >= nm_i->max_nid)) @@ -106,11 +112,11 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) f2fs_put_page(page, 1); continue; } - f2fs_submit_page_mbio(sbi, page, index, META, READ); + f2fs_submit_page_mbio(sbi, page, index, &fio); mark_page_accessed(page); f2fs_put_page(page, 0); } - f2fs_submit_merged_bio(sbi, META, true, READ); + f2fs_submit_merged_bio(sbi, META, READ); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) @@ -1136,8 +1142,7 @@ continue_unlock: } if (wrote) - f2fs_submit_merged_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL, - WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); return nwritten; } @@ -1574,6 +1579,11 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, { struct page *page; int page_idx = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC, + .rw_flag = REQ_META | REQ_PRIO + }; for (; page_idx < start + nrpages; page_idx++) { /* alloc temporal page for read node summary info*/ @@ -1594,9 +1604,9 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, } list_for_each_entry(page, pages, lru) - f2fs_submit_page_mbio(sbi, page, page->index, META, READ); + f2fs_submit_page_mbio(sbi, page, page->index, &fio); - f2fs_submit_merged_bio(sbi, META, true, READ); + f2fs_submit_merged_bio(sbi, META, READ); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e5dc4111486..0b2e8ceec98 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -856,15 +856,14 @@ static int __get_segment_type(struct page *page, enum page_type p_type) static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, enum page_type p_type, - struct writeback_control *wbc) + struct f2fs_summary *sum, struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int old_cursegno; - int type, rw = WRITE; + int type; - type = __get_segment_type(page, p_type); + type = __get_segment_type(page, fio->type); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); @@ -897,55 +896,60 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 
mutex_unlock(&sit_i->sentry_lock); - if (p_type == NODE) + if (fio->type == NODE) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); /* writeout dirty page into bdev */ - if (wbc->sync_mode == WB_SYNC_ALL) - rw |= WRITE_SYNC; - f2fs_submit_page_mbio(sbi, page, *new_blkaddr, p_type, rw); + f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); mutex_unlock(&curseg->curseg_mutex); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { + struct f2fs_io_info fio = { + .type = META, + .rw = WRITE_SYNC, + .rw_flag = REQ_META | REQ_PRIO + }; + set_page_writeback(page); - f2fs_submit_page_mbio(sbi, page, page->index, META, WRITE); + f2fs_submit_page_mbio(sbi, page, page->index, &fio); } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) { struct f2fs_summary sum; - struct writeback_control wbc = { - .sync_mode = 1, + struct f2fs_io_info fio = { + .type = NODE, + .rw = WRITE_SYNC, + .rw_flag = 0 }; + set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE, &wbc); + do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, &fio); } -void write_data_page(struct inode *inode, struct page *page, - struct dnode_of_data *dn, block_t old_blkaddr, - block_t *new_blkaddr, struct writeback_control *wbc) +void write_data_page(struct page *page, struct dnode_of_data *dn, + block_t *new_blkaddr, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct f2fs_summary sum; struct node_info ni; - f2fs_bug_on(old_blkaddr == NULL_ADDR); + f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - do_write_page(sbi, page, old_blkaddr, - new_blkaddr, &sum, DATA, wbc); + do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); } -void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blk_addr, struct writeback_control *wbc) +void rewrite_data_page(struct page *page, block_t old_blkaddr, struct f2fs_io_info *fio) { - int rw = wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE; - f2fs_submit_page_mbio(sbi, page, old_blk_addr, DATA, rw); + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -1004,6 +1008,11 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, unsigned int segno, old_cursegno; block_t next_blkaddr = next_blkaddr_of_node(page); unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); + struct f2fs_io_info fio = { + .type = NODE, + .rw = WRITE_SYNC, + .rw_flag = 0 + }; curseg = CURSEG_I(sbi, type); @@ -1032,8 +1041,8 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, /* rewrite node page */ set_page_writeback(page); - f2fs_submit_page_mbio(sbi, page, new_blkaddr, NODE, WRITE_SYNC); - f2fs_submit_merged_bio(sbi, NODE, true, WRITE); + f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); + f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); locate_dirty_segment(sbi, old_cursegno); @@ -1048,7 +1057,7 @@ void f2fs_wait_on_page_writeback(struct page *page, { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); if (PageWriteback(page)) { - f2fs_submit_merged_bio(sbi, type, sync, WRITE); + f2fs_submit_merged_bio(sbi, type, WRITE); wait_on_page_writeback(page); } } @@ -1580,6 +1589,11 @@ static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) block_t blk_addr, prev_blk_addr = 0; int sit_blk_cnt = SIT_BLK_CNT(sbi); int blkno = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC, + .rw_flag = REQ_META | REQ_PRIO + }; for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { @@ -1600,13 +1614,13 @@ repeat: continue; } - f2fs_submit_page_mbio(sbi, page, blk_addr, META, READ_SYNC); + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); mark_page_accessed(page); f2fs_put_page(page, 0); } - f2fs_submit_merged_bio(sbi, META, true, READ); + f2fs_submit_merged_bio(sbi, META, READ); return blkno - start; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 68df44afd14..1674f2f9970 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -885,8 +885,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->stat_lock); mutex_init(&sbi->read_io.io_mutex); - for (i = 0; i < NR_PAGE_TYPE; i++) + sbi->read_io.sbi = sbi; + sbi->read_io.bio = NULL; + for (i = 0; i < NR_PAGE_TYPE; i++) { mutex_init(&sbi->write_io[i].io_mutex); + sbi->write_io[i].sbi = sbi; + sbi->write_io[i].bio = NULL; + } init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); -- cgit v1.2.3-70-g09d2 From 76130ccabcc39f0716691113f739f28a3088e253 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Dec 2013 14:29:39 +0900 Subject: f2fs: fix the location of tracepoint We need to get a trace before submit_bio, since its bi_sector is remapped during the submit_bio. 
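In other words, the tracepoint must observe bio->bi_sector while it still holds the sector f2fs assigned; the block layer (partition or device-mapper remapping, for example) may rewrite it inside submit_bio(). Condensed from the hunk below, the read path becomes:

trace_f2fs_submit_read_bio(io->sbi->sb, rw, fio->type, io->bio);
submit_bio(rw, io->bio);	/* bi_sector may be remapped from here on */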
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ebc91778e81..15956fa584d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -104,11 +104,12 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) rw = fio->rw | fio->rw_flag; if (is_read_io(rw)) { - submit_bio(rw, io->bio); trace_f2fs_submit_read_bio(io->sbi->sb, rw, fio->type, io->bio); + submit_bio(rw, io->bio); io->bio = NULL; return; } + trace_f2fs_submit_write_bio(io->sbi->sb, rw, fio->type, io->bio); /* * META_FLUSH is only from the checkpoint procedure, and we should wait @@ -122,7 +123,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) } else { submit_bio(rw, io->bio); } - trace_f2fs_submit_write_bio(io->sbi->sb, rw, fio->type, io->bio); io->bio = NULL; } -- cgit v1.2.3-70-g09d2 From 5dcd8a71505186a124640f9f7c5c81fb269cc476 Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Wed, 11 Dec 2013 14:32:13 +0900 Subject: f2fs: missing kmem_cache_destroy for discard_entry insmod of f2fs.ko fails after a prior insmod and rmmod. $ sudo insmod fs/f2fs/f2fs.ko insmod: error inserting 'fs/f2fs/f2fs.ko': -1 Cannot allocate memory -- dmesg -- kmem_cache_sanity_check (free_nid): Cache name already exists. Signed-off-by: Changman Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1674f2f9970..5c574fa1143 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1140,6 +1140,7 @@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); + destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); kset_unregister(f2fs_kset); -- cgit v1.2.3-70-g09d2 From 216fbd64437452d23db54ae845916facd7215caa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Nov 2013 13:13:42 +0900 Subject: f2fs: introduce sysfs entry to control in-place-update policy This patch introduces new sysfs entries for users to control the policy of in-place-updates, namely IPU, in f2fs. Sometimes f2fs suffers from performance degradation due to its out-of-place update policy that produces many additional node block writes. If the storage performance is very dependent on the amount of data written rather than on IO patterns, we'd better drop this out-of-place update policy. This patch suggests 5 policies and their triggering conditions as follows. [sysfs entry name = ipu_policy] 0: F2FS_IPU_FORCE all the time, 1: F2FS_IPU_SSR if SSR mode is activated, 2: F2FS_IPU_UTIL if FS utilization is over a threshold, 3: F2FS_IPU_SSR_UTIL if SSR mode is activated and FS utilization is over a threshold, 4: F2FS_IPU_DISABLE disable IPU. (=default option) [sysfs entry name = min_ipu_util] This parameter controls the threshold to trigger in-place-updates. The number indicates a percentage of filesystem utilization and is used by the F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies. For more details, see need_inplace_update() in segment.h.
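Summarized, the decision that need_inplace_update() makes (the full function is in the segment.h hunk below) boils down to:

/* condensed summary; directories never use IPU */
switch (SM_I(sbi)->ipu_policy) {
case F2FS_IPU_FORCE:	return true;
case F2FS_IPU_SSR:	return need_SSR(sbi);
case F2FS_IPU_UTIL:	return utilization(sbi) > SM_I(sbi)->min_ipu_util;
case F2FS_IPU_SSR_UTIL:	return need_SSR(sbi) &&
				utilization(sbi) > SM_I(sbi)->min_ipu_util;
default:		return false;	/* F2FS_IPU_DISABLE */
}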
Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 11 ++++++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 2 ++ fs/f2fs/segment.h | 44 ++++++++++++++++++++++++++++++++------ fs/f2fs/super.c | 4 ++++ 5 files changed, 58 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index a3fe811bbdb..1a94ac7fee8 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -171,6 +171,17 @@ Files in /sys/fs/f2fs/ conduct checkpoint to reclaim the prefree segments to free segments. By default, 100 segments, 200MB. + ipu_policy This parameter controls the policy of in-place + updates in f2fs. There are five policies: + 0: F2FS_IPU_FORCE, 1: F2FS_IPU_SSR, + 2: F2FS_IPU_UTIL, 3: F2FS_IPU_SSR_UTIL, + 4: F2FS_IPU_DISABLE. + + min_ipu_util This parameter controls the threshold to trigger + in-place-updates. The number indicates percentage + of the filesystem utilization, and used by + F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies. + ================================================================================ USAGE ================================================================================ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 022ce324b16..1b05a628670 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -325,6 +325,9 @@ struct f2fs_sm_info { struct list_head discard_list; /* 4KB discard list */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0b2e8ceec98..5b890ce74b1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1799,6 +1799,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; + sm_info->ipu_policy = F2FS_IPU_DISABLE; + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; INIT_LIST_HEAD(&sm_info->discard_list); sm_info->nr_discards = 0; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ea563760f4b..e9a10bdd7fe 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -476,19 +476,51 @@ static inline int utilization(struct f2fs_sb_info *sbi) /* * Sometimes f2fs may be better to drop out-of-place update policy. - * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write - * data in the original place likewise other traditional file systems. - * But, currently set 100 in percentage, which means it is disabled. - * See below need_inplace_update(). + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPUT_DISABLE - disable IPU. 
(=default option) */ -#define MIN_IPU_UTIL 100 +#define DEF_MIN_IPU_UTIL 70 + +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_DISABLE, +}; + static inline bool need_inplace_update(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + /* IPU can be done only for the user data */ if (S_ISDIR(inode->i_mode)) return false; - if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) + + switch (SM_I(sbi)->ipu_policy) { + case F2FS_IPU_FORCE: return true; + case F2FS_IPU_SSR: + if (need_SSR(sbi)) + return true; + break; + case F2FS_IPU_UTIL: + if (utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_SSR_UTIL: + if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_DISABLE: + break; + } return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5c574fa1143..f16da92a789 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -178,6 +178,8 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -187,6 +189,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), NULL, }; -- cgit v1.2.3-70-g09d2 From bfad7c2d40332be6a1d7a89660bceb0f6ea1d73a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 16 Dec 2013 19:04:05 +0900 Subject: f2fs: introduce a new direct_IO write path Previously, f2fs did not support high-performance direct IO: every write request was routed through the buffered write path, resulting in severe performance degradation due to memory operations like copy_from_user. This patch introduces a new direct IO path in which every write request is processed by the generic blockdev_direct_IO() with an enhanced get_block function. The get_data_block() in f2fs handles: 1. if the original data blocks are already allocated, then give them to blockdev. 2. otherwise, a. preallocate requested block addresses b. do not use extent cache for better performance c. give the block addresses to blockdev This policy implies that: - newly allocated data is written sequentially to the disk - updated data is written randomly to the disk. - f2fs guarantees consistency of its file metadata, not file data.
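For readers who have not written one, the sketch below shows the general shape of a get_block callback of the kind blockdev_direct_IO() consumes. It is a simplified illustration only: lookup_blkaddr() and preallocate_blkaddr() are hypothetical stand-ins for a filesystem's own mapping and allocation helpers, not the actual get_data_block() rewrite in the patch that follows.

static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	/* hypothetical helper: returns 0 when the block is a hole */
	sector_t blkaddr = lookup_blkaddr(inode, iblock);

	if (!blkaddr && !create)
		return 0;			/* read of a hole: leave bh unmapped */

	if (!blkaddr) {
		/* hypothetical helper: reserve a new on-disk block */
		blkaddr = preallocate_blkaddr(inode, iblock);
		if (!blkaddr)
			return -ENOSPC;
		set_buffer_new(bh_result);	/* tell DIO this block has no old data */
	}

	map_bh(bh_result, inode->i_sb, blkaddr);
	/* b_size reports how many contiguous bytes were mapped by this call */
	bh_result->b_size = 1 << inode->i_blkbits;
	return 0;
}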
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 152 ++++++++++++++++++++++++++++++++++++++++-------------- fs/f2fs/f2fs.h | 2 + fs/f2fs/segment.c | 23 ++++++--- 3 files changed, 129 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 15956fa584d..a0950bcbf56 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -570,74 +570,151 @@ repeat: return page; } +static int __allocate_data_block(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_summary sum; + block_t new_blkaddr; + struct node_info ni; + int type; + + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return -EPERM; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + return -ENOSPC; + + __set_data_blkaddr(dn, NEW_ADDR); + dn->data_blkaddr = NEW_ADDR; + + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + + type = CURSEG_WARM_DATA; + + allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type); + + /* direct IO doesn't use extent cache to maximize the performance */ + set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + update_extent_cache(new_blkaddr, dn); + clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + + dn->data_blkaddr = new_blkaddr; + return 0; +} + /* * This function should be used by the data read flow only where it * does not check the "create" flag that indicates block allocation. * The reason for this special functionality is to exploit VFS readahead * mechanism. */ -static int get_data_block_ro(struct inode *inode, sector_t iblock, +static int get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; unsigned maxblocks = bh_result->b_size >> blkbits; struct dnode_of_data dn; - pgoff_t pgofs; - int err; + int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; + pgoff_t pgofs, end_offset; + int err = 0, ofs = 1; + bool allocated = false; /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) { - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; - } + if (check_extent_cache(inode, pgofs, bh_result)) + goto out; + + if (create) + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); - if (err) { - trace_f2fs_get_data_block(inode, iblock, bh_result, err); - return (err == -ENOENT) ? 0 : err; + err = get_dnode_of_data(&dn, pgofs, mode); + if (err || dn.data_blkaddr == NEW_ADDR) { + if (err == -ENOENT) + err = 0; + goto unlock_out; } - /* It does not support data allocation */ - f2fs_bug_on(create); - - if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { - int i; - unsigned int end_offset; - + if (dn.data_blkaddr != NULL_ADDR) { + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else { + goto put_out; + } + + end_offset = IS_INODE(dn.node_page) ? 
+ ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + bh_result->b_size = (((size_t)1) << blkbits); + dn.ofs_in_node++; + pgofs++; + +get_next: + if (dn.ofs_in_node >= end_offset) { + if (allocated) + sync_inode_page(&dn); + allocated = false; + f2fs_put_dnode(&dn); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, mode); + if (err || dn.data_blkaddr == NEW_ADDR) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : - ADDRS_PER_BLOCK; - - clear_buffer_new(bh_result); + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + } + if (maxblocks > (bh_result->b_size >> blkbits)) { + block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR && create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + blkaddr = dn.data_blkaddr; + } /* Give more consecutive addresses for the read ahead */ - for (i = 0; i < end_offset - dn.ofs_in_node; i++) - if (((datablock_addr(dn.node_page, - dn.ofs_in_node + i)) - != (dn.data_blkaddr + i)) || maxblocks == i) - break; - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); - bh_result->b_size = (((size_t)i) << blkbits); + if (blkaddr == (bh_result->b_blocknr + ofs)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + bh_result->b_size += (((size_t)1) << blkbits); + goto get_next; + } } +sync_out: + if (allocated) + sync_inode_page(&dn); +put_out: f2fs_put_dnode(&dn); - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; +unlock_out: + if (create) + f2fs_unlock_op(sbi); +out: + trace_f2fs_get_data_block(inode, iblock, bh_result, err); + return err; } static int f2fs_read_data_page(struct file *file, struct page *page) { - return mpage_readpage(page, get_data_block_ro); + return mpage_readpage(page, get_data_block); } static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); + return mpage_readpages(mapping, pages, nr_pages, get_data_block); } int do_write_data_page(struct page *page, struct f2fs_io_info *fio) @@ -883,13 +960,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - - if (rw == WRITE) - return 0; - - /* Needs synchronization with the cleaner */ return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block_ro); + get_data_block); } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, @@ -928,7 +1000,7 @@ static int f2fs_set_data_page_dirty(struct page *page) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { - return generic_block_bmap(mapping, block, get_data_block_ro); + return generic_block_bmap(mapping, block, get_data_block); } const struct address_space_operations f2fs_dblock_aops = { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1b05a628670..8cbc5a6bf48 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1115,6 +1115,8 @@ void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void rewrite_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); +void allocate_data_block(struct f2fs_sb_info *, struct page *, + block_t, block_t *, struct f2fs_summary *, int); void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void write_data_summaries(struct f2fs_sb_info *, block_t); 
void write_node_summaries(struct f2fs_sb_info *, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5b890ce74b1..9f8bdd02e3a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -854,16 +854,14 @@ static int __get_segment_type(struct page *page, enum page_type p_type) return __get_segment_type_6(page, p_type); } -static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, struct f2fs_io_info *fio) +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int old_cursegno; - int type; - type = __get_segment_type(page, fio->type); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); @@ -896,13 +894,22 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); - if (fio->type == NODE) + if (page && IS_NODESEG(type)) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + mutex_unlock(&curseg->curseg_mutex); +} + +static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, struct f2fs_io_info *fio) +{ + int type = __get_segment_type(page, fio->type); + + allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type); + /* writeout dirty page into bdev */ f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); - - mutex_unlock(&curseg->curseg_mutex); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) -- cgit v1.2.3-70-g09d2 From 5459aa9770fe2e8c6a660a0985ea454c42cfd8c1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 17 Dec 2013 17:28:41 +0900 Subject: f2fs: write dirty meta pages collectively This patch enhances writing dirty meta pages collectively in background. During the file data writes, it'd better avoid to write small dirty meta pages frequently. So let's give a chance to collect a number of dirty meta pages for a while. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f8c074961e0..2fc3b6beaeb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -107,18 +107,19 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; + int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); long written; if (wbc->for_kupdate) return 0; - if (get_pages(sbi, F2FS_DIRTY_META) == 0) + /* collect a number of dirty meta pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) return 0; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); + written = sync_meta_pages(sbi, META, nrpages); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write -= written; return 0; -- cgit v1.2.3-70-g09d2 From 940a6d34b31b96f0748a4b688a551a0890b2b229 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 20 Dec 2013 17:39:59 +0800 Subject: f2fs: move all the bio initialization into __bio_alloc Move all the bio initialization into __bio_alloc, and some minor cleanups are also added. v3: Use 'bool' rather than 'int' as Kim suggested. 
v2: Use 'is_read' rather than 'rw' as Yu Chao suggested. Remove the needless initialization of bio->bi_private. Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 92 +++++++++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a0950bcbf56..154a4f93a54 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -24,20 +24,6 @@ #include "segment.h" #include -/* - * Low-level block read/write IO operations. - */ -static struct bio *__bio_alloc(struct block_device *bdev, int npages) -{ - struct bio *bio; - - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - bio->bi_bdev = bdev; - bio->bi_private = NULL; - return bio; -} - static void f2fs_read_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -93,6 +79,24 @@ static void f2fs_write_end_io(struct bio *bio, int err) bio_put(bio); } +/* + * Low-level block read/write IO operations. + */ +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + int npages, bool is_read) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + + bio->bi_bdev = sbi->sb->s_bdev; + bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + + return bio; +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -104,25 +108,26 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) rw = fio->rw | fio->rw_flag; if (is_read_io(rw)) { - trace_f2fs_submit_read_bio(io->sbi->sb, rw, fio->type, io->bio); + trace_f2fs_submit_read_bio(io->sbi->sb, rw, + fio->type, io->bio); submit_bio(rw, io->bio); - io->bio = NULL; - return; - } - trace_f2fs_submit_write_bio(io->sbi->sb, rw, fio->type, io->bio); - - /* - * META_FLUSH is only from the checkpoint procedure, and we should wait - * this metadata bio for FS consistency. - */ - if (fio->type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - io->bio->bi_private = &wait; - submit_bio(rw, io->bio); - wait_for_completion(&wait); } else { - submit_bio(rw, io->bio); + trace_f2fs_submit_write_bio(io->sbi->sb, rw, + fio->type, io->bio); + /* + * META_FLUSH is only from the checkpoint procedure, and we + * should wait this metadata bio for FS consistency. + */ + if (fio->type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + io->bio->bi_private = &wait; + submit_bio(rw, io->bio); + wait_for_completion(&wait); + } else { + submit_bio(rw, io->bio); + } } + io->bio = NULL; } @@ -152,17 +157,12 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, block_t blk_addr, int rw) { - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; trace_f2fs_submit_page_bio(page, blk_addr, rw); /* Allocate a new bio */ - bio = __bio_alloc(bdev, 1); - - /* Initialize the bio */ - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - bio->bi_end_io = is_read_io(rw) ? 
f2fs_read_end_io : f2fs_write_end_io; + bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { bio_put(bio); @@ -178,17 +178,16 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, block_t blk_addr, struct f2fs_io_info *fio) { enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct block_device *bdev = sbi->sb->s_bdev; struct f2fs_bio_info *io; - int bio_blocks; + bool is_read = is_read_io(fio->rw); - io = is_read_io(fio->rw) ? &sbi->read_io : &sbi->write_io[btype]; + io = is_read ? &sbi->read_io : &sbi->write_io[btype]; verify_block_addr(sbi, blk_addr); mutex_lock(&io->io_mutex); - if (!is_read_io(fio->rw)) + if (!is_read) inc_page_count(sbi, F2FS_WRITEBACK); if (io->bio && (io->last_block_in_bio != blk_addr - 1 || @@ -196,17 +195,10 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - io->bio = __bio_alloc(bdev, bio_blocks); - io->bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - io->bio->bi_end_io = is_read_io(fio->rw) ? f2fs_read_end_io : - f2fs_write_end_io; + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); io->fio = *fio; - /* - * The end_io will be assigned at the sumbission phase. - * Until then, let bio_add_page() merge consecutive IOs as much - * as possible. - */ } if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < -- cgit v1.2.3-70-g09d2 From 7e8f23081ab3a11de90d7389f2c6fd44676c8df9 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 20 Dec 2013 18:17:49 +0800 Subject: f2fs: remove the rw_flag domain from f2fs_io_info When using the f2fs_io_info in the low level, we still need to merge the rw and rw_flag, so use the rw to hold all the io flags directly, and remove the rw_flag field. ps.It is based on the previous patch: f2fs: move all the bio initialization into __bio_alloc Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +-- fs/f2fs/f2fs.h | 5 ++--- fs/f2fs/gc.c | 1 - fs/f2fs/node.c | 6 ++---- fs/f2fs/segment.c | 8 ++------ 5 files changed, 7 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 154a4f93a54..e46b5c52d2e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -105,7 +105,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - rw = fio->rw | fio->rw_flag; + rw = fio->rw; if (is_read_io(rw)) { trace_f2fs_submit_read_bio(io->sbi->sb, rw, @@ -760,7 +760,6 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .type = DATA, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC: WRITE, - .rw_flag = 0, }; if (page->index < end_index) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8cbc5a6bf48..42f28d4134c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -368,9 +368,8 @@ enum page_type { }; struct f2fs_io_info { - enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ - int rw; /* contains R/RS/W/WS */ - int rw_flag; /* contains REQ_META/REQ_PRIO */ + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ }; #define is_read_io(rw) (((rw) & 1) == READ) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 69c18e39901..599f546d042 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -523,7 +523,6 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) struct f2fs_io_info fio = { .type = DATA, .rw = WRITE_SYNC, - .rw_flag = 0, }; if (gc_type == BG_GC) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3565caf9700..0af0a715e36 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -94,8 +94,7 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) int i; struct f2fs_io_info fio = { .type = META, - .rw = READ_SYNC, - .rw_flag = REQ_META | REQ_PRIO + .rw = READ_SYNC | REQ_META | REQ_PRIO }; @@ -1581,8 +1580,7 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, int page_idx = start; struct f2fs_io_info fio = { .type = META, - .rw = READ_SYNC, - .rw_flag = REQ_META | REQ_PRIO + .rw = READ_SYNC | REQ_META | REQ_PRIO }; for (; page_idx < start + nrpages; page_idx++) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9f8bdd02e3a..555ae7693ea 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -916,8 +916,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_io_info fio = { .type = META, - .rw = WRITE_SYNC, - .rw_flag = REQ_META | REQ_PRIO + .rw = WRITE_SYNC | REQ_META | REQ_PRIO }; set_page_writeback(page); @@ -931,7 +930,6 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info fio = { .type = NODE, .rw = WRITE_SYNC, - .rw_flag = 0 }; set_summary(&sum, nid, 0, 0); @@ -1018,7 +1016,6 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, struct f2fs_io_info fio = { .type = NODE, .rw = WRITE_SYNC, - .rw_flag = 0 }; curseg = CURSEG_I(sbi, type); @@ -1598,8 +1595,7 @@ static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) int blkno = start; struct f2fs_io_info fio = { .type = META, - .rw = READ_SYNC, - .rw_flag = REQ_META | REQ_PRIO + .rw = READ_SYNC | REQ_META | REQ_PRIO }; for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { -- cgit v1.2.3-70-g09d2 From 4f4124d0b99682efa7307191a28ec050872d2079 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 21 Dec 2013 18:02:14 +0800 Subject: f2fs: update several comments Update several comments: 1. use f2fs_{un}lock_op install of mutex_{un}lock_op. 2. update comment of get_data_block(). 3. update description of node offset. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 ++++++++------ fs/f2fs/dir.c | 4 ++-- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 8 ++++---- fs/f2fs/node.h | 8 +++++++- 5 files changed, 22 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e46b5c52d2e..0879d2aa97e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -508,8 +508,8 @@ repeat: * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. 
* - * Also, caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). * Note that, npage is set only by make_empty_dir. */ struct page *get_new_data_page(struct inode *inode, @@ -595,10 +595,12 @@ static int __allocate_data_block(struct dnode_of_data *dn) } /* - * This function should be used by the data read flow only where it - * does not check the "create" flag that indicates block allocation. - * The reason for this special functionality is to exploit VFS readahead - * mechanism. + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * If original data blocks are allocated, then give them to blockdev. + * Otherwise, + * a. preallocate requested block addresses + * b. do not use extent cache for better performance + * c. give the block addresses to blockdev */ static int get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 0cc26ba07c3..28206109a29 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -432,8 +432,8 @@ next: } /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 42f28d4134c..ba91186823d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -165,7 +165,7 @@ enum { LOOKUP_NODE, /* look up a node without readahead */ LOOKUP_NODE_RA, /* * look up a node with readahead called - * by get_datablock_ro. + * by get_data_block. */ }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0af0a715e36..e8fe52d6073 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -393,8 +393,8 @@ got: /* * Caller should call f2fs_put_dnode(dn). - * Also, it should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op() only if ro is not set RDONLY_NODE. + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if ro is not set RDONLY_NODE. * In the case of RDONLY_NODE, we don't need to care about mutex. */ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) @@ -802,8 +802,8 @@ int truncate_xattr_node(struct inode *inode, struct page *page) } /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ void remove_inode_page(struct inode *inode) { diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3496bb3e15d..c4c79885c99 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -224,7 +224,13 @@ static inline block_t next_blkaddr_of_node(struct page *node_page) * | `- direct node (5 + N => 5 + 2N - 1) * `- double indirect node (5 + 2N) * `- indirect node (6 + 2N) - * `- direct node (x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... 
+ * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node */ static inline bool IS_DNODE(struct page *node_page) { -- cgit v1.2.3-70-g09d2 From deead09009fc5136185fe95026c395b5c2337e1f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 21 Dec 2013 18:03:28 +0800 Subject: f2fs: avoid to set wrong pino of inode when rename dir When we rename a dir to new name which is not exist previous, we will set pino of parent inode with ino of child inode in f2fs_set_link. It destroy consistency of pino, it should be fixed. Thanks for previous work of Shu Tan. Signed-off-by: Shu Tan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 --- fs/f2fs/namei.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 28206109a29..07ad850bbf9 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -259,9 +259,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); - /* update parent inode number before releasing dentry page */ - F2FS_I(inode)->i_pino = dir->i_ino; - f2fs_put_page(page, 1); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 575adac17f8..a68838dae33 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -424,6 +424,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); + F2FS_I(old_inode)->i_pino = new_dir->i_ino; new_inode->i_ctime = CURRENT_TIME; if (old_dir_entry) @@ -457,6 +458,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + F2FS_I(old_inode)->i_pino = new_dir->i_ino; } else { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); -- cgit v1.2.3-70-g09d2 From 4ea7772f828a2f1cf6fbf96a3e6f99ae149d2724 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 23 Dec 2013 22:02:16 +0100 Subject: udf: Fix lockdep warning from udf_symlink() Lockdep is complaining about UDF: ============================================= [ INFO: possible recursive locking detected ] 3.12.0+ #16 Not tainted --------------------------------------------- ln/7386 is trying to acquire lock: (&ei->i_data_sem){+.+...}, at: [] udf_get_block+0x8d/0x130 but task is already holding lock: (&ei->i_data_sem){+.+...}, at: [] udf_symlink+0x8d/0x690 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&ei->i_data_sem); lock(&ei->i_data_sem); *** DEADLOCK *** This is because we hold i_data_sem of the symlink inode while calling udf_add_entry() for the directory. I don't think this can ever lead to deadlocks since we never hold i_data_sem for two inodes in any other place. The fix is simple - move unlock of i_data_sem for symlink inode up. We don't need it for anything when linking symlink inode to directory. 
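Had the nesting been genuinely necessary, the usual alternative would have been a lockdep subclass annotation rather than reordering; a rough sketch of that pattern, using sym_iinfo and dir_iinfo as made-up variable names rather than the real udf code, is:

	down_write(&sym_iinfo->i_data_sem);
	down_write_nested(&dir_iinfo->i_data_sem, SINGLE_DEPTH_NESTING);
	/* ... touch both inodes while holding both semaphores ... */
	up_write(&dir_iinfo->i_data_sem);
	up_write(&sym_iinfo->i_data_sem);

Since udf_add_entry() never needs the symlink's i_data_sem, dropping that lock earlier is the simpler and cleaner fix.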
Reported-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/udf/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 5f6fc17d6bc..9737cba1357 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, else udf_truncate_tail_extent(inode); mark_inode_dirty(inode); + up_write(&iinfo->i_data_sem); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) @@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) mark_inode_dirty(dir); - up_write(&iinfo->i_data_sem); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); -- cgit v1.2.3-70-g09d2 From d96b143151a11820ee3eee552554209f2453799e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 23 Dec 2013 11:12:21 +0800 Subject: f2fs: check filename length in recover_dentry In current flow, we will get Null return value of f2fs_find_entry in recover_dentry when name.len is bigger than F2FS_NAME_LEN, and then we still add this inode into its dir entry. To avoid this situation, we must check filename length before we use it. Another point is that we could remove the code of checking filename length In f2fs_find_entry, because f2fs_lookup will be called previously to ensure of validity of filename length. V2: o add WARN_ON() as Jaegeuk Kim suggested. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 --- fs/f2fs/recovery.c | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 07ad850bbf9..f0b46304944 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -190,9 +190,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; - if (unlikely(namelen > F2FS_NAME_LEN)) - return NULL; - if (npages == 0) return NULL; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a3f4542160c..4d411a26b85 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -62,6 +62,12 @@ static int recover_dentry(struct page *ipage, struct inode *inode) name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; + + if (unlikely(name.len > F2FS_NAME_LEN)) { + WARN_ON(1); + err = -ENAMETOOLONG; + goto out; + } retry: de = f2fs_find_entry(dir, &name, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) -- cgit v1.2.3-70-g09d2 From 58bfaf44df58082c72882b235cae611c975537d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 26 Dec 2013 16:30:41 +0900 Subject: f2fs: introduce F2FS_INODE macro to get f2fs_inode This patch introduces F2FS_INODE that returns struct f2fs_inode * from the inode page. By using this macro, we can remove unnecessary casting codes like below. 
struct f2fs_inode *ri = &F2FS_NODE(inode_page)->i; -> struct f2fs_inode *ri = F2FS_INODE(inode_page); Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/inode.c | 8 ++------ fs/f2fs/node.c | 30 +++++++++++++++--------------- fs/f2fs/recovery.c | 6 ++---- 5 files changed, 28 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f0b46304944..6da77e5c753 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -261,12 +261,12 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, static void init_dent_inode(const struct qstr *name, struct page *ipage) { - struct f2fs_node *rn; + struct f2fs_inode *ri; /* copy name info. to this inode page */ - rn = F2FS_NODE(ipage); - rn->i.i_namelen = cpu_to_le32(name->len); - memcpy(rn->i.i_name, name->name, name->len); + ri = F2FS_INODE(ipage); + ri->i_namelen = cpu_to_le32(name->len); + memcpy(ri->i_name, name->name, name->len); set_page_dirty(ipage); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ba91186823d..1a06f0aaa7d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -498,6 +498,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page) return (struct f2fs_node *)page_address(page); } +static inline struct f2fs_inode *F2FS_INODE(struct page *page) +{ + return &((struct f2fs_node *)page_address(page))->i; +} + static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d0eaa9faeca..a91f45177cd 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -67,7 +67,6 @@ static int do_read_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; - struct f2fs_node *rn; struct f2fs_inode *ri; /* Check if ino is within scope */ @@ -81,8 +80,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_page)) return PTR_ERR(node_page); - rn = F2FS_NODE(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -175,13 +173,11 @@ bad_inode: void update_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *rn; struct f2fs_inode *ri; f2fs_wait_on_page_writeback(node_page, NODE, false); - rn = F2FS_NODE(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = F2FS_I(inode)->i_advise; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e8fe52d6073..9405a17a671 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -684,7 +684,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; - struct f2fs_node *rn; + struct f2fs_inode *ri; struct dnode_of_data dn; struct page *page; @@ -701,7 +701,7 @@ restart: set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); - rn = F2FS_NODE(page); + ri = F2FS_INODE(page); switch (level) { case 0: case 1: @@ -711,7 +711,7 @@ restart: nofs = noffset[1]; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; nofs += 1 + NIDS_PER_BLOCK; @@ -720,7 +720,7 @@ restart: nofs = 5 + 2 * NIDS_PER_BLOCK; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = 
truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; break; @@ -730,7 +730,7 @@ restart: skip_partial: while (cont) { - dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -753,14 +753,14 @@ skip_partial: if (err < 0 && err != -ENOENT) goto fail; if (offset[1] == 0 && - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { + ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); if (unlikely(page->mapping != node_mapping)) { f2fs_put_page(page, 1); goto restart; } wait_on_page_writeback(page); - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); } @@ -1533,7 +1533,7 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { struct address_space *mapping = sbi->node_inode->i_mapping; - struct f2fs_node *src, *dst; + struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; @@ -1549,14 +1549,14 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - src = F2FS_NODE(page); - dst = F2FS_NODE(ipage); + src = F2FS_INODE(page); + dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); - dst->i.i_size = 0; - dst->i.i_blocks = cpu_to_le64(1); - dst->i.i_links = cpu_to_le32(1); - dst->i.i_xattr_nid = 0; + memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 0; new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4d411a26b85..96e690b6f0f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, static int recover_dentry(struct page *ipage, struct inode *inode) { - struct f2fs_node *raw_node = F2FS_NODE(ipage); - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; struct qstr name; @@ -105,8 +104,7 @@ out: static int recover_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *raw_node = F2FS_NODE(node_page); - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(node_page); if (!IS_INODE(node_page)) return 0; -- cgit v1.2.3-70-g09d2 From 1ec79083b2d4614d9dbaea67b5f55b60d7137a2d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 26 Dec 2013 16:55:22 +0900 Subject: f2fs: should put the dnode when NEW_ADDR is detected When get_dnode_of_data() in get_data_block() returns a successful dnode, we should put the dnode. But, previously, if its data block address is equal to NEW_ADDR, we didn't do that, resulting in a deadlock condition. So, this patch splits original error conditions with this case, and then calls f2fs_put_dnode before finishing the function. 
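In other words, every successful get_dnode_of_data() must be paired with an f2fs_put_dnode(), including on the NEW_ADDR early exit. A condensed sketch of the control flow the fix establishes, with names taken from the patch below but simplified rather than copied, looks like this:

	err = get_dnode_of_data(&dn, pgofs, mode);
	if (err)
		goto unlock_out;		/* lookup failed: nothing to put */

	if (dn.data_blkaddr == NEW_ADDR)
		goto put_out;			/* early exit must still drop the dnode */

	/* ... map or allocate blocks as usual ... */

put_out:
	f2fs_put_dnode(&dn);
unlock_out:
	if (create)
		f2fs_unlock_op(sbi);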
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0879d2aa97e..991e36835df 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -626,11 +626,13 @@ static int get_data_block(struct inode *inode, sector_t iblock, /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); - if (err || dn.data_blkaddr == NEW_ADDR) { + if (err) { if (err == -ENOENT) err = 0; goto unlock_out; } + if (dn.data_blkaddr == NEW_ADDR) + goto put_out; if (dn.data_blkaddr != NULL_ADDR) { map_bh(bh_result, inode->i_sb, dn.data_blkaddr); @@ -659,11 +661,14 @@ get_next: set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); - if (err || dn.data_blkaddr == NEW_ADDR) { + if (err) { if (err == -ENOENT) err = 0; goto unlock_out; } + if (dn.data_blkaddr == NEW_ADDR) + goto put_out; + end_offset = IS_INODE(dn.node_page) ? ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; } -- cgit v1.2.3-70-g09d2 From 944fcfc18445b22d7ad1953a6effcfbdc4ffec74 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 26 Dec 2013 20:15:09 +0900 Subject: f2fs: check the blocksize before calling generic_direct_IO path The f2fs supports 4KB block size. If user requests dwrite with under 4KB data, it allocates a new 4KB data block. However, f2fs doesn't add zero data into the untouched data area inside the newly allocated data block. This incurs an error during the xfstest #263 test as follow. 263 12s ... [failed, exit status 1] - output mismatch (see 263.out.bad) --- 263.out 2013-03-09 03:37:15.043967603 +0900 +++ 263.out.bad 2013-12-27 04:20:39.230203114 +0900 @@ -1,3 +1,976 @@ QA output created by 263 fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z +fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z +truncating to largest ever: 0x12a00 +truncating to largest ever: 0x75400 +fallocating to largest ever: 0x79cbf ... (Run 'diff -u 263.out 263.out.bad' to see the entire diff) Ran: 263 Failures: 263 Failed 1 of 1 tests It turns out that, when the test tries to write 2KB data with dio, the new dio path allocates 4KB data block without filling zero data inside the remained 2KB area. Finally, the output file contains a garbage data for that region. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 991e36835df..e0965b43d16 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -953,11 +953,33 @@ static int f2fs_write_end(struct file *file, return copied; } +static int check_direct_IO(struct inode *inode, int rw, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; + int i; + + if (rw == READ) + return 0; + + if (offset & blocksize_mask) + return -EINVAL; + + for (i = 0; i < nr_segs; i++) + if (iov[i].iov_len & blocksize_mask) + return -EINVAL; + return 0; +} + static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + + if (check_direct_IO(inode, rw, iov, offset, nr_segs)) + return 0; + return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, get_data_block); } -- cgit v1.2.3-70-g09d2 From 0d47c1adc2a1f1e4f4673f122a8328c90c58e232 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 26 Dec 2013 18:24:19 +0800 Subject: f2fs: convert max_orphans to a field of f2fs_sb_info Previously, we need to calculate the max orphan num when we try to acquire an orphan inode, but it's a stable value since the super block was inited. So converting it to a field of f2fs_sb_info and use it directly when needed seems a better choose. Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 20 ++++++++++---------- fs/f2fs/f2fs.h | 1 + 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2fc3b6beaeb..0d78bbe79f9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -194,23 +194,15 @@ const struct address_space_operations f2fs_meta_aops = { int acquire_orphan_inode(struct f2fs_sb_info *sbi) { - unsigned int max_orphans; int err = 0; - /* - * considering 512 blocks in a segment 8 blocks are needed for cp - * and log segment summaries. Remaining blocks are used to keep - * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*504 orphan entries - */ - max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) - * F2FS_ORPHANS_PER_BLOCK; mutex_lock(&sbi->orphan_inode_mutex); - if (unlikely(sbi->n_orphans >= max_orphans)) + if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; mutex_unlock(&sbi->orphan_inode_mutex); + return err; } @@ -834,6 +826,14 @@ void init_orphan_info(struct f2fs_sb_info *sbi) mutex_init(&sbi->orphan_inode_mutex); INIT_LIST_HEAD(&sbi->orphan_inode_list); sbi->n_orphans = 0; + /* + * considering 512 blocks in a segment 8 blocks are needed for cp + * and log segment summaries. 
Remaining blocks are used to keep + * orphan entries with the limitation one reserved segment + * for cp pack we can have max 1020*504 orphan entries + */ + sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) + * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1a06f0aaa7d..36211cb1626 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -414,6 +414,7 @@ struct f2fs_sb_info { struct list_head orphan_inode_list; /* orphan inode list */ struct mutex orphan_inode_mutex; /* for orphan inode list */ unsigned int n_orphans; /* # of orphan inodes */ + unsigned int max_orphans; /* max orphan inodes */ /* for directory inode management */ struct list_head dir_inode_list; /* dir inode list */ -- cgit v1.2.3-70-g09d2 From e18c65b2ac91aa59f89333da595d5155184f76cf Mon Sep 17 00:00:00 2001 From: Huajun Li Date: Sun, 10 Nov 2013 23:13:19 +0800 Subject: f2fs: key functions to handle inline data Functions to implement inline data read/write, and move inline data to normal data block when file size exceeds inline data limitation. Signed-off-by: Huajun Li Signed-off-by: Haicheng Li Signed-off-by: Weihong Xu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Makefile | 2 +- fs/f2fs/f2fs.h | 8 +++ fs/f2fs/inline.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 fs/f2fs/inline.c (limited to 'fs') diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 27a0820340b..2e35da12d29 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o -f2fs-y := dir.o file.o inode.o namei.o hash.o super.o +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 36211cb1626..b01ccaa73b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1296,4 +1296,12 @@ extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; +/* + * inline.c + */ +inline int f2fs_has_inline_data(struct inode *); +bool f2fs_may_inline(struct inode *); +int f2fs_read_inline_data(struct inode *, struct page *); +int f2fs_convert_inline_data(struct inode *, struct page *, unsigned); +int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); #endif diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c new file mode 100644 index 00000000000..62c72aa84ac --- /dev/null +++ b/fs/f2fs/inline.c @@ -0,0 +1,176 @@ +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li + * Haicheng Li + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include "f2fs.h" + +inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); +} + +bool f2fs_may_inline(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + block_t nr_blocks; + loff_t i_size; + + if (!test_opt(sbi, INLINE_DATA)) + return false; + + nr_blocks = F2FS_I(inode)->i_xattr_nid ? 
3 : 2; + if (inode->i_blocks > nr_blocks) + return false; + + i_size = i_size_read(inode); + if (i_size > MAX_INLINE_DATA) + return false; + + return true; +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *ipage; + void *src_addr, *dst_addr; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + zero_user_segment(page, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + f2fs_put_page(ipage, 1); + + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + +static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) +{ + int err; + struct page *ipage; + struct dnode_of_data dn; + void *src_addr, *dst_addr; + block_t new_blk_addr; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC | REQ_PRIO, + }; + + f2fs_lock_op(sbi); + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + /* + * i_addr[0] is not used for inline data, + * so reserving new block will not destroy inline data + */ + set_new_dnode(&dn, inode, ipage, ipage, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) { + f2fs_put_page(ipage, 1); + f2fs_unlock_op(sbi); + return err; + } + + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + + /* write data page to try to make data consistent */ + set_page_writeback(page); + write_data_page(page, &dn, &new_blk_addr, &fio); + update_extent_cache(new_blk_addr, &dn); + f2fs_wait_on_page_writeback(page, DATA, true); + + /* clear inline data and flag after data writeback */ + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + + sync_inode_page(&dn); + f2fs_put_page(ipage, 1); + f2fs_unlock_op(sbi); + + return err; +} + +int f2fs_convert_inline_data(struct inode *inode, + struct page *p, unsigned flags) +{ + int err; + struct page *page; + + if (!p || p->index) { + page = grab_cache_page_write_begin(inode->i_mapping, 0, flags); + if (IS_ERR(page)) + return PTR_ERR(page); + } else { + page = p; + } + + err = __f2fs_convert_inline_data(inode, page); + + if (!p || p->index) + f2fs_put_page(page, 1); + + return err; +} + +int f2fs_write_inline_data(struct inode *inode, + struct page *page, unsigned size) +{ + void *src_addr, *dst_addr; + struct page *ipage; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + ipage = dn.inode_page; + + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + src_addr = kmap(page); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, size); + kunmap(page); + + /* Release the first data block if it is allocated */ + if (!f2fs_has_inline_data(inode)) { + truncate_data_blocks_range(&dn, 1); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + } + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + + return 0; +} -- cgit v1.2.3-70-g09d2 From 9ffe0fb5f3bbd01c766162bb17a62dee0df7e125 Mon Sep 17 00:00:00 2001 From: Huajun Li Date: Sun, 10 Nov 2013 
23:13:20 +0800 Subject: f2fs: handle inline data operations Hook inline data read/write, truncate, fallocate, setattr, etc. Files need meet following 2 requirement to inline: 1) file size is not greater than MAX_INLINE_DATA; 2) file doesn't pre-allocate data blocks by fallocate(). FI_INLINE_DATA will not be set while creating a new regular inode because most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when data is submitted to block layer, ranther than set it while creating a new inode, this also avoids converting data from inline to normal data block and vice versa. While writting inline data to inode block, the first data block should be released if the file has a block indexed by i_addr[0]. On the other hand, when a file operation is appied to a file with inline data, we need to test if this file can remain inline by doing this operation, otherwise it should be convert into normal file by reserving a new data block, copying inline data to this new block and clear FI_INLINE_DATA flag. Because reserve a new data block here will make use of i_addr[0], if we save inline data in i_addr[0..872], then the first 4 bytes would be overwriten. This problem can be avoided simply by not using i_addr[0] for inline data. Signed-off-by: Huajun Li Signed-off-by: Haicheng Li Signed-off-by: Weihong Xu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 49 ++++++++++++++++++++++++++++++++++++++++++++----- fs/f2fs/file.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 86 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e0965b43d16..bf39eed2442 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -706,13 +706,28 @@ out: static int f2fs_read_data_page(struct file *file, struct page *page) { - return mpage_readpage(page, get_data_block); + struct inode *inode = page->mapping->host; + int ret; + + /* If the file has inline data, try to read it directlly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + else + ret = mpage_readpage(page, get_data_block); + + return ret; } static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct inode *inode = file->f_mapping->host; + + /* If the file has inline data, skip readpages */ + if (f2fs_has_inline_data(inode)) + return 0; + return mpage_readpages(mapping, pages, nr_pages, get_data_block); } @@ -761,7 +776,7 @@ static int f2fs_write_data_page(struct page *page, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; - unsigned offset; + unsigned offset = 0; bool need_balance_fs = false; int err = 0; struct f2fs_io_info fio = { @@ -799,7 +814,15 @@ write: err = do_write_data_page(page, &fio); } else { f2fs_lock_op(sbi); - err = do_write_data_page(page, &fio); + + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { + err = f2fs_write_inline_data(inode, page, offset); + f2fs_unlock_op(sbi); + goto out; + } else { + err = do_write_data_page(page, &fio); + } + f2fs_unlock_op(sbi); need_balance_fs = true; } @@ -888,6 +911,15 @@ repeat: return -ENOMEM; *pagep = page; + if ((pos + len) < MAX_INLINE_DATA) { + if (f2fs_has_inline_data(inode)) + goto inline_data; + } else if (f2fs_has_inline_data(inode)) { + err = f2fs_convert_inline_data(inode, page, flags); + if (err) + return err; + } + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_reserve_block(&dn, index); @@ -897,7 +929,7 @@ repeat: 
f2fs_put_page(page, 1); return err; } - +inline_data: if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -913,7 +945,10 @@ repeat: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + if (f2fs_has_inline_data(inode)) + err = f2fs_read_inline_data(inode, page); + else + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) return err; @@ -977,6 +1012,10 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + /* Let buffer I/O handle the inline data case. */ + if (f2fs_has_inline_data(inode)) + return 0; + if (check_direct_IO(inode, rw, iov, offset, nr_segs)) return 0; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5accc964368..dd80e725acb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -257,8 +257,7 @@ static int truncate_blocks(struct inode *inode, u64 from) unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0; - int err; + int count = 0, err = 0; trace_f2fs_truncate_blocks_enter(inode, from); @@ -266,6 +265,10 @@ static int truncate_blocks(struct inode *inode, u64 from) ((from + blocksize - 1) >> (sbi->log_blocksize)); f2fs_lock_op(sbi); + + if (f2fs_has_inline_data(inode)) + goto done; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { @@ -292,6 +295,7 @@ static int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); +done: f2fs_unlock_op(sbi); /* lastly zero out the first data page */ @@ -367,8 +371,17 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { + if (f2fs_has_inline_data(inode) && + (attr->ia_size > MAX_INLINE_DATA)) { + unsigned flags = AOP_FLAG_NOFS; + err = f2fs_convert_inline_data(inode, NULL, flags); + if (err) + return err; + } + truncate_setsize(inode, attr->ia_size); - f2fs_truncate(inode); + if (!f2fs_has_inline_data(inode)) + f2fs_truncate(inode); f2fs_balance_fs(F2FS_SB(inode->i_sb)); } @@ -450,6 +463,26 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t off_start, off_end; int ret = 0; + if (f2fs_has_inline_data(inode)) { + struct page *page; + unsigned flags = AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(inode->i_mapping, 0, flags); + if (IS_ERR(page)) + return PTR_ERR(page); + if (offset + len > MAX_INLINE_DATA) { + ret = f2fs_convert_inline_data(inode, page, flags); + f2fs_put_page(page, 1); + if (ret) + return ret; + } else { + zero_user_segment(page, offset, offset + len); + SetPageUptodate(page); + set_page_dirty(page); + f2fs_put_page(page, 1); + return ret; + } + } + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -496,6 +529,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; + if (f2fs_has_inline_data(inode) && (offset + len > MAX_INLINE_DATA)) { + ret = f2fs_convert_inline_data(inode, NULL, AOP_FLAG_NOFS); + if (ret) + return ret; + } + ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; -- cgit v1.2.3-70-g09d2 From f185ff979f97943c9d4966ed4edc9819c3342c5b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Dec 2013 11:01:54 +0900 Subject: f2fs: don't need to get 
f2fs_lock_op for the inline_data test This patch locates checking the inline_data prior to calling f2fs_lock_op() in truncate_blocks(), since getting the lock is unnecessary. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index dd80e725acb..7ef2d6af24c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -261,14 +261,14 @@ static int truncate_blocks(struct inode *inode, u64 from) trace_f2fs_truncate_blocks_enter(inode, from); + if (f2fs_has_inline_data(inode)) + goto done; + free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) - goto done; - set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { @@ -295,9 +295,8 @@ static int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); -done: f2fs_unlock_op(sbi); - +done: /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); -- cgit v1.2.3-70-g09d2 From 61f68816211ee4b884dc0dda8dd4d977548f4865 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Nov 2013 14:28:14 +0800 Subject: ceph: check caps in filemap_fault and page_mkwrite Adds cap check to the page fault handler. The check prevents page fault handler from adding new page to the page cache while Fcb caps are being revoked. This solves Fc revoking hang in multiple clients mmap IO workload. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/addr.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c346b8479f9..ebda329611b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1210,6 +1210,41 @@ const struct address_space_operations ceph_aops = { /* * vm ops */ +static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; + int want, got, ret; + + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", + inode, ceph_vinop(inode), off, PAGE_CACHE_SIZE); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", + inode, off, PAGE_CACHE_SIZE, ceph_cap_string(got)); + + ret = filemap_fault(vma, vmf); + + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + + return ret; +} /* * Reuse write_begin here for simplicity. 
@@ -1217,23 +1252,41 @@ const struct address_space_operations ceph_aops = { static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); - struct page *page = vmf->page; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct page *page = vmf->page; loff_t off = page_offset(page); - loff_t size, len; - int ret; - - /* Update time before taking page lock */ - file_update_time(vma->vm_file); + loff_t size = i_size_read(inode); + size_t len; + int want, got, ret; - size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else len = size & ~PAGE_CACHE_MASK; - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, - off, len, page, page->index); + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", + inode, ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", + inode, off, len, ceph_cap_string(got)); + + /* Update time before taking page lock */ + file_update_time(vma->vm_file); lock_page(page); @@ -1255,14 +1308,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); - if (ret != VM_FAULT_LOCKED) + if (ret != VM_FAULT_LOCKED) { unlock_page(page); + } else { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + return ret; } static struct vm_operations_struct ceph_vmops = { - .fault = filemap_fault, + .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, .remap_pages = generic_file_remap_pages, }; -- cgit v1.2.3-70-g09d2 From 7221fe4c2ed72804b28633c8e0217d65abb0023f Mon Sep 17 00:00:00 2001 From: Guangliang Zhao Date: Mon, 11 Nov 2013 15:18:03 +0800 Subject: ceph: add acl for cephfs Signed-off-by: Guangliang Zhao Reviewed-by: Li Wang Reviewed-by: Zheng Yan --- fs/ceph/Kconfig | 13 +++ fs/ceph/Makefile | 1 + fs/ceph/acl.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/caps.c | 1 + fs/ceph/dir.c | 5 + fs/ceph/inode.c | 11 ++ fs/ceph/super.c | 4 + fs/ceph/super.h | 37 ++++++- fs/ceph/xattr.c | 60 ++++++++-- 9 files changed, 451 insertions(+), 13 deletions(-) create mode 100644 fs/ceph/acl.c (limited to 'fs') diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index ac9a2ef5bb9..264e9bf83ff 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -25,3 +25,16 @@ config CEPH_FSCACHE caching support for Ceph clients using FS-Cache endif + +config CEPH_FS_POSIX_ACL + bool "Ceph POSIX Access Control Lists" + depends on CEPH_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . 
+ + If you don't know what Access Control Lists are, say N diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 32e30106a2f..85a4230b9bf 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ debugfs.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 00000000000..64fddbc1d17 --- /dev/null +++ b/fs/ceph/acl.c @@ -0,0 +1,332 @@ +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, + int type, struct posix_acl *acl) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + set_cached_acl(inode, type, acl); + spin_unlock(&ci->i_ceph_lock); +} + +static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, + int type) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct posix_acl *acl = ACL_NOT_CACHED; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + acl = get_cached_acl(inode, type); + spin_unlock(&ci->i_ceph_lock); + + return acl; +} + +void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + acl = ceph_get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __ceph_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __ceph_getxattr(inode, name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ERANGE || size == -ENODATA || size == 0) + acl = NULL; + else + acl = ERR_PTR(-EIO); + + kfree(value); + + if (!IS_ERR(acl)) + ceph_set_cached_acl(inode, type, acl); + + return acl; +} + +static int ceph_set_acl(struct dentry *dentry, struct inode *inode, + struct posix_acl *acl, int type) +{ + int ret = 0, size = 0; + const char *name = NULL; + char *value = NULL; + struct iattr newattrs; + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + goto out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &new_mode); + 
if (ret < 0) + goto out; + if (ret == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + ret = acl ? -EINVAL : 0; + goto out; + } + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + ret = -EINVAL; + goto out; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out_free; + } + + if (new_mode != old_mode) { + newattrs.ia_mode = new_mode; + newattrs.ia_valid = ATTR_MODE; + ret = ceph_setattr(dentry, &newattrs); + if (ret) + goto out_free; + } + + if (value) + ret = __ceph_setxattr(dentry, name, value, size, 0); + else + ret = __ceph_removexattr(dentry, name); + + if (ret) { + if (new_mode != old_mode) { + newattrs.ia_mode = old_mode; + newattrs.ia_valid = ATTR_MODE; + ceph_setattr(dentry, &newattrs); + } + goto out_free; + } + + ceph_set_cached_acl(inode, type, acl); + +out_free: + kfree(value); +out: + return ret; +} + +int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = ceph_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + ret = PTR_ERR(acl); + goto out; + } + } + + if (!acl) + inode->i_mode &= ~current_umask(); + } + + if (IS_POSIXACL(dir) && acl) { + if (S_ISDIR(inode->i_mode)) { + ret = ceph_set_acl(dentry, inode, acl, + ACL_TYPE_DEFAULT); + if (ret) + goto out_release; + } + ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (ret < 0) + goto out; + else if (ret > 0) + ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); + else + cache_no_acl(inode); + } else { + cache_no_acl(inode); + } + +out_release: + posix_acl_release(acl); +out: + return ret; +} + +int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + struct posix_acl *acl; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!IS_POSIXACL(inode)) + goto out; + + acl = ceph_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) { + ret = PTR_ERR(acl); + goto out; + } + + ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + goto out; + ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); +out: + return ret; +} + +static int ceph_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + + acl = ceph_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + posix_acl_release(acl); + + return ret; +} + +static int ceph_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int ret = 0; + struct posix_acl *acl = NULL; + + if (!inode_owner_or_capable(dentry->d_inode)) { + ret = -EPERM; + goto out; + } + + if (!IS_POSIXACL(dentry->d_inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) { + ret = PTR_ERR(acl); + goto out; + } + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out_release; + } + } + + ret = ceph_set_acl(dentry, dentry->d_inode, acl, type); + +out_release: + posix_acl_release(acl); +out: + return ret; +} + +const struct 
xattr_handler ceph_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = ceph_xattr_acl_get, + .set = ceph_xattr_acl_set, +}; + +const struct xattr_handler ceph_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = ceph_xattr_acl_get, + .set = ceph_xattr_acl_set, +}; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3c0a4bd7499..9289c6b2f1b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2464,6 +2464,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); } } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2a0bcaeb189..b629e9d59a3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -693,6 +693,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); + + if (!err) + err = ceph_init_acl(dentry, dentry->d_inode, dir); + if (err) d_drop(dentry); return err; @@ -1293,6 +1297,7 @@ const struct inode_operations ceph_dir_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d37b2dc01d3..a808bfb8d8d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -95,6 +95,7 @@ const struct inode_operations ceph_file_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, }; @@ -680,6 +681,7 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); xattr_blob = NULL; } @@ -1612,6 +1614,7 @@ static const struct inode_operations ceph_symlink_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, }; /* @@ -1685,6 +1688,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || attr->ia_mode != inode->i_mode) { + inode->i_mode = attr->ia_mode; req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); mask |= CEPH_SETATTR_MODE; release |= CEPH_CAP_AUTH_SHARED; @@ -1800,6 +1804,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); + if (ia_valid & ATTR_MODE) { + err = ceph_acl_chmod(dentry, inode); + if (err) + goto out_put; + } + if (mask) { req->r_inode = inode; ihold(inode); @@ -1819,6 +1829,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) return err; out: spin_unlock(&ci->i_ceph_lock); +out_put: ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e58bd4a23bf..c6740e43a35 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -819,7 +819,11 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ +#ifdef CONFIG_CEPH_FS_POSIX_ACL + s->s_flags |= MS_POSIXACL; +#endif + s->s_xattr = ceph_xattr_handlers; s->s_fs_info = fsc; fsc->sb = s; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 
8de94b564d6..7fa78a7c889 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -335,7 +335,6 @@ struct ceph_inode_info { u32 i_fscache_gen; /* sequence, for delayed fscache validate */ struct work_struct i_revalidate_work; #endif - struct inode vfs_inode; /* at end */ }; @@ -725,6 +724,9 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, /* xattr.c */ extern int ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +int __ceph_removexattr(struct dentry *, const char *); extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); @@ -733,6 +735,39 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern void __init ceph_xattr_init(void); extern void ceph_xattr_exit(void); +/* acl.c */ +extern const struct xattr_handler ceph_xattr_acl_access_handler; +extern const struct xattr_handler ceph_xattr_acl_default_handler; +extern const struct xattr_handler *ceph_xattr_handlers[]; + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_init_acl(struct dentry *, struct inode *, struct inode *); +int ceph_acl_chmod(struct dentry *, struct inode *); +void ceph_forget_all_cached_acls(struct inode *inode); + +#else + +#define ceph_get_acl NULL + +static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif + /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index be661d8f532..c7581f3733c 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -11,11 +11,24 @@ #define XATTR_CEPH_PREFIX "ceph." #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. 
+ */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &ceph_xattr_acl_access_handler, + &ceph_xattr_acl_default_handler, +#endif + NULL, +}; + static bool ceph_is_valid_xattr(const char *name) { return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -663,10 +676,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) } } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); int err; struct ceph_inode_xattr *xattr; @@ -675,7 +687,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { @@ -725,6 +736,15 @@ out: return err; } +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, value, size); + + return __ceph_getxattr(dentry->d_inode, name, value, size); +} + ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; @@ -863,8 +883,8 @@ out: return err; } -int ceph_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; @@ -879,9 +899,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name, struct ceph_inode_xattr *xattr = NULL; int required_blob_size; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -958,6 +975,18 @@ out: return err; } +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + return __ceph_setxattr(dentry, name, value, size, flags); +} + static int ceph_send_removexattr(struct dentry *dentry, const char *name) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); @@ -984,7 +1013,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) return err; } -int ceph_removexattr(struct dentry *dentry, const char *name) +int __ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; @@ -994,9 +1023,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name) int required_blob_size; int dirty; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -1053,3 +1079,13 @@ out: return err; } +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, 
XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + return __ceph_removexattr(dentry, name); +} -- cgit v1.2.3-70-g09d2 From 3f42bc4beadef554fd0d4e6408e9142da268613b Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 19 Dec 2013 06:03:48 -0800 Subject: ceph fscache: Introduce a routine for uncaching single no data page from fscache Signed-off-by: Li Wang Reviewed-by: Milosz Tanski --- fs/ceph/cache.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba949408a33..da95f61b7a0 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return fscache_maybe_release_page(ci->fscache, page, gfp); } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { @@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return 1; } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { -- cgit v1.2.3-70-g09d2 From 183028052b48db2b34c09fd54f0bc465eaa305eb Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 19 Dec 2013 06:03:49 -0800 Subject: ceph fscache: Uncaching no data page from fscache in readpage() Currently, if a new page is allocated into fscache in readpage() but no data is read into it due to an error encountered while reading from the OSDs, the slot in fscache is not uncached. This patch fixes this. Signed-off-by: Li Wang Reviewed-by: Milosz Tanski --- fs/ceph/addr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ebda329611b..791a9a23fc6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = 0; if (err < 0) { SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); goto out; } else { if (err < PAGE_CACHE_SIZE) { -- cgit v1.2.3-70-g09d2 From 12b4629a9fb80fecaebadc217b13b8776ed8dbef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:23 +0200 Subject: libceph: all features fields must be u64 In preparation for ceph_features.h update, change all features fields from unsigned int/u32 to u64. (ceph.git has ~40 feature bits at this point.)
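As a minimal illustration of why the width matters (this snippet is not part of the patch, and the feature name is made up): a feature bit at position 32 or above is silently lost when it is stored in a 32-bit field.

        /* hypothetical feature bit, for illustration only */
        #define CEPH_FEATURE_EXAMPLE    (1ULL << 40)

        u32 feat32 = CEPH_FEATURE_EXAMPLE;      /* truncated to 0 - the bit is lost */
        u64 feat64 = CEPH_FEATURE_EXAMPLE;      /* bit 40 is preserved */

With around 40 bits already defined, every field that carries a feature mask therefore has to be u64.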
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 14 +++++++------- fs/ceph/super.c | 4 ++-- include/linux/ceph/libceph.h | 8 ++++---- include/linux/ceph/messenger.h | 10 +++++----- net/ceph/ceph_common.c | 4 ++-- net/ceph/messenger.c | 4 ++-- 6 files changed, 22 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d90861f4521..4a13f6e7206 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops; */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - int features) + u64 features) { int err = -EIO; @@ -98,7 +98,7 @@ bad: */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { int err; @@ -145,7 +145,7 @@ out_bad: */ static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { u32 num, i = 0; int err; @@ -217,7 +217,7 @@ out_bad: */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -238,7 +238,7 @@ bad: */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { if (*p == end) { @@ -262,7 +262,7 @@ bad: */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); @@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end, */ static int parse_reply_info(struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { void *p, *end; u32 len; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c6740e43a35..2df963f1cf5 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -490,10 +490,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const unsigned supported_features = + const u64 supported_features = CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; - const unsigned required_features = 0; + const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 2e3024881a5..7d704db60cb 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -122,8 +122,8 @@ struct ceph_client { int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; struct ceph_messenger msgr; /* messenger instance */ struct ceph_mon_client monc; @@ -192,8 +192,8 @@ extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); extern struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - unsigned supported_features, - unsigned required_features); + u64 supported_features, + u64 required_features); extern u64 ceph_client_id(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, diff --git a/include/linux/ceph/messenger.h 
b/include/linux/ceph/messenger.h index 7c1420bb1dc..c1d3f5a6527 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -60,8 +60,8 @@ struct ceph_messenger { u32 global_seq; spinlock_t global_seq_lock; - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; }; enum ceph_msg_data_type { @@ -192,7 +192,7 @@ struct ceph_connection { struct ceph_entity_name peer_name; /* peer name */ - unsigned peer_features; + u64 peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@ -256,8 +256,8 @@ extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc); extern void ceph_con_init(struct ceph_connection *con, void *private, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 34b11ee8124..43d8177a52e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -461,8 +461,8 @@ EXPORT_SYMBOL(ceph_client_id); * create a fresh client instance */ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - unsigned int supported_features, - unsigned int required_features) + u64 supported_features, + u64 required_features) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4a5df7b1cc9..26067055874 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2853,8 +2853,8 @@ static void con_fault(struct ceph_connection *con) */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc) { msgr->supported_features = supported_features; -- cgit v1.2.3-70-g09d2 From 0439e091e3b1fe41a350540c84857a573fde3d72 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Thu, 2 Jan 2014 11:30:42 -0600 Subject: jfs: fix xattr value size overflow in __jfs_setxattr There is a potential overflow if the specified EA value size is greater than USHRT_MAX because the size of value is limited by the on-disk format (i.e, __le16), this issue could be reflected via the tests below: # touch /jfs/testfile # setfattr -n user.comment -v `perl -e 'print "A"x65536'` /jfs/testfile setfattr: /jfs/testfile: Invalid argument Syslog: ... jfs_xsetattr: xattr_size = 21, new_size = 65557 This patch add pre-checkups of EA value size against USHRT_MAX to avoid this problem, and return -E2BIG which is consistent with the VFS setxattr interface. Moreover, fix the debug code to print the correct function name. With this fix: setfattr: /jfs/testfile: Argument list too long Signed-off-by: Jie Liu Signed-off-by: Dave Kleikamp --- fs/jfs/xattr.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index d3472f4cd53..9c6904eee0c 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -860,6 +860,19 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, /* Completely new ea list */ xattr_size = sizeof (struct jfs_ea_list); + /* + * The size of EA value is limitted by on-disk format up to + * __le16, there would be an overflow if the size is equal + * to XATTR_SIZE_MAX (65536). 
In order to avoid this issue, + * we can pre-checkup the value size against USHRT_MAX, and + * return -E2BIG in this case, which is consistent with the + * VFS setxattr interface. + */ + if (value_len >= USHRT_MAX) { + rc = -E2BIG; + goto release; + } + ea = (struct jfs_ea *) ((char *) ealist + xattr_size); ea->flag = 0; ea->namelen = namelen; @@ -874,7 +887,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, /* DEBUG - If we did this right, these number match */ if (xattr_size != new_size) { printk(KERN_ERR - "jfs_xsetattr: xattr_size = %d, new_size = %d\n", + "__jfs_setxattr: xattr_size = %d, new_size = %d\n", xattr_size, new_size); rc = -EINVAL; -- cgit v1.2.3-70-g09d2 From a8bb84bc9e57ad214024425d480a722f304df9e8 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 10 Dec 2013 15:24:36 +0800 Subject: nfsd: calculate the missing length of bitmap in EXCHANGE_ID commit 58cd57bfd9db3bc213bf9d6a10920f82095f0114 "nfsd: Fix SP4_MACH_CRED negotiation in EXCHANGE_ID" miss calculating the length of bitmap for spo_must_enforce and spo_must_allow. Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 3 ++- fs/nfsd/nfs4xdr.c | 24 ++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 419572f33b7..7bac4bdbdde 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1524,7 +1524,8 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ - 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ + 1 + 1 + /* eir_flags, spr_how */\ + 4 + /* spo_must_enforce & _allow with bitmap */\ 2 + /*eir_server_owner.so_minor_id */\ /* eir_server_owner.so_major_id<> */\ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 79754139ccd..1dface03bd3 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3398,35 +3398,43 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + - 4 /* spr_how */ + - 8 /* spo_must_enforce, spo_must_allow */ + - 8 /* so_minor_id */ + - 4 /* so_major_id.len */ + - (XDR_QUADLEN(major_id_sz) * 4) + - 4 /* eir_server_scope.len */ + - (XDR_QUADLEN(server_scope_sz) * 4) + - 4 /* eir_server_impl_id.count (0) */); + 4 /* spr_how */); WRITEMEM(&exid->clientid, 8); WRITE32(exid->seqid); WRITE32(exid->flags); WRITE32(exid->spa_how); + ADJUST_ARGS(); + switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: + /* spo_must_enforce, spo_must_allow */ + RESERVE_SPACE(16); + /* spo_must_enforce bitmap: */ WRITE32(2); WRITE32(nfs4_minimal_spo_must_enforce[0]); WRITE32(nfs4_minimal_spo_must_enforce[1]); /* empty spo_must_allow bitmap: */ WRITE32(0); + + ADJUST_ARGS(); break; default: WARN_ON_ONCE(1); } + RESERVE_SPACE( + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + /* The server_owner struct */ WRITE64(minor_id); /* Minor id */ /* major id */ -- cgit v1.2.3-70-g09d2 From b9b284df6c2013aeceb974055426f35e03ac43fc Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 16 Dec 2013 10:48:49 +0800 Subject: nfsd: get rid of unused function definition commit 
557ce2646e775f6bda734dd92b10d4780874b9c7 "nfsd41: replace page based DRC with buffer based DRC" removed the unused nfsd4_set_statp(), but missed the function definition. Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/cache.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index d5c5b3e0026..b582f9ab6b2 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,12 +84,4 @@ int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); int nfsd_reply_cache_stats_open(struct inode *, struct file *); -#ifdef CONFIG_NFSD_V4 -void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); -#else /* CONFIG_NFSD_V4 */ -static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) -{ -} -#endif /* CONFIG_NFSD_V4 */ - #endif /* NFSCACHE_H */ -- cgit v1.2.3-70-g09d2 From 5ce13431dd3365d5dd4f3890394dac59b687c0ed Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 6 Nov 2013 10:55:52 -0500 Subject: GFS2: If requested is too large, use the largest extent in the rgrp Here is a second try at a patch I posted earlier, which also implements suggestions Steve made: Before this patch, GFS2 would keep searching through all the rgrps until it found one that had a chunk of free blocks big enough to satisfy the size hint, which is based on the file write size, regardless of whether the chunk was big enough to perform the write. However, when doing big writes there may not be a large enough chunk of free blocks in any rgrp, due to file system fragmentation. The largest chunk may be big enough to satisfy the write request, but it may not meet the ideal reservation size from the "size hint". The writes would slow to a crawl because every write would search every rgrp, then finally give up and default to a single-block write. In my case, performance would drop from 425MB/s to 18KB/s, or 24000 times slower. This patch basically makes it so that if we can't find a contiguous chunk of blocks big enough to satisfy the size hint, we'll use the largest chunk of blocks we found that will still contain the write. It does so by keeping track of the largest run of blocks within the rgrp.
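The fallback policy can be summarised with a small standalone sketch (simplified and illustrative only; the actual patch tracks the best candidate in a struct gfs2_extent inside gfs2_rbm_find(), as shown in the diff below):

        /* Given the lengths of the free extents seen while scanning an rgrp,
         * prefer one that satisfies the size hint; otherwise fall back to the
         * largest extent that can still hold the write. Returns the chosen
         * length in blocks, or 0 if even the largest extent is too small. */
        static unsigned int choose_extent(const unsigned int *extlen, int n,
                                          unsigned int size_hint,
                                          unsigned int write_blocks)
        {
                unsigned int largest = 0;
                int i;

                for (i = 0; i < n; i++) {
                        if (extlen[i] >= size_hint)
                                return extlen[i];       /* ideal: honour the hint */
                        if (extlen[i] > largest)
                                largest = extlen[i];    /* remember the best so far */
                }
                return largest >= write_blocks ? largest : 0;
        }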
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 64 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c8d6161bd68..809fecd8297 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -57,6 +57,11 @@ * 3 = Used (metadata) */ +struct gfs2_extent { + struct gfs2_rbm rbm; + u32 len; +}; + static const char valid_change[16] = { /* current */ /* n */ 0, 1, 1, 1, @@ -65,8 +70,9 @@ static const char valid_change[16] = { 1, 0, 0, 0 }; -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap); +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap); /** @@ -1455,7 +1461,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); if (ret == 0) { rs->rs_rbm = rbm; rs->rs_free = extlen; @@ -1520,6 +1526,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, * @rbm: The current position in the resource group * @ip: The inode for which we are searching for blocks * @minext: The minimum extent length + * @maxext: A pointer to the maximum extent structure * * This checks the current position in the rgrp to see whether there is * a reservation covering this block. If not then this function is a @@ -1532,7 +1539,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, const struct gfs2_inode *ip, - u32 minext) + u32 minext, + struct gfs2_extent *maxext) { u64 block = gfs2_rbm_to_block(rbm); u32 extlen = 1; @@ -1545,8 +1553,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, */ if (minext) { extlen = gfs2_free_extlen(rbm, minext); - nblock = block + extlen; - if (extlen < minext) + if (extlen <= maxext->len) goto fail; } @@ -1555,9 +1562,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, * and skip if parts of it are already reserved */ nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); - if (nblock == block) - return 0; + if (nblock == block) { + if (!minext || extlen >= minext) + return 0; + + if (extlen > maxext->len) { + maxext->len = extlen; + maxext->rbm = *rbm; + } fail: + nblock = block + extlen; + } ret = gfs2_rbm_from_block(rbm, nblock); if (ret < 0) return ret; @@ -1568,10 +1583,12 @@ fail: * gfs2_rbm_find - Look for blocks of a particular state * @rbm: Value/result starting position and final position * @state: The state which we want to find - * @minext: The requested extent length (0 for a single block) + * @minext: Pointer to the requested extent length (NULL for a single block) + * This is updated to be the actual reservation size. * @ip: If set, check for reservations * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. 
+ * @ap: the allocation parameters * * Side effects: * - If looking for free blocks, we set GBF_FULL on each bitmap which @@ -1580,8 +1597,9 @@ fail: * Returns: 0 on success, -ENOSPC if there is no block of the requested state */ -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap) +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap) { struct buffer_head *bh; int initial_bii; @@ -1592,6 +1610,7 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, int iters = rbm->rgd->rd_length; int ret; struct gfs2_bitmap *bi; + struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, }; /* If we are not starting at the beginning of a bitmap, then we * need to add one to the bitmap count to ensure that we search @@ -1620,7 +1639,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, return 0; initial_bii = rbm->bii; - ret = gfs2_reservation_check_and_update(rbm, ip, minext); + ret = gfs2_reservation_check_and_update(rbm, ip, + minext ? *minext : 0, + &maxext); if (ret == 0) return 0; if (ret > 0) { @@ -1655,6 +1676,17 @@ next_iter: break; } + if (minext == NULL || state != GFS2_BLKST_FREE) + return -ENOSPC; + + /* If the maximum extent we found is big enough to fulfill the + minimum requirements, use it anyway. */ + if (maxext.len) { + *rbm = maxext.rbm; + *minext = maxext.len; + return 0; + } + return -ENOSPC; } @@ -1680,7 +1712,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip while (1) { down_write(&sdp->sd_log_flush_lock); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, + true, NULL); up_write(&sdp->sd_log_flush_lock); if (error == -ENOSPC) break; @@ -2184,11 +2217,12 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, int error; gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); if (error == -ENOSPC) { gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, + NULL); } /* Since all blocks are reserved in advance, this shouldn't happen */ -- cgit v1.2.3-70-g09d2 From 1330edbeaf304703052fb583dd660b96309e4536 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 6 Nov 2013 10:58:00 -0500 Subject: GFS2: Drop inadequate rgrps from the reservation tree This is just basically a resend of a patch I posted earlier. It didn't change from its original, except in diff offsets, etc: This patch fixes a bug in the GFS2 block allocation code. The problem starts if a process already has a multi-block reservation, but for some reason, another process disqualifies it from further allocations. For example, the other process might set on the GFS2_RDF_ERROR bit. The process holding the reservation jumps to label skip_rgrp, but that label comes after the code that removes the reservation from the tree. Therefore, the no longer usable reservation is not removed from the rgrp's reservations tree; it's lost. Eventually, the lost reservation causes the count of reserved blocks to get off, and eventually that causes a BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free) to trigger. 
This patch moves the call to after label skip_rgrp so that the disqualified reservation is properly removed from the tree, thus keeping the rgrp rd_reserved count sane. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 809fecd8297..1ccf89ab42b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1944,15 +1944,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a return 0; } - /* Drop reservation, if we couldn't use reserved rgrp */ - if (gfs2_rs_active(rs)) - gfs2_rs_deltree(rs); check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, ip->i_no_addr); skip_rgrp: + /* Drop reservation, if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(rs); + /* Unlock rgrp if required */ if (!rg_locked) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); -- cgit v1.2.3-70-g09d2 From 5ea5050cec9c02e86ceb5e707a889003f895a690 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 25 Nov 2013 11:16:25 +0000 Subject: GFS2: Implement a "rgrp has no extents longer than X" scheme With the preceding patch, we started accepting block reservations smaller than the ideal size, which requires a lot more parsing of the bitmaps. To reduce the amount of bitmap searching, this patch implements a scheme whereby each rgrp keeps track of the point at this multi-block reservations will fail. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/lops.c | 1 + fs/gfs2/rgrp.c | 32 ++++++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ba1ea67f4ee..01328162c95 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -93,6 +93,7 @@ struct gfs2_rgrpd { struct gfs2_rgrp_lvb *rd_rgl; u32 rd_last_alloc; u32 rd_flags; + u32 rd_extfail_pt; /* extent failure point */ #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 010b9fb9fec..cdadb92372f 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + rgd->rd_extfail_pt = rgd->rd_free; } /** diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 1ccf89ab42b..797f1d3114e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -641,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; + /* The rgrp extent failure point is likely not to increase; + it will only do so if the freed blocks are somehow + contiguous with a span of free blocks that follows. Still, + it will force the number to be recalculated later. 
*/ + rgd->rd_extfail_pt += rs->rs_free; rs->rs_free = 0; clear_bit(GBF_FULL, &bi->bi_flags); - smp_mb__after_clear_bit(); } } @@ -1132,6 +1136,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; } if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); @@ -1593,6 +1599,8 @@ fail: * Side effects: * - If looking for free blocks, we set GBF_FULL on each bitmap which * has no free blocks in it. + * - If looking for free blocks, we set rd_extfail_pt on each rgrp which + * has come up short on a free block search. * * Returns: 0 on success, -ENOSPC if there is no block of the requested state */ @@ -1604,6 +1612,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, struct buffer_head *bh; int initial_bii; u32 initial_offset; + int first_bii = rbm->bii; + u32 first_offset = rbm->offset; u32 offset; u8 *buffer; int n = 0; @@ -1679,6 +1689,13 @@ next_iter: if (minext == NULL || state != GFS2_BLKST_FREE) return -ENOSPC; + /* If the extent was too small, and it's smaller than the smallest + to have failed before, remember for future reference that it's + useless to search this rgrp again for this amount or more. */ + if ((first_offset == 0) && (first_bii == 0) && + (*minext < rbm->rgd->rd_extfail_pt)) + rbm->rgd->rd_extfail_pt = *minext; + /* If the maximum extent we found is big enough to fulfill the minimum requirements, use it anyway. */ if (maxext.len) { @@ -1924,7 +1941,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a } /* Skip unuseable resource groups */ - if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + GFS2_RDF_ERROR)) || + (ap && (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) @@ -2106,10 +2125,10 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) if (rgd == NULL) return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, - rgd->rd_reserved); + rgd->rd_reserved, rgd->rd_extfail_pt); spin_lock(&rgd->rd_rsspin); for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { trs = rb_entry(n, struct gfs2_blkreserv, rs_node); @@ -2228,9 +2247,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, /* Since all blocks are reserved in advance, this shouldn't happen */ if (error) { - fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n", (unsigned long long)ip->i_no_addr, error, *nblocks, - test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags), + rbm.rgd->rd_extfail_pt); goto rgrp_error; } -- cgit v1.2.3-70-g09d2 From e4f2920625cc2c562941b280914823a553f6973d Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 Nov 2013 13:21:08 +0000 Subject: GFS2: Clean up releasepage For historical reasons, we drop and retake the log lock in ->releasepage() however, since there is no reason why we cannot hold the log lock over the whole function, this allows some simplification. 
In particular, pinning a buffer is only ever done under the log lock, so it is possible here to remove the test for pinned buffers in the second loop, since it is impossible for that to happen (it is also tested in the first loop). As a result, two tests made later in the second loop become constants and can also be reduced to the only possible branch. So the net result is to remove various bits of unreachable code and make this more readable. Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 73f3e4ee403..cf858dad75d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1080,30 +1080,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); - gfs2_log_unlock(sdp); head = bh = page_buffers(page); do { - gfs2_log_lock(sdp); bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); - if (!list_empty(&bd->bd_list)) { - if (!buffer_pinned(bh)) - list_del_init(&bd->bd_list); - else - bd = NULL; - } - if (bd) - bd->bd_bh = NULL; + if (!list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + bd->bd_bh = NULL; bh->b_private = NULL; - } - gfs2_log_unlock(sdp); - if (bd) kmem_cache_free(gfs2_bufdata_cachep, bd); + } bh = bh->b_this_page; } while (bh != head); + gfs2_log_unlock(sdp); return try_to_free_buffers(page); -- cgit v1.2.3-70-g09d2 From 7aed98fb1dfbd3f6d7b78647ff2956de2fcc4150 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 Nov 2013 15:17:09 +0000 Subject: GFS2: Remove gfs2_quota_change_host structure There is only one place this is used, when reading in the quota changes at mount time. It is not really required and much simpler to just convert the fields from the on-disk structure as required. There should be no functional change as a result of this patch. Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 98236d0df3c..1b6b3675ee1 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -67,12 +67,6 @@ #include "inode.h" #include "util.h" -struct gfs2_quota_change_host { - u64 qc_change; - u32 qc_flags; /* GFS2_QCF_... 
*/ - struct kqid qc_id; -}; - /* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ static DEFINE_SPINLOCK(qd_lock); struct list_lru gfs2_qd_lru; @@ -1214,17 +1208,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) return error; } -static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) -{ - const struct gfs2_quota_change *str = buf; - - qc->qc_change = be64_to_cpu(str->qc_change); - qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = make_kqid(&init_user_ns, - (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA, - be32_to_cpu(str->qc_id)); -} - int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); @@ -1257,6 +1240,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) for (x = 0; x < blocks; x++) { struct buffer_head *bh; + const struct gfs2_quota_change *qc; unsigned int y; if (!extlen) { @@ -1274,25 +1258,28 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) goto fail; } + qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; y++, slot++) { - struct gfs2_quota_change_host qc; struct gfs2_quota_data *qd; - - gfs2_quota_change_in(&qc, bh->b_data + - sizeof(struct gfs2_meta_header) + - y * sizeof(struct gfs2_quota_change)); - if (!qc.qc_change) + s64 qc_change = be64_to_cpu(qc->qc_change); + u32 qc_flags = be32_to_cpu(qc->qc_flags); + enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? + USRQUOTA : GRPQUOTA; + struct kqid qc_id = make_kqid(&init_user_ns, qtype, + be32_to_cpu(qc->qc_id)); + qc++; + if (!qc_change) continue; - error = qd_alloc(sdp, qc.qc_id, &qd); + error = qd_alloc(sdp, qc_id, &qd); if (error) { brelse(bh); goto fail; } set_bit(QDF_CHANGE, &qd->qd_flags); - qd->qd_change = qc.qc_change; + qd->qd_change = qc_change; qd->qd_slot = slot; qd->qd_slot_count = 1; -- cgit v1.2.3-70-g09d2 From 7de41d36ff5885141a16c74a044936cf878c770f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 4 Dec 2013 11:14:05 +0000 Subject: GFS2: Remove test which is always true Since gfs2_inplace_reserve() is always called with a valid alloc parms structure, there is no need to test for this within the function itself - and in any case, after we've all ready dereferenced it anyway. Reported-by: Dan Carpenter Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 797f1d3114e..2584710f645 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1943,7 +1943,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a /* Skip unuseable resource groups */ if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) || - (ap && (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))) + (ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) -- cgit v1.2.3-70-g09d2 From 7005c3e4ae42858dbb695b2d03d340af799b1f1b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 6 Dec 2013 10:16:14 +0000 Subject: GFS2: Use range based functions for rgrp sync/invalidation Each rgrp header is represented as a single extent on disk, so we can calculate the position within the address space, since we are using address spaces mapped 1:1 to the disk. This means that it is possible to use the range based versions of filemap_fdatawrite/wait and for invalidating the page cache. 
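As a rough sketch of the range calculation (a fragment that assumes the usual GFS2 rgrp context; the patch itself records the same values in gl_vm.start and gl_vm.end in read_rindex_entry(), see the hunk below):

        /* Byte range covered by one rgrp, given its starting block rd_addr and
         * its length rd_length in filesystem blocks of bsize bytes. */
        loff_t start = rgd->rd_addr * bsize;
        loff_t end = start + (rgd->rd_length * bsize) - 1;      /* inclusive */

        filemap_fdatawrite_range(metamapping, start, end);
        filemap_fdatawait_range(metamapping, start, end);
        truncate_inode_pages_range(metamapping, start, end);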
Our eventual intent is to then be able to merge the address spaces used for rgrps into a single address space, rather than to have one for each glock, saving memory and reducing complexity. Since during umount, the rgrp structures are disposed of before the glocks, we need to store the extent information in the glock so that it is available for a final invalidation. This patch uses a field which is otherwise unused in rgrp glocks to do that, so that we do not have to expand the size of a glock. Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 6 +++--- fs/gfs2/incore.h | 10 +++++++++- fs/gfs2/rgrp.c | 3 +++ 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f88dcd92501..1b192c8d404 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -142,8 +142,8 @@ static void rgrp_go_sync(struct gfs2_glock *gl) GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(gl->gl_sbd, gl); - filemap_fdatawrite(metamapping); - error = filemap_fdatawait(metamapping); + filemap_fdatawrite_range(metamapping, gl->gl_vm.start, gl->gl_vm.end); + error = filemap_fdatawait_range(metamapping, gl->gl_vm.start, gl->gl_vm.end); mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); @@ -170,7 +170,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) WARN_ON_ONCE(!(flags & DIO_METADATA)); gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); - truncate_inode_pages(mapping, 0); + truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); if (gl->gl_object) { struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 01328162c95..e6544ee0c13 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -351,7 +351,15 @@ struct gfs2_glock { atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; - struct work_struct gl_delete; + union { + /* For inode and iopen glocks only */ + struct work_struct gl_delete; + /* For rgrp glocks only */ + struct { + loff_t start; + loff_t end; + } gl_vm; + }; struct rcu_head gl_rcu; }; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2584710f645..183cf0f0052 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -886,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); struct gfs2_rindex buf; int error; @@ -923,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; + rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; + rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) -- cgit v1.2.3-70-g09d2 From 70d4ee94b370c5ef54d0870600f16bd92d18013c Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 6 Dec 2013 16:19:54 +0000 Subject: GFS2: Use only a single address space for rgrps Prior to this patch, GFS2 had one address space for each rgrp, stored in the glock. This patch changes them to use a single address space in the super block. This therefore saves (sizeof(struct address_space) * nr_of_rgrps) bytes of memory and for large filesystems, that can be significant. 
It would be nice to be able to do something similar and merge the inode metadata address space into the same global address space. However, that is rather more complicated as the on-disk location doesn't have a 1:1 mapping with the inodes in general. So while it could be done, it will be a more complicated operation as it requires changing a lot more code paths. Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 18 ++++++++++-------- fs/gfs2/incore.h | 2 ++ fs/gfs2/lops.c | 4 ++++ fs/gfs2/meta_io.c | 3 +++ fs/gfs2/ops_fstype.c | 12 ++++++++++++ 5 files changed, 31 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 1b192c8d404..b792dbcc83f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static void rgrp_go_sync(struct gfs2_glock *gl) { - struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd; int error; @@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_sbd, gl); - filemap_fdatawrite_range(metamapping, gl->gl_vm.start, gl->gl_vm.end); - error = filemap_fdatawait_range(metamapping, gl->gl_vm.start, gl->gl_vm.end); - mapping_set_error(metamapping, error); + gfs2_log_flush(sdp, gl); + filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + mapping_set_error(mapping, error); gfs2_ail_empty_gl(gl); spin_lock(&gl->gl_spin); @@ -166,10 +167,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl) static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; WARN_ON_ONCE(!(flags & DIO_METADATA)); - gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); if (gl->gl_object) { @@ -558,7 +560,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_flags = GLOF_ASPACE | GLOF_LVB, + .go_flags = GLOF_LVB, }; const struct gfs2_glock_operations gfs2_trans_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e6544ee0c13..a99f60c9884 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -736,6 +736,8 @@ struct gfs2_sbd { /* Log stuff */ + struct address_space sd_aspace; + spinlock_t sd_log_lock; struct gfs2_trans *sd_log_tr; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index cdadb92372f..58f06400b7b 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -589,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, static void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; int error; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 52f177be3bf..c7f24690ed0 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) unsigned long index; unsigned int bufnum; + if 
(mapping == NULL) + mapping = &sdp->sd_aspace; + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; index = blkno >> shift; /* convert block to page */ bufnum = blkno - (index << shift); /* block buf index within page */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 52fa88314f5..94aab31477b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -36,6 +36,7 @@ #include "log.h" #include "quota.h" #include "dir.h" +#include "meta_io.h" #include "trace_gfs2.h" #define DO 0 @@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; + struct address_space *mapping; sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); if (!sdp) @@ -98,6 +100,16 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_trunc_list); spin_lock_init(&sdp->sd_trunc_lock); + mapping = &sdp->sd_aspace; + + mapping->a_ops = &gfs2_meta_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->backing_dev_info = sb->s_bdi; + mapping->writeback_index = 0; + spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); INIT_LIST_HEAD(&sdp->sd_log_le_buf); -- cgit v1.2.3-70-g09d2 From a9f7b4a06c9704fa3cfe0b0601347e03289a7407 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 9 Dec 2013 19:31:21 +0800 Subject: nfsd: clean up an xdr reserved space calculation We should use XDR_LEN to calculate reserved space in case the oid is not a multiple of 4. RESERVE_SPACE actually rounds up for us, but it's probably better to be careful here. Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1dface03bd3..dbd64a9d268 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3263,7 +3263,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (rpcauth_get_gssinfo(pf, &info) == 0) { supported++; - RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4); + RESERVE_SPACE(4 + 4 + XDR_LEN(info.oid.len) + 4 + 4); WRITE32(RPC_AUTH_GSS); WRITE32(info.oid.len); WRITEMEM(info.oid.data, info.oid.len); -- cgit v1.2.3-70-g09d2 From 43212cc7dfee0ca33d1f0f23652c70317ee031e6 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 9 Dec 2013 19:04:23 +0800 Subject: nfsd: using nfsd4_encode_noop for encoding destroy_session/free_stateid Get rid of the extra code, using nfsd4_encode_noop for encoding destroy_session and free_stateid. And, delete unused argument (fr_status) int nfsd4_free_stateid. Signed-off-by: Kinglong Mee Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 26 ++------------------------ fs/nfsd/xdr4.h | 1 - 2 files changed, 2 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index dbd64a9d268..776d2f639d6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3500,28 +3500,6 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, return 0; } -static __be32 -nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_destroy_session *destroy_session) -{ - return nfserr; -} - -static __be32 -nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_free_stateid *free_stateid) -{ - __be32 *p; - - if (nfserr) - return nfserr; - - RESERVE_SPACE(4); - *p++ = nfserr; - ADJUST_ARGS(); - return nfserr; -} - static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) @@ -3620,8 +3598,8 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index b3ed6446ed8..916a4073731 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -374,7 +374,6 @@ struct nfsd4_test_stateid { struct nfsd4_free_stateid { stateid_t fr_stateid; /* request */ - __be32 fr_status; /* response */ }; /* also used for NVERIFY */ -- cgit v1.2.3-70-g09d2 From eba1c99ce4590506516ec801d991e36aa8b0d436 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 9 Dec 2013 18:23:46 +0800 Subject: nfsd: clean up unnecessary temporary variable in nfsd4_decode_fattr host_err was only used for nfs4_acl_new. This patch delete it, and return nfserr_jukebox directly. Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 776d2f639d6..b77f6bdd522 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -276,7 +276,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, int expected_len, len = 0; u32 dummy32; char *buf; - int host_err; DECODE_HEAD; iattr->ia_valid = 0; @@ -303,10 +302,9 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_resource; *acl = nfs4_acl_new(nace); - if (*acl == NULL) { - host_err = -ENOMEM; - goto out_nfserr; - } + if (*acl == NULL) + return nfserr_jukebox; + defer_free(argp, kfree, *acl); (*acl)->naces = nace; @@ -444,10 +442,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, goto xdr_error; DECODE_TAIL; - -out_nfserr: - status = nfserrno(host_err); - goto out; } static __be32 -- cgit v1.2.3-70-g09d2 From dfeecc829eb8e4ccbbab2ebc9b81b4cebec7fad4 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 9 Dec 2013 18:10:53 +0800 Subject: nfsd: get rid of unused macro definition Since defined in Linux-2.6.12-rc2, READTIME has not been used. Signed-off-by: Kinglong Mee Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b77f6bdd522..5bef9cb84b0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -103,11 +103,6 @@ xdr_error: \ (x) = (u64)ntohl(*p++) << 32; \ (x) |= ntohl(*p++); \ } while (0) -#define READTIME(x) do { \ - p++; \ - (x) = ntohl(*p++); \ - p++; \ -} while (0) #define READMEM(x,nbytes) do { \ x = (char *)p; \ p += XDR_QUADLEN(nbytes); \ } while (0) -- cgit v1.2.3-70-g09d2 From 2072552c33c82e7f0c7adfac7965ecf40b739f1a Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Mon, 2 Dec 2013 18:37:05 +0800 Subject: jffs2: NULL return of kmem_cache_zalloc should be handled NULL return of kmem_cache_zalloc should be handled in jffs2_alloc_xattr_datum and jffs2_alloc_xattr_ref. Signed-off-by: Zhouyi Zhou Signed-off-by: Brian Norris --- fs/jffs2/malloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index 4f47aa24b55..b8fd651307a 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -288,6 +288,8 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) struct jffs2_xattr_datum *xd; xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); dbg_memalloc("%p\n", xd); + if (!xd) + return NULL; xd->class = RAWNODE_CLASS_XATTR_DATUM; xd->node = (void *)xd; @@ -306,6 +308,8 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) struct jffs2_xattr_ref *ref; ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); dbg_memalloc("%p\n", ref); + if (!ref) + return NULL; ref->class = RAWNODE_CLASS_XATTR_REF; ref->node = (void *)ref; -- cgit v1.2.3-70-g09d2 From 2ce02b6b6cf3532df143b85a72bacd611a55616a Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 26 Nov 2013 22:25:20 +0800 Subject: Add missing recording of back channel attrs in nfsd4_session commit 5b6feee9608dce7afd2646f457c93e612526d1d8 forgot to record the back channel attrs in nfsd4_session. nfsd just checks the back channel attrs via check_backchannel_attrs, but does not record them in nfsd4_session in the latest kernel. Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 105d6fa7c51..1aed9be5641 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -999,6 +999,8 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru spin_unlock(&nn->client_lock); memcpy(&new->se_fchannel, &cses->fore_channel, sizeof(struct nfsd4_channel_attrs)); + memcpy(&new->se_bchannel, &cses->back_channel, + sizeof(struct nfsd4_channel_attrs)); if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); /* -- cgit v1.2.3-70-g09d2 From f403e450e85cd403b63fd163d29b6b7f5e8eaf77 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 23 Dec 2013 17:31:21 +0800 Subject: NFSD: fix a leak which can cause CREATE_SESSION failures check_forechannel_attrs gets drc memory, so nfsd must put it when check_backchannel_attrs fails. After many requests with bad back channel attrs, nfsd will deny any client's CREATE_SESSION forever. A new test case named CSESS29 for pynfs will be sent in another mail. Signed-off-by: Kinglong Mee Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1aed9be5641..9a6d088247f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1915,7 +1915,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) - return status; + goto out_release_drc_mem; status = nfserr_jukebox; new = alloc_session(&cr_ses->fore_channel); if (!new) -- cgit v1.2.3-70-g09d2 From 8a891633b832874e2a1545abbddfd33ba22eb016 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 23 Dec 2013 18:11:02 +0800 Subject: NFSD: fix bad length checking for backchannel the length for backchannel checking should be multiplied by sizeof(__be32). Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9a6d088247f..acb95026ae3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1853,6 +1853,11 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs return nfs_ok; } +#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ + RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32)) +#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ + RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32)) + static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) { ca->headerpadsz = 0; @@ -1863,9 +1868,9 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) * less than 1k. Tighten up this estimate in the unlikely event * it turns out to be a problem for some client: */ - if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH) + if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) return nfserr_toosmall; - if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH) + if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) return nfserr_toosmall; ca->maxresp_cached = 0; if (ca->maxops < 2) -- cgit v1.2.3-70-g09d2 From 7e55b59b2f32afc83452ae250dfd6173c9a7b515 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 31 Dec 2013 13:17:20 +0800 Subject: SUNRPC/NFSD: Support a new option for ignoring the result of svc_register NFSv4 clients can contact port 2049 directly instead of needing the portmapper. Therefore a failure to register to the portmapper when starting an NFSv4-only server isn't really a problem. But Gareth Williams reports that an attempt to start an NFSv4-only server without starting portmap fails: #rpc.nfsd -N 2 -N 3 rpc.nfsd: writing fd to kernel failed: errno 111 (Connection refused) rpc.nfsd: unable to set any sockets for nfsd Add a flag to svc_version to tell the rpc layer it can safely ignore an rpcbind failure in the NFSv4-only case. Reported-by: Gareth Williams Reviewed-by: Chuck Lever Signed-off-by: Kinglong Mee Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + include/linux/sunrpc/svc.h | 4 +++- net/sunrpc/svc.c | 25 +++++++++++++++++-------- 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 7bac4bdbdde..41e34dfd4e5 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1882,6 +1882,7 @@ struct svc_version nfsd_version4 = { .vs_proc = nfsd_procedures4, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = 1, }; /* diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index b631642318c..04e76322124 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -386,8 +386,10 @@ struct svc_version { struct svc_procedure * vs_proc; /* per-procedure info */ u32 vs_xdrsize; /* xdrsize needed for this version */ - unsigned int vs_hidden : 1; /* Don't register with portmapper. + unsigned int vs_hidden : 1, /* Don't register with portmapper. * Only used for nfsacl so far. */ + vs_rpcb_optnl:1;/* Don't care the result of register. + * Only used for nfsv4. */ /* Override dispatch function (e.g. when caching replies). * A return value of 0 means drop the request. diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e7fbe368b4a..5de6801cd92 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -916,9 +916,6 @@ static int __svc_register(struct net *net, const char *progname, #endif } - if (error < 0) - printk(KERN_WARNING "svc: failed to register %sv%u RPC " - "service (errno %d).\n", progname, version, -error); return error; } @@ -937,6 +934,7 @@ int svc_register(const struct svc_serv *serv, struct net *net, const unsigned short port) { struct svc_program *progp; + struct svc_version *vers; unsigned int i; int error = 0; @@ -946,7 +944,8 @@ int svc_register(const struct svc_serv *serv, struct net *net, for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { - if (progp->pg_vers[i] == NULL) + vers = progp->pg_vers[i]; + if (vers == NULL) continue; dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n", @@ -955,16 +954,26 @@ int svc_register(const struct svc_serv *serv, struct net *net, proto == IPPROTO_UDP? "udp" : "tcp", port, family, - progp->pg_vers[i]->vs_hidden? - " (but not telling portmap)" : ""); + vers->vs_hidden ? + " (but not telling portmap)" : ""); - if (progp->pg_vers[i]->vs_hidden) + if (vers->vs_hidden) continue; error = __svc_register(net, progp->pg_name, progp->pg_prog, i, family, proto, port); - if (error < 0) + + if (vers->vs_rpcb_optnl) { + error = 0; + continue; + } + + if (error < 0) { + printk(KERN_WARNING "svc: failed to register " + "%sv%u RPC service (errno %d).\n", + progp->pg_name, i, -error); break; + } } } -- cgit v1.2.3-70-g09d2 From 8ef667140c52e9b88934664954217f28559c75d6 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 31 Dec 2013 13:17:30 +0800 Subject: NFSD: Don't start lockd when only NFSv4 is running When starting without nfsv2 and nfsv3, nfsd does not need to start lockd (and certainly doesn't need to fail because lockd failed to register with the portmapper). Reported-by: Gareth Williams Reviewed-by: Chuck Lever Signed-off-by: Kinglong Mee Signed-off-by: J. 
Bruce Fields --- fs/nfsd/netns.h | 1 + fs/nfsd/nfssvc.c | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 849a7c3ced2..d32b3aa6600 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -95,6 +95,7 @@ struct nfsd_net { time_t nfsd4_grace; bool nfsd_net_up; + bool lockd_up; /* * Time of server startup diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 760c85a6f53..55b5b57b571 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -241,6 +241,11 @@ static void nfsd_shutdown_generic(void) nfsd_racache_shutdown(); } +static bool nfsd_needs_lockd(void) +{ + return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL); +} + static int nfsd_startup_net(int nrservs, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -255,9 +260,14 @@ static int nfsd_startup_net(int nrservs, struct net *net) ret = nfsd_init_socks(net); if (ret) goto out_socks; - ret = lockd_up(net); - if (ret) - goto out_socks; + + if (nfsd_needs_lockd() && !nn->lockd_up) { + ret = lockd_up(net); + if (ret) + goto out_socks; + nn->lockd_up = 1; + } + ret = nfs4_state_start_net(net); if (ret) goto out_lockd; @@ -266,7 +276,10 @@ static int nfsd_startup_net(int nrservs, struct net *net) return 0; out_lockd: - lockd_down(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } out_socks: nfsd_shutdown_generic(); return ret; @@ -277,7 +290,10 @@ static void nfsd_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_state_shutdown_net(net); - lockd_down(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } nn->nfsd_net_up = false; nfsd_shutdown_generic(); } -- cgit v1.2.3-70-g09d2 From a8c2275493b866961f4429a741251c630c4fc6d7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 21 Dec 2013 05:39:04 +0100 Subject: nfs: fix dead code of ipv6_addr_scope The correct way to check on IPV6_ADDR_SCOPE_LINKLOCAL is to check with the ipv6_addr_src_scope function. Currently this can't be work, because ipv6_addr_scope returns a int with a mask of IPV6_ADDR_SCOPE_MASK (0x00f0U) and IPV6_ADDR_SCOPE_LINKLOCAL is 0x02. So the condition is always false. Signed-off-by: Alexander Aring Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayoutdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index c7c295e556e..efac602edb3 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -95,7 +95,7 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) b6 = (struct sockaddr_in6 *)addr2; /* LINKLOCAL addresses must have matching scope_id */ - if (ipv6_addr_scope(&a6->sin6_addr) == + if (ipv6_addr_src_scope(&a6->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && a6->sin6_scope_id != b6->sin6_scope_id) return false; -- cgit v1.2.3-70-g09d2 From 1e8968c5b0582392d5f132422f581e3ebc24e627 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Tue, 17 Dec 2013 18:20:16 +0100 Subject: NFS: dprintk() should not print negative fileids and inode numbers A fileid in NFS is a uint64. There are some occurrences where dprintk() outputs a signed fileid. This leads to confusion and more difficult to read debugging (negative fileids matching positive inode numbers). 
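For illustration only (this snippet is not part of the patch), the pattern applied throughout the fix is to cast the u64 fileid to unsigned long long and print it with an unsigned conversion specifier:

  /* hypothetical example: a u64 fileid should be printed unsigned */
  dprintk("NFS: revalidating (%s/%llu)\n",
          inode->i_sb->s_id,
          (unsigned long long)NFS_FILEID(inode));

A signed cast combined with %lld or %ld renders large fileids as negative numbers, which is exactly what the changes below avoid.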
Signed-off-by: Niels de Vos CC: Santosh Pradhan Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 18 +++++++++--------- fs/nfs/direct.c | 4 ++-- fs/nfs/file.c | 6 +++--- fs/nfs/inode.c | 29 +++++++++++++++-------------- fs/nfs/nfs3acl.c | 4 ++-- fs/nfs/nfs4filelayout.c | 8 ++++---- fs/nfs/read.c | 12 ++++++------ fs/nfs/write.c | 8 ++++---- 8 files changed, 45 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 812154aff98..b266f734bd5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1404,7 +1404,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* Expect a negative dentry */ BUG_ON(dentry->d_inode); - dfprintk(VFS, "NFS: atomic_open(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); err = nfs_check_flags(open_flags); @@ -1594,7 +1594,7 @@ int nfs_create(struct inode *dir, struct dentry *dentry, int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; - dfprintk(VFS, "NFS: create(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; @@ -1621,7 +1621,7 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) struct iattr attr; int status; - dfprintk(VFS, "NFS: mknod(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); if (!new_valid_dev(rdev)) @@ -1650,7 +1650,7 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct iattr attr; int error; - dfprintk(VFS, "NFS: mkdir(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; @@ -1678,7 +1678,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: rmdir(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); @@ -1747,7 +1747,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) int error; int need_rehash = 0; - dfprintk(VFS, "NFS: unlink(%s/%ld, %pd)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); @@ -1798,7 +1798,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) unsigned int pathlen = strlen(symname); int error; - dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) @@ -1821,7 +1821,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { - dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s) error %d\n", + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); @@ -2304,7 +2304,7 @@ out: if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) res = -EACCES; - dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d71d66c9e0a..049b3408fb9 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -237,9 +237,9 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) static void 
nfs_direct_readpage_release(struct nfs_page *req) { - dprintk("NFS: direct read done (%s/%lld %d@%lld)\n", + dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e2fcacf07de..5bb790a69c7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page *page; int once_thru = 0; - dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%ld), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); start: @@ -395,7 +395,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, struct nfs_open_context *ctx = nfs_file_open_context(file); int status; - dfprintk(PAGECACHE, "NFS: write_end(%pD2(%ld), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); /* @@ -585,7 +585,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) int ret = VM_FAULT_NOPAGE; struct address_space *mapping; - dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%ld), offset %lld)\n", + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 00ad1c2b217..5feec233895 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -458,9 +458,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", + dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); @@ -870,8 +870,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); - dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", + inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); trace_nfs_revalidate_inode_enter(inode); @@ -895,9 +895,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); if (status != 0) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); if (status == -ESTALE) { nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) @@ -908,9 +908,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = nfs_refresh_inode(inode, fattr); if (status) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); goto err_out; } @@ -919,9 +919,9 @@ 
__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) nfs_setsecurity(inode, fattr, label); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", + dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode)); + (unsigned long long)NFS_FILEID(inode)); err_out: nfs4_label_free(label); @@ -985,8 +985,9 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode)); return 0; } @@ -1434,7 +1435,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; unsigned long save_cache_validity; - dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n", + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); @@ -1455,7 +1456,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Big trouble! The inode has become a different object. */ - printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n", + printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 4a1aafba6a2..34e918d265b 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -130,7 +130,7 @@ static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) void nfs3_forget_cached_acls(struct inode *inode) { - dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, + dprintk("NFS: nfs3_forget_cached_acls(%s/%lu)\n", inode->i_sb->s_id, inode->i_ino); spin_lock(&inode->i_lock); __nfs3_forget_cached_acls(NFS_I(inode)); @@ -161,7 +161,7 @@ static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) acl = posix_acl_dup(acl); out: spin_unlock(&inode->i_lock); - dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, + dprintk("NFS: nfs3_get_cached_acl(%s/%lu, %d) = %p\n", inode->i_sb->s_id, inode->i_ino, type, acl); return acl; } diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index b86464ba25e..0a93e798df3 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -91,10 +91,10 @@ static void filelayout_reset_write(struct nfs_write_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " - "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, data->task.tk_pid, hdr->inode->i_sb->s_id, - (long long)NFS_FILEID(hdr->inode), + (unsigned long long)NFS_FILEID(hdr->inode), data->args.count, (unsigned long long)data->args.offset); @@ -112,10 +112,10 @@ static void filelayout_reset_read(struct nfs_read_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " - "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, data->task.tk_pid, hdr->inode->i_sb->s_id, - (long long)NFS_FILEID(hdr->inode), + (unsigned long long)NFS_FILEID(hdr->inode), data->args.count, (unsigned long 
long)data->args.offset); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 31db5c366b8..411aedda14b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -163,9 +163,9 @@ static void nfs_readpage_release(struct nfs_page *req) unlock_page(req->wb_page); - dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", + dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -228,11 +228,11 @@ int nfs_initiate_read(struct rpc_clnt *clnt, /* Set up the initial task struct. */ NFS_PROTO(inode)->read_setup(data, &msg); - dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " + dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ " "offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), data->args.count, (unsigned long long)data->args.offset); @@ -630,9 +630,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, unsigned long npages; int ret = -ESTALE; - dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", + dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c1d548211c3..77a00c66c7d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1013,10 +1013,10 @@ int nfs_initiate_write(struct rpc_clnt *clnt, NFS_PROTO(inode)->write_setup(data, &msg); dprintk("NFS: %5u initiated write call " - "(req %s/%lld, %u bytes @ offset %llu)\n", + "(req %s/%llu, %u bytes @ offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), data->args.count, (unsigned long long)data->args.offset); @@ -1606,9 +1606,9 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_list_remove_request(req); nfs_clear_page_commit(req->wb_page); - dprintk("NFS: commit (%s/%lld %d@%lld)", + dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { -- cgit v1.2.3-70-g09d2 From 16a6ddc70920a0686dbf90e092a539c1a4fd7b77 Mon Sep 17 00:00:00 2001 From: Toralf Förster Date: Thu, 12 Dec 2013 19:09:00 +0100 Subject: point to the right include file in a comment (left over from a9004abc3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Toralf Förster Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 059c01b67a7..e5be72518bd 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1071,7 +1071,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid) /* * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { @@ -1116,7 +1116,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) /* * Increment the seqid if the LOCK/LOCKU succeeded, or * 
failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) { -- cgit v1.2.3-70-g09d2 From 8230a0a49fc3f587644b284e51ac092607aa74bf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Dec 2013 11:13:21 +0900 Subject: f2fs: convert inline_data for punch_hole In the punch_hole(), let's convert inline_data all the time for simplicity and to avoid potential deadlock conditions. It is pretty much not a big deal to do this. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7ef2d6af24c..f64a1c8291a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -462,25 +462,9 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t off_start, off_end; int ret = 0; - if (f2fs_has_inline_data(inode)) { - struct page *page; - unsigned flags = AOP_FLAG_NOFS; - page = grab_cache_page_write_begin(inode->i_mapping, 0, flags); - if (IS_ERR(page)) - return PTR_ERR(page); - if (offset + len > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_data(inode, page, flags); - f2fs_put_page(page, 1); - if (ret) - return ret; - } else { - zero_user_segment(page, offset, offset + len); - SetPageUptodate(page); - set_page_dirty(page); - f2fs_put_page(page, 1); - return ret; - } - } + ret = f2fs_convert_inline_data(inode, NULL, AOP_FLAG_NOFS); + if (ret) + return ret; pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; -- cgit v1.2.3-70-g09d2 From 26f466f4a948ddc765f9b474ad6e0bdb94fb1a66 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Dec 2013 12:21:49 +0900 Subject: f2fs: call f2fs_put_page at the error case In f2fs_write_begin(), if f2fs_convert_inline_data() returns an error like -ENOSPC, f2fs should call f2fs_put_page(). Otherwise, it remains a locked page, resulting in the following bug. [] sleep_on_page+0xe/0x20 [] __lock_page+0x67/0x70 [] truncate_inode_pages_range+0x368/0x5d0 [] truncate_inode_pages+0x15/0x20 [] truncate_pagecache+0x4b/0x70 [] truncate_setsize+0x12/0x20 [] f2fs_setattr+0x72/0x270 [f2fs] [] notify_change+0x213/0x400 [] do_truncate+0x66/0xa0 [] vfs_truncate+0x191/0x1b0 [] do_sys_truncate+0x5c/0xa0 [] SyS_truncate+0xe/0x10 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bf39eed2442..253e6633dbf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -916,8 +916,10 @@ repeat: goto inline_data; } else if (f2fs_has_inline_data(inode)) { err = f2fs_convert_inline_data(inode, page, flags); - if (err) + if (err) { + f2fs_put_page(page, 1); return err; + } } f2fs_lock_op(sbi); -- cgit v1.2.3-70-g09d2 From 9e09fc855dd6f6ed510b3db7f3c3c1dd73631ac7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Dec 2013 12:28:59 +0900 Subject: f2fs: refactor f2fs_convert_inline_data Change log from v1: o handle NULL pointer of grab_cache_page_write_begin() pointed out by Chao Yu. This patch refactors f2fs_convert_inline_data to check a couple of conditions internally for deciding whether it needs to convert inline_data or not. 
So, the new f2fs_convert_inline_data initially checks: 1) f2fs_has_inline_data(), and 2) the data size to be changed. If the inode has inline_data but the size to fill is less than MAX_INLINE_DATA, then we don't need to convert the inline_data with data allocation. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 16 ++++++---------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 25 +++++++++---------------- fs/f2fs/inline.c | 26 ++++++++++++-------------- 4 files changed, 28 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 253e6633dbf..fc7a28c5ad2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -906,21 +906,17 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_balance_fs(sbi); repeat: + err = f2fs_convert_inline_data(inode, pos + len); + if (err) + return err; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; - if ((pos + len) < MAX_INLINE_DATA) { - if (f2fs_has_inline_data(inode)) - goto inline_data; - } else if (f2fs_has_inline_data(inode)) { - err = f2fs_convert_inline_data(inode, page, flags); - if (err) { - f2fs_put_page(page, 1); - return err; - } - } + if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA) + goto inline_data; f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b01ccaa73b2..af35039c38f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1302,6 +1302,6 @@ extern const struct inode_operations f2fs_special_inode_operations; inline int f2fs_has_inline_data(struct inode *); bool f2fs_may_inline(struct inode *); int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_data(struct inode *, struct page *, unsigned); +int f2fs_convert_inline_data(struct inode *, pgoff_t); int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f64a1c8291a..68dd7bfce1a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -370,17 +370,12 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - if (f2fs_has_inline_data(inode) && - (attr->ia_size > MAX_INLINE_DATA)) { - unsigned flags = AOP_FLAG_NOFS; - err = f2fs_convert_inline_data(inode, NULL, flags); - if (err) - return err; - } + err = f2fs_convert_inline_data(inode, attr->ia_size); + if (err) + return err; truncate_setsize(inode, attr->ia_size); - if (!f2fs_has_inline_data(inode)) - f2fs_truncate(inode); + f2fs_truncate(inode); f2fs_balance_fs(F2FS_SB(inode->i_sb)); } @@ -462,7 +457,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t off_start, off_end; int ret = 0; - ret = f2fs_convert_inline_data(inode, NULL, AOP_FLAG_NOFS); + ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); if (ret) return ret; @@ -512,16 +507,14 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; - if (f2fs_has_inline_data(inode) && (offset + len > MAX_INLINE_DATA)) { - ret = f2fs_convert_inline_data(inode, NULL, AOP_FLAG_NOFS); - if (ret) - return ret; - } - ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; + ret = f2fs_convert_inline_data(inode, offset + len); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 
62c72aa84ac..e8891aa3ab8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -101,6 +101,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) dst_addr = kmap(page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); kunmap(page); + SetPageUptodate(page); /* write data page to try to make data consistent */ set_page_writeback(page); @@ -120,25 +121,22 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) return err; } -int f2fs_convert_inline_data(struct inode *inode, - struct page *p, unsigned flags) +int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) { - int err; struct page *page; + int err; - if (!p || p->index) { - page = grab_cache_page_write_begin(inode->i_mapping, 0, flags); - if (IS_ERR(page)) - return PTR_ERR(page); - } else { - page = p; - } - - err = __f2fs_convert_inline_data(inode, page); + if (!f2fs_has_inline_data(inode)) + return 0; + else if (to_size <= MAX_INLINE_DATA) + return 0; - if (!p || p->index) - f2fs_put_page(page, 1); + page = grab_cache_page_write_begin(inode->i_mapping, 0, AOP_FLAG_NOFS); + if (!page) + return -ENOMEM; + err = __f2fs_convert_inline_data(inode, page); + f2fs_put_page(page, 1); return err; } -- cgit v1.2.3-70-g09d2 From 0dbdc2ae9bba0a358816cc4a22e41a6ef16db8a2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 26 Nov 2013 11:08:57 +0900 Subject: f2fs: add the number of inline_data files to status info This patch adds the number of inline_data files into the status information. Note that the number is reset whenever the filesystem is newly mounted. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 3 +++ fs/f2fs/f2fs.h | 22 ++++++++++++++++++++-- fs/f2fs/inline.c | 7 ++----- fs/f2fs/inode.c | 1 + 4 files changed, 26 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 61adbcbe9b8..711a0d4d1cb 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -45,6 +45,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_inode = sbi->inline_inode; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -200,6 +201,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index af35039c38f..0b6badadbfd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -460,6 +460,7 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + int inline_inode; /* # of inline_data inodes */ int bg_gc; /* background gc calls */ unsigned int n_dirty_dirs; /* # of dir inodes */ #endif @@ -992,6 +993,11 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); +} + static inline void *inline_data_addr(struct page *page) { struct f2fs_inode *ri; @@ -1201,7 +1207,7 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, 
sits, fnids; int total_count, utilization; - int bg_gc; + int bg_gc, inline_inode; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -1230,6 +1236,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) #define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) #define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode++); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode--); \ + } while (0) + #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ @@ -1273,6 +1290,8 @@ void f2fs_destroy_root_stats(void); #define stat_dec_dirty_dir(sbi) #define stat_inc_total_hit(sb) #define stat_inc_read_hit(sb) +#define stat_inc_inline_inode(inode) +#define stat_dec_inline_inode(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_seg_count(si, type) @@ -1299,7 +1318,6 @@ extern const struct inode_operations f2fs_special_inode_operations; /* * inline.c */ -inline int f2fs_has_inline_data(struct inode *); bool f2fs_may_inline(struct inode *); int f2fs_read_inline_data(struct inode *, struct page *); int f2fs_convert_inline_data(struct inode *, pgoff_t); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e8891aa3ab8..3c9261cd215 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -13,11 +13,6 @@ #include "f2fs.h" -inline int f2fs_has_inline_data(struct inode *inode) -{ - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); -} - bool f2fs_may_inline(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -113,6 +108,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) zero_user_segment(ipage, INLINE_DATA_OFFSET, INLINE_DATA_OFFSET + MAX_INLINE_DATA); clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_dec_inline_inode(inode); sync_inode_page(&dn); f2fs_put_page(ipage, 1); @@ -165,6 +161,7 @@ int f2fs_write_inline_data(struct inode *inode, if (!f2fs_has_inline_data(inode)) { truncate_data_blocks_range(&dn, 1); set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_inc_inline_inode(inode); } sync_inode_page(&dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a91f45177cd..915f9a8f3ee 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -277,6 +277,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_lock_op(sbi); remove_inode_page(inode); + stat_dec_inline_inode(inode); f2fs_unlock_op(sbi); sb_end_intwrite(inode->i_sb); -- cgit v1.2.3-70-g09d2 From 1e1bb4baf10be371f72150e2801d97a04d40b3b9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 26 Dec 2013 12:49:48 +0900 Subject: f2fs: add inline_data recovery routine This patch adds a inline_data recovery routine with the following policy. [prev.] 
[next] of inline_data flag o o -> recover inline_data o x -> remove inline_data, and then recover data blocks x o -> remove inline_data, and then recover inline_data x x -> recover data blocks Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 2 +- fs/f2fs/inline.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/recovery.c | 8 ++++++-- include/linux/f2fs_fs.h | 3 +-- 5 files changed, 58 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0b6badadbfd..07a7ae0d441 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1015,6 +1015,7 @@ static inline int f2fs_readonly(struct super_block *sb) */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); +int truncate_blocks(struct inode *, u64); void f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); @@ -1322,4 +1323,5 @@ bool f2fs_may_inline(struct inode *); int f2fs_read_inline_data(struct inode *, struct page *); int f2fs_convert_inline_data(struct inode *, pgoff_t); int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); +int recover_inline_data(struct inode *, struct page *); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 68dd7bfce1a..c77ad4d8b56 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -251,7 +251,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) f2fs_put_page(page, 1); } -static int truncate_blocks(struct inode *inode, u64 from) +int truncate_blocks(struct inode *inode, u64 from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blocksize = inode->i_sb->s_blocksize; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3c9261cd215..2a756e57aed 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -169,3 +169,51 @@ int f2fs_write_inline_data(struct inode *inode, return 0; } + +int recover_inline_data(struct inode *inode, struct page *npage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + struct page *ipage; + + /* + * The inline_data recovery policy is as follows. + * [prev.] 
[next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove inline_data, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(npage)) + ri = F2FS_INODE(npage); + + if (f2fs_has_inline_data(inode) && + ri && ri->i_inline & F2FS_INLINE_DATA) { +process_inline: + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + src_addr = inline_data_addr(npage); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + return -1; + } + + if (f2fs_has_inline_data(inode)) { + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { + truncate_blocks(inode, 0); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + goto process_inline; + } + return 0; +} diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 96e690b6f0f..655791e518c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -298,6 +298,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct node_info ni; int err = 0, recovered = 0; + if (recover_inline_data(inode, page)) + goto out; + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) end = start + ADDRS_PER_INODE(fi); @@ -305,12 +308,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, end = start + ADDRS_PER_BLOCK; f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { f2fs_unlock_op(sbi); - return err; + goto out; } wait_on_page_writeback(dn.node_page); @@ -361,7 +365,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, err: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - +out: f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " "recovered_data = %d blocks, err = %d", inode->i_ino, recovered, err); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index aea5eedbb1c..da74d878dc4 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -155,12 +155,11 @@ struct f2fs_extent { #define F2FS_INLINE_XATTR 0x01 /* file inline xattr flag */ #define F2FS_INLINE_DATA 0x02 /* file inline data flag */ - #define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ F2FS_INLINE_XATTR_ADDRS - 1)) #define INLINE_DATA_OFFSET (PAGE_CACHE_SIZE - sizeof(struct node_footer) \ - - sizeof(__le32)*(DEF_ADDRS_PER_INODE + 5 - 1)) + - sizeof(__le32) * (DEF_ADDRS_PER_INODE + 5 - 1)) struct f2fs_inode { __le16 i_mode; /* file mode */ -- cgit v1.2.3-70-g09d2 From a8865372a8414298982e07f4ac8d6dc0ab1e0a3d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Dec 2013 17:04:17 +0900 Subject: f2fs: handle errors correctly during f2fs_reserve_block The get_dnode_of_data nullifies inode and node page when an error occurs. There are two cases that pass an inode page into get_dnode_of_data(). 1. make_empty_dir() -> get_new_data_page() -> f2fs_reserve_block(ipage) -> get_dnode_of_data() 2. f2fs_convert_inline_data() -> __f2fs_convert_inline_data() -> f2fs_reserve_block(ipage) -> get_dnode_of_data() This patch adds correct error handling codes when get_dnode_of_data() returns an error. 
At first, f2fs_reserve_block() calls f2fs_put_dnode() whenever reserve_new_block returns an error. So, the rule of f2fs_reserve_block() is to nullify inode page when there is any error internally. Finally, two callers of f2fs_reserve_block() should call f2fs_put_dnode() appropriately if they got an error since successful f2fs_reserve_block(). Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 29 ++++++++++++++++++++--------- fs/f2fs/dir.c | 7 ++++--- fs/f2fs/inline.c | 6 ++---- 3 files changed, 26 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fc7a28c5ad2..63d190264a3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -258,13 +258,16 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) bool need_put = dn->inode_page ? false : true; int err; + /* if inode_page exists, index should be zero */ + f2fs_bug_on(!need_put && index); + err = get_dnode_of_data(dn, index, ALLOC_NODE); if (err) return err; + if (dn->data_blkaddr == NULL_ADDR) err = reserve_new_block(dn); - - if (need_put) + if (err || need_put) f2fs_put_dnode(dn); return err; } @@ -510,10 +513,10 @@ repeat: * * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). - * Note that, npage is set only by make_empty_dir. + * Note that, ipage is set only by make_empty_dir. */ struct page *get_new_data_page(struct inode *inode, - struct page *npage, pgoff_t index, bool new_i_size) + struct page *ipage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -521,14 +524,16 @@ struct page *get_new_data_page(struct inode *inode, struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, npage, npage, 0); + set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); if (err) return ERR_PTR(err); repeat: page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + if (!page) { + err = -ENOMEM; + goto put_err; + } if (PageUptodate(page)) return page; @@ -540,11 +545,13 @@ repeat: err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) - return ERR_PTR(err); + goto put_err; + lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + err = -EIO; + goto put_err; } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); @@ -560,6 +567,10 @@ repeat: mark_inode_dirty_sync(inode); } return page; + +put_err: + f2fs_put_dnode(&dn); + return ERR_PTR(err); } static int __allocate_data_block(struct dnode_of_data *dn) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 6da77e5c753..f815ca0c581 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -342,11 +342,11 @@ static struct page *init_inode_metadata(struct inode *inode, err = f2fs_init_acl(inode, dir, page); if (err) - goto error; + goto put_error; err = f2fs_init_security(inode, dir, name, page); if (err) - goto error; + goto put_error; wait_on_page_writeback(page); } else { @@ -370,8 +370,9 @@ static struct page *init_inode_metadata(struct inode *inode, } return page; -error: +put_error: f2fs_put_page(page, 1); +error: remove_inode_page(inode); return ERR_PTR(err); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2a756e57aed..688305afbc7 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -81,10 +81,9 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) * i_addr[0] is not used for inline data, * so reserving new block will not destroy inline data */ - set_new_dnode(&dn, 
inode, ipage, ipage, 0); + set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, 0); if (err) { - f2fs_put_page(ipage, 1); f2fs_unlock_op(sbi); return err; } @@ -111,9 +110,8 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) stat_dec_inline_inode(inode); sync_inode_page(&dn); - f2fs_put_page(ipage, 1); + f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - return err; } -- cgit v1.2.3-70-g09d2 From a225dca39495f7c18f1210306281c9c41a3454f6 Mon Sep 17 00:00:00 2001 From: "shifei10.ge" Date: Tue, 29 Oct 2013 15:32:34 +0800 Subject: f2fs: fix truncate_partial_nodes bug The truncate_partial_nodes puts pages incorrectly in the following two cases. Note that the value of the argument 'depth' can only be 2 or 3. Please see truncate_inode_blocks() and truncate_partial_nodes(). 1) An error occurs in the first 'for' loop. When an error occurs with depth = 2, pages[0] is invalid, so this page doesn't need to be put. There is no problem, however, when depth is 3, it doesn't put the pages correctly where pages[0] is valid and pages[1] is invalid. In this case, depth is set to 2 (ref to the statement depth = i + 1), and then 'goto fail'. In label 'fail', for (i = depth - 3; i >= 0; i--) cannot meet the condition because i = -1, so pages[0] can't be put. 2) An error happens in the second 'for' loop. Now we've got pages[0] with depth = 2, or we've got pages[0] and pages[1] with depth = 3. When an error is detected, we need 'goto fail' to put such pages. When depth is 2, in label 'fail', for (i = depth - 3; i >= 0; i--) can't meet the condition because i = -1, so pages[0] can't be put. When depth is 3, in label 'fail', for (i = depth - 3; i >= 0; i--) can only put pages[0], and pages[1] can't be put either. Note that 'depth' has been changed before the first 'goto fail' (ref to the statement depth = i + 1), so passing this modified 'depth' to the tracepoint, trace_f2fs_truncate_partial_nodes, is also incorrect.
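For illustration only, a small userspace sketch of the loop-bound arithmetic described above (not kernel code; the pages[] name is only mirrored for readability). With depth = 2 the old cleanup loop starts at i = -1 and never puts pages[0], while an idx-based loop does:

	#include <stdio.h>

	int main(void)
	{
		int depth = 2;		/* truncate_partial_nodes only sees 2 or 3 */
		int idx = depth - 2;	/* index of the last page that may be valid */
		int i;

		/* old cleanup: for (i = depth - 3; i >= 0; i--) */
		for (i = depth - 3; i >= 0; i--)
			printf("old loop puts pages[%d]\n", i);	/* never runs: i starts at -1 */

		/* fixed cleanup: for (i = idx; i >= 0; i--) */
		for (i = idx; i >= 0; i--)
			printf("new loop puts pages[%d]\n", i);	/* puts pages[0] */

		return 0;
	}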
Signed-off-by: Shifei Ge [Jaegeuk Kim: modify the description and fix one bug] Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9405a17a671..0230326be49 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -633,19 +633,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, return 0; /* get indirect nodes in the path */ - for (i = 0; i < depth - 1; i++) { + for (i = 0; i < idx + 1; i++) { /* refernece count'll be increased */ pages[i] = get_node_page(sbi, nid[i]); if (IS_ERR(pages[i])) { - depth = i + 1; err = PTR_ERR(pages[i]); + idx = i - 1; goto fail; } nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } /* free direct nodes linked to a partial indirect node */ - for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); if (!child_nid) continue; @@ -656,7 +656,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, set_nid(pages[idx], i, 0, false); } - if (offset[depth - 1] == 0) { + if (offset[idx + 1] == 0) { dn->node_page = pages[idx]; dn->nid = nid[idx]; truncate_node(dn); @@ -664,9 +664,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, f2fs_put_page(pages[idx], 1); } offset[idx]++; - offset[depth - 1] = 0; + offset[idx + 1] = 0; + idx--; fail: - for (i = depth - 3; i >= 0; i--) + for (i = idx; i >= 0; i--) f2fs_put_page(pages[i], 1); trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); -- cgit v1.2.3-70-g09d2 From 18309aaa41909cfddb93e932b16a7d14ec425c9b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Dec 2013 09:29:06 +0800 Subject: f2fs: avoid to left uninitialized data in page when read inline data Change log from v1: o reduce unneeded memset in __f2fs_convert_inline_data >From 58796be2bd2becbe8d52305210fb2a64e7dd80b6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Dec 2013 09:21:33 +0800 Subject: [PATCH] f2fs: avoid to left uninitialized data in page when read inline data We left uninitialized data in the tail of page when we read an inline data page. So let's initialize left part of the page excluding inline data region. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 688305afbc7..89f0c18cd73 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -43,8 +43,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) if (IS_ERR(ipage)) return PTR_ERR(ipage); - zero_user_segment(page, INLINE_DATA_OFFSET, - INLINE_DATA_OFFSET + MAX_INLINE_DATA); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); /* Copy the whole inline data block */ src_addr = inline_data_addr(ipage); @@ -88,7 +87,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) return err; } - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); /* Copy the whole inline data block */ src_addr = inline_data_addr(ipage); -- cgit v1.2.3-70-g09d2 From 04a17fb17fafada39f96bfb41ceb2dc1c11b2af6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Dec 2013 18:36:23 +0800 Subject: f2fs: avoid to read inline data except first page Here is a case which could read inline page data not from first page. 1. write inline data 2. lseek to offset 4096 3. 
read 4096 bytes from offset 4096 (read_inline_data reads the inline data into a non-first page, and the VFS has previously added this page to the page cache) 4. ftruncate offset 8192 5. read 4096 bytes from offset 4096 (we meet this updated page with inline data in the cache) So we should leave this page with initialized data and the uptodate flag set for this case. Change log from v1: o fix a deadlock bug Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 89f0c18cd73..e0d800a1d79 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -39,6 +39,11 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) struct page *ipage; void *src_addr, *dst_addr; + if (page->index) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + goto out; + } + ipage = get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -52,6 +57,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) kunmap(page); f2fs_put_page(ipage, 1); +out: SetPageUptodate(page); unlock_page(page); -- cgit v1.2.3-70-g09d2 From 3c1c0ae1db74b1f3e606f42158b5dadd89105c1f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 6 Jan 2014 11:28:41 +0000 Subject: GFS2: Add directory addition info structure The intent is that this structure will hold the information required when adding entries to a directory (linking). To start with, it will contain only the number of blocks which are required to link the new entry into the directory. The current calculation returns either 0 or the maximum number of blocks that can ever be requested by such a transaction. The intent is that in a later patch, we can update the dir code to calculate this value more accurately. Further patches will also add more fields to the new structure to increase its utility. In addition, this patch fixes a bug where the link used during inode creation was requesting too many blocks in some cases. This is harmless unless the fs is close to being full.
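For readability, a condensed sketch of the caller pattern this series introduces (taken from the gfs2_link()/gfs2_create_inode() hunks further below, error handling trimmed); the allocation decision now travels in the structure rather than in a separate alloc_required flag:

	struct gfs2_diradd da = { .nr_blocks = 0, };
	int error;

	error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
	if (error < 0)
		goto out_gunlock;

	if (da.nr_blocks) {
		/* check quota and reserve da.nr_blocks before starting the transaction */
	}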
Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 12 +++++++++--- fs/gfs2/dir.h | 7 ++++++- fs/gfs2/inode.c | 57 ++++++++++++++++++++++++++++----------------------------- 3 files changed, 43 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2e5fc268d32..0b6be202a82 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -2017,18 +2017,24 @@ out: * gfs2_diradd_alloc_required - find if adding entry will require an allocation * @ip: the file being written to * @filname: the filename that's going to be added + * @da: The structure to return dir alloc info * - * Returns: 1 if alloc required, 0 if not, -ve on error + * Returns: 0 if ok, -ve on error */ -int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, + struct gfs2_diradd *da) { + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_dirent *dent; struct buffer_head *bh; + da->nr_blocks = 0; + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); if (!dent) { - return 1; + da->nr_blocks = sdp->sd_max_dirres; + return 0; } if (IS_ERR(dent)) return PTR_ERR(dent); diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 4f03bbd1873..c5573e703a7 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -17,6 +17,10 @@ struct inode; struct gfs2_inode; struct gfs2_inum; +struct gfs2_diradd { + unsigned nr_blocks; +}; + extern struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename, bool fail_on_exist); @@ -33,7 +37,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); extern int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename); + const struct qstr *filename, + struct gfs2_diradd *da); extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp); extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7119504159f..9ac8f13a8c3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -470,13 +470,13 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, } static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip, int arq) + struct gfs2_inode *ip, struct gfs2_diradd *da) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + struct gfs2_alloc_parms ap = { .target = da->nr_blocks, }; int error; - if (arq) { + if (da->nr_blocks) { error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; @@ -485,8 +485,8 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - dip->i_rgd->rd_length + + error = gfs2_trans_begin(sdp, da->nr_blocks + + gfs2_rg_blocks(dip, da->nr_blocks) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -560,7 +560,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct dentry *d; int error; u32 aflags = 0; - int arq; + struct gfs2_diradd da; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -602,7 +602,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock; } - arq = error = gfs2_diradd_alloc_required(dir, name); + error = gfs2_diradd_alloc_required(dir, name, &da); if (error < 0) goto fail_gunlock; @@ -690,7 +690,7 @@ static int 
gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - error = link_dinode(dip, name, ip, arq); + error = link_dinode(dip, name, ip, &da); if (error) goto fail_gunlock3; @@ -817,7 +817,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; - int alloc_required; + struct gfs2_diradd da; int error; if (S_ISDIR(inode->i_mode)) @@ -872,13 +872,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (ip->i_inode.i_nlink == (u32)-1) goto out_gunlock; - alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da); if (error < 0) goto out_gunlock; - error = 0; - if (alloc_required) { - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(dip); if (error) goto out_gunlock; @@ -887,8 +886,8 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(dip, sdp->sd_max_dirres) + + error = gfs2_trans_begin(sdp, da.nr_blocks + + gfs2_rg_blocks(dip, da.nr_blocks) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -919,10 +918,10 @@ out_brelse: out_end_trans: gfs2_trans_end(sdp); out_ipres: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(dip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(dip); out_gunlock: gfs2_glock_dq(ghs + 1); @@ -1254,7 +1253,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - int alloc_required = 0; + struct gfs2_diradd da = { .nr_blocks = 0, }; unsigned int x; int error; @@ -1388,14 +1387,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; } - if (nip == NULL) - alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); - error = alloc_required; - if (error < 0) - goto out_gunlock; + if (nip == NULL) { + error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da); + if (error) + goto out_gunlock; + } - if (alloc_required) { - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(ndip); if (error) goto out_gunlock; @@ -1404,8 +1403,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + + error = gfs2_trans_begin(sdp, da.nr_blocks + + gfs2_rg_blocks(ndip, da.nr_blocks) + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) @@ -1448,10 +1447,10 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, out_end_trans: gfs2_trans_end(sdp); out_ipreserv: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(ndip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(ndip); out_gunlock: while (x--) { -- cgit v1.2.3-70-g09d2 From 534cf9ca553953e4c12fa5f0d23e543f9a6ccbaf Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 6 Jan 2014 12:03:05 +0000 Subject: GFS2: Consolidate transaction blocks calculation for dir add There are three cases where we need to calculate the number of blocks to reserve 
in a transaction involving linking an inode into a directory. The one in rename is a bit more complicated, but the basis of it is the same as for link and create. So it makes sense to move this calculation into a single function rather than repeating it three times. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9ac8f13a8c3..fa4624feef0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -469,6 +469,28 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, brelse(dibh); } +/** + * gfs2_trans_da_blocks - Calculate number of blocks to link inode + * @dip: The directory we are linking into + * @da: The dir add information + * @nr_inodes: The number of inodes involved + * + * This calculate the number of blocks we need to reserve in a + * transaction to link @nr_inodes into a directory. In most cases + * @nr_inodes will be 2 (the directory plus the inode being linked in) + * but in case of rename, 4 may be required. + * + * Returns: Number of blocks + */ + +static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip, + const struct gfs2_diradd *da, + unsigned nr_inodes) +{ + return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) + + (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS; +} + static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip, struct gfs2_diradd *da) { @@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - error = gfs2_trans_begin(sdp, da->nr_blocks + - gfs2_rg_blocks(dip, da->nr_blocks) + - 2 * RES_DINODE + - RES_STATFS + RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0); if (error) goto fail_ipreserv; } else { @@ -886,10 +905,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, da.nr_blocks + - gfs2_rg_blocks(dip, da.nr_blocks) + - 2 * RES_DINODE + RES_STATFS + - RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0); if (error) goto out_ipres; } else { @@ -1403,10 +1419,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, da.nr_blocks + - gfs2_rg_blocks(ndip, da.nr_blocks) + - 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA + 4, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) + + 4 * RES_LEAF + 4, 0); if (error) goto out_ipreserv; } else { -- cgit v1.2.3-70-g09d2 From 2b47dad866d04f14c328f888ba5406057b8c7d33 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 6 Jan 2014 12:49:43 +0000 Subject: GFS2: Remember directory insert point When we look to see if there is enough space to add a dir entry without allocation, we have then been repeating the same search later when we do the actual insertion. This patch caches the details of the location in the gfs2_diradd structure, so that we do not have to repeat the search. This will provide a performance improvement which will be greater as the size of the directory increases. 
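A condensed view of the reuse described above, lifted from the gfs2_dir_add() hunk below and simplified: when the earlier free-space search already found a slot, the cached dent/bh pair is used and the search is skipped; callers that bail out before the insert drop the cached buffer via gfs2_dir_no_add().

	struct buffer_head *bh = da->bh;
	struct gfs2_dirent *dent = da->dent;

	if (da->bh == NULL)	/* nothing cached: repeat the search */
		dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);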
Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 33 +++++++++++++++++++++++---------- fs/gfs2/dir.h | 12 +++++++++++- fs/gfs2/inode.c | 13 ++++++++----- 3 files changed, 42 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 0b6be202a82..d5988aafaa7 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1659,26 +1659,34 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) /** * gfs2_dir_add - Add new filename into directory - * @dip: The GFS2 inode - * @filename: The new name - * @inode: The inode number of the entry - * @type: The type of the entry + * @inode: The directory inode + * @name: The new name + * @nip: The GFS2 inode to be linked in to the directory + * @da: The directory addition info + * + * If the call to gfs2_diradd_alloc_required resulted in there being + * no need to allocate any new directory blocks, then it will contain + * a pointer to the directory entry and the bh in which it resides. We + * can use that without having to repeat the search. If there was no + * free space, then we must now create more space. * * Returns: 0 on success, error code on failure */ int gfs2_dir_add(struct inode *inode, const struct qstr *name, - const struct gfs2_inode *nip) + const struct gfs2_inode *nip, struct gfs2_diradd *da) { struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - struct gfs2_dirent *dent; + struct buffer_head *bh = da->bh; + struct gfs2_dirent *dent = da->dent; struct gfs2_leaf *leaf; int error; while(1) { - dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, - &bh); + if (da->bh == NULL) { + dent = gfs2_dirent_search(inode, name, + gfs2_dirent_find_space, &bh); + } if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); @@ -1689,6 +1697,8 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); } + da->dent = NULL; + da->bh = NULL; brelse(bh); ip->i_entries++; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; @@ -2030,6 +2040,8 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, struct buffer_head *bh; da->nr_blocks = 0; + da->bh = NULL; + da->dent = NULL; dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); if (!dent) { @@ -2038,7 +2050,8 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, } if (IS_ERR(dent)) return PTR_ERR(dent); - brelse(bh); + da->bh = bh; + da->dent = dent; return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index c5573e703a7..126c65dda02 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -16,9 +16,13 @@ struct inode; struct gfs2_inode; struct gfs2_inum; +struct buffer_head; +struct gfs2_dirent; struct gfs2_diradd { unsigned nr_blocks; + struct gfs2_dirent *dent; + struct buffer_head *bh; }; extern struct inode *gfs2_dir_search(struct inode *dir, @@ -27,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir, extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip); + const struct gfs2_inode *ip, struct gfs2_diradd *da); +static inline void gfs2_dir_no_add(struct gfs2_diradd *da) +{ + if (da->bh) + brelse(da->bh); + da->bh = NULL; +} extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, struct file_ra_state *f_ra); diff --git a/fs/gfs2/inode.c 
b/fs/gfs2/inode.c index fa4624feef0..4acc584038e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -516,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; } - error = gfs2_dir_add(&dip->i_inode, name, ip); + error = gfs2_dir_add(&dip->i_inode, name, ip, da); if (error) goto fail_end_trans; @@ -579,7 +579,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct dentry *d; int error; u32 aflags = 0; - struct gfs2_diradd da; + struct gfs2_diradd da = { .bh = NULL, }; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -738,6 +738,7 @@ fail_free_inode: free_inode_nonrcu(inode); inode = NULL; fail_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { clear_nlink(inode); @@ -836,7 +837,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; - struct gfs2_diradd da; + struct gfs2_diradd da = { .bh = NULL, }; int error; if (S_ISDIR(inode->i_mode)) @@ -918,7 +919,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_end_trans; - error = gfs2_dir_add(dir, &dentry->d_name, ip); + error = gfs2_dir_add(dir, &dentry->d_name, ip, &da); if (error) goto out_brelse; @@ -940,6 +941,7 @@ out_gunlock_q: if (da.nr_blocks) gfs2_quota_unlock(dip); out_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq(ghs + 1); out_child: gfs2_glock_dq(ghs); @@ -1454,7 +1456,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_add(ndir, &ndentry->d_name, ip); + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da); if (error) goto out_end_trans; @@ -1467,6 +1469,7 @@ out_gunlock_q: if (da.nr_blocks) gfs2_quota_unlock(ndip); out_gunlock: + gfs2_dir_no_add(&da); while (x--) { gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); -- cgit v1.2.3-70-g09d2 From ff88825fbb9f5a503164bb5ad4a8c65dabfa13e0 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 6 Jan 2014 11:28:41 +0800 Subject: NFSD: fix compile warning without CONFIG_NFSD_V3 Without CONFIG_NFSD_V3, compile will get warning as, fs/nfsd/nfssvc.c: In function 'nfsd_svc': >> fs/nfsd/nfssvc.c:246:60: warning: array subscript is above array bounds [-Warray-bounds] return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL); ^ Reported-by: kbuild test robot Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 55b5b57b571..9a4a5f9e746 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -243,7 +243,11 @@ static void nfsd_shutdown_generic(void) static bool nfsd_needs_lockd(void) { +#if defined(CONFIG_NFSD_V3) return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL); +#else + return (nfsd_versions[2] != NULL); +#endif } static int nfsd_startup_net(int nrservs, struct net *net) -- cgit v1.2.3-70-g09d2 From 3ff69309fed8ac3755864addfa064b51abfcde06 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Thu, 5 Dec 2013 10:41:40 +0800 Subject: Define op_iattr for nfsd4_open instead using macro Signed-off-by: Kinglong Mee Signed-off-by: J. 
Bruce Fields --- fs/nfsd/xdr4.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 916a4073731..d278a0d0349 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -228,7 +228,7 @@ struct nfsd4_open { u32 op_create; /* request */ u32 op_createmode; /* request */ u32 op_bmval[3]; /* request */ - struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ nfs4_verifier op_verf __attribute__((aligned(32))); /* EXCLUSIVE4 */ clientid_t op_clientid; /* request */ @@ -250,7 +250,6 @@ struct nfsd4_open { struct nfs4_acl *op_acl; struct xdr_netobj op_label; }; -#define op_iattr iattr struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; -- cgit v1.2.3-70-g09d2 From 73ca65904c5abaa29b8d9699089292239564300f Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Thu, 5 Dec 2013 11:07:20 +0800 Subject: nfsd: get rid of unused function definition Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a4be2e38967..fd8c0cc9c25 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -89,8 +89,6 @@ __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, __be32 nfsd_rename(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *, char *, int); -__be32 nfsd_remove(struct svc_rqst *, - struct svc_fh *, char *, int); __be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, char *name, int len); __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, -- cgit v1.2.3-70-g09d2 From 9cb00419faa7dd81e921328a71931d2b95ed5876 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 6 Jan 2014 14:01:23 -0500 Subject: ext4: enable punch hole for bigalloc After applied this commit (d23142c6), ext4 has supported punch hole for a file system with bigalloc feature. But we forgot to enable it. This commit fixes it. Cc: Lukas Czerner Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61d49ff22c8..90a4ea781cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3501,11 +3501,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (EXT4_SB(sb)->s_cluster_ratio > 1) { - /* TODO: Add support for bigalloc file systems */ - return -EOPNOTSUPP; - } - trace_ext4_punch_hole(inode, offset, length); /* -- cgit v1.2.3-70-g09d2 From bc0ca9df3b2abb13f7da9d8d255ec60718badd84 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 6 Jan 2014 14:02:23 -0500 Subject: ext4: retry allocation when inline->extent conversion failed Similarly as other ->write_begin functions in ext4, also ext4_da_write_inline_data_begin() should retry allocation if the conversion failed because of ENOSPC. This avoids returning ENOSPC prematurely because of uncommitted block deletions. 
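The retry being added follows the usual ext4 write_begin shape. A condensed sketch of that shape (error paths trimmed; convert_inline_data() is a hypothetical stand-in name, since the hunk below only shows the call's trailing arguments):

	handle_t *handle;
	int ret, retries = 0;

retry_journal:
	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ret = convert_inline_data(mapping, inode, flags, fsdata);	/* stand-in for the conversion call */
	ext4_journal_stop(handle);
	if (ret == -ENOSPC &&
	    ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry_journal;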
Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bae987549dc..ed6e71fe5e9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -849,11 +849,13 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; + int retries; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -875,6 +877,11 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, inode, flags, fsdata); + ext4_journal_stop(handle); + handle = NULL; + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; goto out; } -- cgit v1.2.3-70-g09d2 From 52e4477758eef45c2fa28b087abf83847126bc28 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 6 Jan 2014 14:03:23 -0500 Subject: ext4: standardize error handling in ext4_da_write_inline_data_begin() The function has a bit non-standard (for ext4) error recovery in that it used a mix of 'out' labels and testing for 'handle' being NULL. There isn't a good reason for that in the function so clean it up a bit. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index ed6e71fe5e9..c417e52d194 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -859,7 +859,6 @@ retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; goto out; } @@ -869,7 +868,7 @@ retry_journal: if (inline_size >= pos + len) { ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_journal; } if (ret == -ENOSPC) { @@ -878,7 +877,6 @@ retry_journal: flags, fsdata); ext4_journal_stop(handle); - handle = NULL; if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; @@ -894,7 +892,7 @@ retry_journal: page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; - goto out; + goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); @@ -911,16 +909,15 @@ retry_journal: up_read(&EXT4_I(inode)->xattr_sem); *pagep = page; - handle = NULL; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); page_cache_release(page); +out_journal: + ext4_journal_stop(handle); out: - if (handle) - ext4_journal_stop(handle); brelse(iloc.bh); return ret; } -- cgit v1.2.3-70-g09d2 From a34e15cc35c743f0e49125a4fbd22a7c55b686d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 6 Jan 2014 14:04:23 -0500 Subject: ext4: use %pd printk specificer Use the new %pd printk() specifier in Ext4 to replace passing of dentry name or dentry name and name length * 2 with just passing the dentry. 
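A generic illustration of the specifier change (not taken from the patch); assuming a valid struct dentry *dentry, the two calls below print the same name:

	/* old style: pass length and name separately */
	printk(KERN_ERR "'%.*s' linked to parent dir\n",
	       dentry->d_name.len, dentry->d_name.name);

	/* new style: %pd lets vsprintf dereference the dentry itself */
	printk(KERN_ERR "'%pd' linked to parent dir\n", dentry);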
Signed-off-by: David Howells Signed-off-by: "Theodore Ts'o" cc: Andreas Dilger cc: linux-ext4@vger.kernel.org --- fs/ext4/namei.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5a0408d7b11..3e9e7066323 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1425,9 +1425,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-EIO); } if (unlikely(ino == dir->i_ino)) { - EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", - dentry->d_name.len, - dentry->d_name.name); + EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", + dentry); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); -- cgit v1.2.3-70-g09d2 From 9e740568bc65a82845330b9439514255140b94aa Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 6 Jan 2014 14:05:23 -0500 Subject: ext4: fix a typo in extents.c Signed-off-by: Yongqiang Yang Reviewed-by: Carlos Maiolino --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3384dc4bed4..10cff4736b1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3477,7 +3477,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; -- cgit v1.2.3-70-g09d2 From 65eddb56f465f314502679ceade6fc5848a53a50 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 6 Jan 2014 14:06:18 -0500 Subject: ext4: ext4_inode_is_fast_symlink should use EXT4_CLUSTER_SIZE Can be reproduced by xfstests 62 with bigalloc and 128bit size inode. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" Reviewed-by: Carlos Maiolino --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 90a4ea781cd..8454ebe238a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -144,8 +144,8 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ static int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } -- cgit v1.2.3-70-g09d2 From 85dd0707f0cad26d60f2dc574d17a5ab948d10f7 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 1 Jan 2014 19:28:03 +0800 Subject: xfs: fix off-by-one error in xfs_attr3_rmt_verify With CRC check is enabled, if trying to set an attributes value just equal to the maximum size of XATTR_SIZE_MAX would cause the v3 remote attr write verification procedure failure, which would yield the back trace like below: XFS (sda7): Internal error xfs_attr3_rmt_write_verify at line 191 of file fs/xfs/xfs_attr_remote.c Call Trace: [] dump_stack+0x45/0x56 [] xfs_error_report+0x3b/0x40 [xfs] [] ? _xfs_buf_ioapply+0x6d/0x390 [xfs] [] xfs_corruption_error+0x55/0x80 [xfs] [] xfs_attr3_rmt_write_verify+0x14b/0x1a0 [xfs] [] ? _xfs_buf_ioapply+0x6d/0x390 [xfs] [] ? xfs_bdstrat_cb+0x55/0xb0 [xfs] [] _xfs_buf_ioapply+0x6d/0x390 [xfs] [] ? vm_map_ram+0x31a/0x460 [] ? wake_up_state+0x20/0x20 [] ? 
xfs_bdstrat_cb+0x55/0xb0 [xfs] [] xfs_buf_iorequest+0x6b/0xc0 [xfs] [] xfs_bdstrat_cb+0x55/0xb0 [xfs] [] xfs_bwrite+0x46/0x80 [xfs] [] xfs_attr_rmtval_set+0x334/0x490 [xfs] [] xfs_attr_leaf_addname+0x24a/0x410 [xfs] [] xfs_attr_set_int+0x223/0x470 [xfs] [] xfs_attr_set+0x96/0xb0 [xfs] [] xfs_xattr_set+0x42/0x70 [xfs] [] generic_setxattr+0x62/0x80 [] __vfs_setxattr_noperm+0x63/0x1b0 [] ? evm_inode_setxattr+0xe/0x10 [] vfs_setxattr+0xb5/0xc0 [] setxattr+0x12e/0x1c0 [] ? final_putname+0x22/0x50 [] ? putname+0x2b/0x40 [] ? user_path_at_empty+0x5f/0x90 [] ? __sb_start_write+0x49/0xe0 [] ? vm_mmap_pgoff+0x99/0xc0 [] SyS_setxattr+0x8f/0xe0 [] system_call_fastpath+0x1a/0x1f Tests: setfattr -n user.longxattr -v `perl -e 'print "A"x65536'` testfile This patch fix it to check the remote EA size is greater than the XATTR_SIZE_MAX rather than more than or equal to it, because it's valid if the specified EA value size is equal to the limitation as per VFS setxattr interface. Signed-off-by: Jie Liu Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr_remote.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 739e0a52ded..5549d69ddb4 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -110,7 +110,7 @@ xfs_attr3_rmt_verify( if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + - be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) return false; if (rmt->rm_owner == 0) return false; -- cgit v1.2.3-70-g09d2 From 60810e5489dffd0bd12e4f99fe9fc330c9a636e1 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Wed, 1 Jan 2014 00:35:47 +0800 Subject: NFSD: Fix a memory leak in nfsd4_create_session If failed after calling alloc_session but before init_session, nfsd will call __free_session to free se_slots in session. But, session->se_fchannel.maxreqs is not initialized (value is zero). So that, the memory malloced for slots will be lost in free_session_slots for maxreqs is zero. This path sets the information for channel in alloc_session after mallocing slots succeed, instead in init_session. Signed-off-by: Kinglong Mee Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index acb95026ae3..5795d5f58f4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -832,10 +832,11 @@ static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) spin_unlock(&nfsd_drc_lock); } -static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, + struct nfsd4_channel_attrs *battrs) { - int numslots = attrs->maxreqs; - int slotsize = slot_bytes(attrs); + int numslots = fattrs->maxreqs; + int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; int mem, i; @@ -852,6 +853,10 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) if (!new->se_slots[i]) goto out_free; } + + memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); + memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs)); + return new; out_free: while (i--) @@ -997,10 +1002,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); spin_unlock(&nn->client_lock); - memcpy(&new->se_fchannel, &cses->fore_channel, - sizeof(struct nfsd4_channel_attrs)); - memcpy(&new->se_bchannel, &cses->back_channel, - sizeof(struct nfsd4_channel_attrs)); + if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); /* @@ -1922,7 +1924,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (status) goto out_release_drc_mem; status = nfserr_jukebox; - new = alloc_session(&cr_ses->fore_channel); + new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) goto out_release_drc_mem; conn = alloc_conn_from_crses(rqstp, cr_ses); -- cgit v1.2.3-70-g09d2 From 62e96cf81988101fe9e086b2877307b6adda5197 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 6 Jan 2014 17:16:01 -0500 Subject: GFS2: Increase i_writecount during gfs2_setattr_chown This patch calls get_write_access in function gfs2_setattr_chown, which merely increases inode->i_writecount for the duration of the function. That will ensure that any file closes won't delete the inode's multi-block reservation while the function is running. It also ensures that a multi-block reservation exists when needed for quota change operations during the chown. 
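Condensed from the setattr_chown() hunk below (error paths trimmed): the write access reference is held across the whole chown, so a concurrent close cannot release the inode's multi-block reservation mid-operation.

	error = get_write_access(inode);
	if (error)
		return error;

	error = gfs2_rs_alloc(ip);	/* make sure a reservation exists */
	if (error)
		goto out;
	/* ... rindex update, quota lock/check, transaction, quota change ... */
out:
	put_write_access(inode);
	return error;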
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 4acc584038e..51cf10df83e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1623,10 +1623,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - error = gfs2_quota_lock(ip, nuid, ngid); + error = get_write_access(inode); if (error) return error; + error = gfs2_rs_alloc(ip); + if (error) + goto out; + + error = gfs2_rindex_update(sdp); + if (error) + goto out; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { error = gfs2_quota_check(ip, nuid, ngid); @@ -1653,6 +1665,8 @@ out_end_trans: gfs2_trans_end(sdp); out_gunlock_q: gfs2_quota_unlock(ip); +out: + put_write_access(inode); return error; } -- cgit v1.2.3-70-g09d2 From 8faaaead62c3c7394fa6302303ce70e484b509ba Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 7 Jan 2014 21:58:06 +0900 Subject: treewide: fix comments and printk msgs This patch fixed several typo in printk from various part of kernel source. Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- arch/powerpc/platforms/powernv/opal-flash.c | 4 ++-- drivers/i2c/busses/i2c-stu300.c | 2 +- drivers/misc/fsa9480.c | 2 +- drivers/misc/mei/nfc.c | 2 +- drivers/net/wan/lmc/lmc_main.c | 2 +- drivers/scsi/qla2xxx/qla_nx2.c | 2 +- drivers/scsi/qla4xxx/ql4_83xx.c | 2 +- drivers/tty/serial/crisv10.c | 2 +- drivers/usb/serial/kobil_sct.c | 2 +- fs/btrfs/tests/free-space-tests.c | 2 +- net/nfc/hci/llc_shdlc.c | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c index 6ffa6b1ec5b..d8773079ce1 100644 --- a/arch/powerpc/platforms/powernv/opal-flash.c +++ b/arch/powerpc/platforms/powernv/opal-flash.c @@ -126,7 +126,7 @@ struct opal_sg_list { struct validate_flash_t { int status; /* Return status */ - void *buf; /* Candiate image buffer */ + void *buf; /* Candidate image buffer */ uint32_t buf_size; /* Image size */ uint32_t result; /* Update results token */ }; @@ -500,7 +500,7 @@ static int alloc_image_buf(char *buffer, size_t count) memcpy(&image_header, (void *)buffer, sizeof(struct image_header_t)); image_data.size = be32_to_cpu(image_header.size); - pr_debug("FLASH: Candiate image size = %u\n", image_data.size); + pr_debug("FLASH: Candidate image size = %u\n", image_data.size); if (image_data.size > MAX_IMAGE_SIZE) { pr_warn("FLASH: Too large image\n"); diff --git a/drivers/i2c/busses/i2c-stu300.c b/drivers/i2c/busses/i2c-stu300.c index 04a17b9b38b..5b80ef31084 100644 --- a/drivers/i2c/busses/i2c-stu300.c +++ b/drivers/i2c/busses/i2c-stu300.c @@ -801,7 +801,7 @@ static int stu300_xfer_msg(struct i2c_adapter *adap, /* Check that the bus is free, or wait until some timeout occurs */ ret = stu300_wait_while_busy(dev); if (ret != 0) { - dev_err(&dev->pdev->dev, "timout waiting for transfer " + dev_err(&dev->pdev->dev, "timeout waiting for transfer " "to commence.\n"); goto exit_disable; } diff --git a/drivers/misc/fsa9480.c b/drivers/misc/fsa9480.c index a725c79c35f..71d2793b372 100644 --- a/drivers/misc/fsa9480.c +++ b/drivers/misc/fsa9480.c @@ -396,7 +396,7 @@ static int fsa9480_irq_init(struct fsa9480_usbsw *usbsw) 
IRQF_TRIGGER_FALLING | IRQF_ONESHOT, "fsa9480 micro USB", usbsw); if (ret) { - dev_err(&client->dev, "failed to reqeust IRQ\n"); + dev_err(&client->dev, "failed to request IRQ\n"); return ret; } diff --git a/drivers/misc/mei/nfc.c b/drivers/misc/mei/nfc.c index 994ca4aff1a..04087d1369d 100644 --- a/drivers/misc/mei/nfc.c +++ b/drivers/misc/mei/nfc.c @@ -428,7 +428,7 @@ static void mei_nfc_init(struct work_struct *work) mutex_unlock(&dev->device_lock); if (mei_nfc_if_version(ndev) < 0) { - dev_err(&dev->pdev->dev, "Could not get the NFC interfave version"); + dev_err(&dev->pdev->dev, "Could not get the NFC interface version"); goto err; } diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c index 7ef435bab42..f51204cfe12 100644 --- a/drivers/net/wan/lmc/lmc_main.c +++ b/drivers/net/wan/lmc/lmc_main.c @@ -2126,7 +2126,7 @@ bug_out: spin_unlock_irqrestore(&sc->lmc_lock, flags); - lmc_trace(dev, "lmc_driver_timout out"); + lmc_trace(dev, "lmc_driver_timeout out"); } diff --git a/drivers/scsi/qla2xxx/qla_nx2.c b/drivers/scsi/qla2xxx/qla_nx2.c index 8164cc9e728..4f5d66b2168 100644 --- a/drivers/scsi/qla2xxx/qla_nx2.c +++ b/drivers/scsi/qla2xxx/qla_nx2.c @@ -2253,7 +2253,7 @@ qla8044_minidump_process_rdmem(struct scsi_qla_host *vha, if (r_addr & 0xf) { ql_dbg(ql_dbg_p3p, vha, 0xb0f1, - "[%s]: Read addr 0x%x not 16 bytes alligned\n", + "[%s]: Read addr 0x%x not 16 bytes aligned\n", __func__, r_addr); return QLA_FUNCTION_FAILED; } diff --git a/drivers/scsi/qla4xxx/ql4_83xx.c b/drivers/scsi/qla4xxx/ql4_83xx.c index 8196c2f7915..919284834ad 100644 --- a/drivers/scsi/qla4xxx/ql4_83xx.c +++ b/drivers/scsi/qla4xxx/ql4_83xx.c @@ -465,7 +465,7 @@ int qla4_83xx_drv_lock(struct scsi_qla_host *ha) } /* Recovery Failed, some other function * has the lock, wait for 2secs and retry */ - ql4_printk(KERN_INFO, ha, "%s: IDC lock Recovery by %d failed, Retrying timout\n", + ql4_printk(KERN_INFO, ha, "%s: IDC lock Recovery by %d failed, Retrying timeout\n", __func__, ha->func_num); timeout = 0; } diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c index 477f22f773f..690bdea0a0c 100644 --- a/drivers/tty/serial/crisv10.c +++ b/drivers/tty/serial/crisv10.c @@ -2153,7 +2153,7 @@ static void flush_timeout_function(unsigned long data) fast_timers[info->line].function = NULL; serial_fast_timer_expired++; - TIMERD(DEBUG_LOG(info->line, "flush_timout %i ", info->line)); + TIMERD(DEBUG_LOG(info->line, "flush_timeout %i ", info->line)); TIMERD(DEBUG_LOG(info->line, "num expired: %i\n", serial_fast_timer_expired)); check_flush_timeout(info); } diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c index 78b48c31abf..4c94ffcb66c 100644 --- a/drivers/usb/serial/kobil_sct.c +++ b/drivers/usb/serial/kobil_sct.c @@ -194,7 +194,7 @@ static int kobil_open(struct tty_struct *tty, struct usb_serial_port *port) KOBIL_TIMEOUT ); dev_dbg(dev, "%s - Send get_HW_version URB returns: %i\n", __func__, result); - dev_dbg(dev, "Harware version: %i.%i.%i\n", transfer_buffer[0], + dev_dbg(dev, "Hardware version: %i.%i.%i\n", transfer_buffer[0], transfer_buffer[1], transfer_buffer[2]); /* get firmware version */ diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 790f118c327..c8d9ddf84c6 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) } if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { - test_msg("Left 
over peices after removing overlapping\n"); + test_msg("Left over pieces after removing overlapping\n"); return -1; } diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 27b313befc3..3e53c1e029d 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -300,7 +300,7 @@ static void llc_shdlc_rcv_rej(struct llc_shdlc *shdlc, int y_nr) { struct sk_buff *skb; - pr_debug("remote asks retransmition from frame %d\n", y_nr); + pr_debug("remote asks retransmission from frame %d\n", y_nr); if (llc_shdlc_x_lteq_y_lt_z(shdlc->dnr, y_nr, shdlc->ns)) { if (shdlc->t2_active) { -- cgit v1.2.3-70-g09d2 From 09c455aaa8f47a94d5bafaa23d58365768210507 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 7 Jan 2014 12:58:19 -0500 Subject: ext4: avoid clearing beyond i_blocks when truncating an inline data file A missing cast means that when we are truncating a file which is less than 60 bytes, we don't clear the correct area of memory, and in fact we can end up truncating the next inode in the inode table, or worse yet, some other kernel data structure. Addresses-Coverity-Id: #751987 Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/inline.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c417e52d194..ed29e720e88 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1928,9 +1928,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) } /* Clear the content within i_blocks. */ - if (i_size < EXT4_MIN_INLINE_DATA_SIZE) - memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, - EXT4_MIN_INLINE_DATA_SIZE - i_size); + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? -- cgit v1.2.3-70-g09d2 From 8c9367fd9bf252b57c6d4f8e1a7f9de809d8b862 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 7 Jan 2014 13:08:03 -0500 Subject: ext4: don't pass freed handle to ext4_walk_page_buffers This is harmless, since ext4_walk_page_buffers only passes the handle onto the callback function, and in this call site the function in question, bput_one(), doesn't actually use the handle. But there's no point passing in an invalid handle, and it creates a Coverity warning, so let's just clean it up. Addresses-Coverity-Id: #1091168 Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8454ebe238a..f33b4eb82d8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1772,7 +1772,7 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(handle, page_bufs, 0, len, + ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: -- cgit v1.2.3-70-g09d2 From 208d0acc49cbf22a71d32b40a69e199717a76687 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 5 Mar 2012 16:40:31 -0500 Subject: nfsd4: break only delegations when appropriate As a temporary fix, nfsd was breaking all leases on unlink, link, rename, and setattr. Now that we can distinguish between leases and delegations, we can be nicer and break only the delegations, and not bother lease-holders with operations they don't care about. And we get to delete some code while we're at it. 
Note that in the presence of delegations the vfs calls here all return -EWOULDBLOCK instead of blocking, so nfsd threads will not get stuck waiting for delegation returns. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c8aa6ff9925..e85b463fac4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -273,13 +273,6 @@ out: return err; } -static int nfsd_break_lease(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 0; - return break_lease(inode, O_WRONLY | O_NONBLOCK); -} - /* * Commit metadata changes to stable storage. */ @@ -448,16 +441,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, goto out_put_write_access; } - host_err = nfsd_break_lease(inode); - if (host_err) - goto out_put_write_access_nfserror; - fh_lock(fhp); host_err = notify_change(dentry, iap, NULL); fh_unlock(fhp); -out_put_write_access_nfserror: - err = nfserrno(host_err); out_put_write_access: if (size_change) put_write_access(inode); @@ -1759,11 +1746,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (!dold->d_inode) goto out_dput; - host_err = nfsd_break_lease(dold->d_inode); - if (host_err) { - err = nfserrno(host_err); - goto out_dput; - } host_err = vfs_link(dold, dirp, dnew, NULL); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); @@ -1857,14 +1839,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = nfsd_break_lease(odentry->d_inode); - if (host_err) - goto out_dput_new; - if (ndentry->d_inode) { - host_err = nfsd_break_lease(ndentry->d_inode); - if (host_err) - goto out_dput_new; - } host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); if (!host_err) { host_err = commit_metadata(tfhp); @@ -1934,16 +1908,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = nfsd_break_lease(rdentry->d_inode); - if (host_err) - goto out_put; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry, NULL); else host_err = vfs_rmdir(dirp, rdentry); if (!host_err) host_err = commit_metadata(fhp); -out_put: dput(rdentry); out_nfserr: -- cgit v1.2.3-70-g09d2 From 41ae6e714a6c25a9932d32a323e8c87f6bac4037 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Aug 2013 15:32:50 -0400 Subject: nfsd4: better VERIFY comment This confuses me every time. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 41e34dfd4e5..dadff09b0b0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1069,8 +1069,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_dentry, &p, count, verify->ve_bmval, rqstp, 0); - - /* this means that nfsd4_encode_fattr() ran out of space */ + /* + * If nfsd4_encode_fattr() ran out of space, assume that's because + * the attributes are longer (hence different) than those given: + */ if (status == nfserr_resource) status = nfserr_not_same; if (status) -- cgit v1.2.3-70-g09d2 From 6b6d8137f1d3fc7a3970e1e384b8ce2d0967e087 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 16 Jan 2013 17:11:11 -0500 Subject: nfsd4: nfsd4_encode_fattr cleanup Remove some pointless goto's. 
Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5bef9cb84b0..3bffba63e6a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2230,8 +2230,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((buflen -= 4) < 0) goto out_resource; dummy = nfs4_file_type(stat.mode); - if (dummy == NF4BAD) - goto out_serverfault; + if (dummy == NF4BAD) { + status = nfserr_serverfault; + goto out; + } WRITE32(dummy); } if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { @@ -2325,8 +2327,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, WRITE32(ace->flag); WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2387,8 +2387,6 @@ out_acl: } if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2439,15 +2437,11 @@ out_acl: } if (bmval1 & FATTR4_WORD1_OWNER) { status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2550,9 +2544,6 @@ out_nfserr: out_resource: status = nfserr_resource; goto out; -out_serverfault: - status = nfserr_serverfault; - goto out; } static inline int attributes_need_mount(u32 *bmval) -- cgit v1.2.3-70-g09d2 From 87915c6472acbc5d7c809f3c9753808797da51a8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 16 Jan 2013 17:33:28 -0500 Subject: nfsd4: encode_rdattr_error cleanup There's a simpler way to write this. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 3bffba63e6a..67b44963e8a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2620,17 +2620,14 @@ out_put: static __be32 * nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) { - __be32 *attrlenp; - if (buflen < 6) return NULL; *p++ = htonl(2); *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ *p++ = htonl(0); /* bmval1 */ - attrlenp = p++; + *p++ = htonl(4); /* attribute length */ *p++ = nfserr; /* no htonl */ - *attrlenp = htonl((char *)p - (char *)attrlenp - 4); return p; } -- cgit v1.2.3-70-g09d2 From fb5566da9181d33ecdd9892e44f90320e7d4cc9f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 8 Jan 2014 10:09:51 +0900 Subject: f2fs: improve write performance under frequent fsync calls When considering a bunch of data writes with very frequent fsync calls, we are able to think the following performance regression. N: Node IO, D: Data IO, IO scheduler: cfq Issue pending IOs D1 D2 D3 D4 D1 D2 D3 D4 N1 D2 D3 D4 N1 N2 N1 D3 D4 N2 D1 --> N1 can be selected by cfq becase of the same priority of N and D. Then D3 and D4 would be delayed, resuling in performance degradation. So, when processing the fsync call, it'd better give higher priority to data IOs than node IOs by assigning WRITE and WRITE_SYNC respectively. This patch improves the random wirte performance with frequent fsync calls by up to 10%. 
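Condensed from the hunks below: node writeback now derives its request type from the writeback mode, so fsync-driven node IO is issued as plain WRITE while the corresponding data IO keeps WRITE_SYNC and wins the IO scheduler's tie-break.

	struct f2fs_io_info fio = {
		.type = NODE,
		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
	};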
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/file.c | 2 +- fs/f2fs/node.c | 7 ++++++- fs/f2fs/segment.c | 8 ++------ 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 07a7ae0d441..b69f190fb19 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1118,8 +1118,8 @@ int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, - block_t, block_t *); +void write_node_page(struct f2fs_sb_info *, struct page *, + struct f2fs_io_info *, unsigned int, block_t, block_t *); void write_data_page(struct page *, struct dnode_of_data *, block_t *, struct f2fs_io_info *); void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c77ad4d8b56..14511b00fff 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -115,7 +115,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0; bool need_cp = false; struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0230326be49..b8c9301db52 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1194,6 +1194,10 @@ static int f2fs_write_node_page(struct page *page, nid_t nid; block_t new_addr; struct node_info ni; + struct f2fs_io_info fio = { + .type = NODE, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC: WRITE, + }; if (unlikely(sbi->por_doing)) goto redirty_out; @@ -1218,7 +1222,7 @@ static int f2fs_write_node_page(struct page *page, mutex_lock(&sbi->node_write); set_page_writeback(page); - write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); + write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr); dec_page_count(sbi, F2FS_DIRTY_NODES); mutex_unlock(&sbi->node_write); @@ -1253,6 +1257,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = 3 * max_hw_blocks(sbi); + wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - wbc->nr_to_write); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 555ae7693ea..5f84639354e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -924,16 +924,12 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, + struct f2fs_io_info *fio, unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) { struct f2fs_summary sum; - struct f2fs_io_info fio = { - .type = NODE, - .rw = WRITE_SYNC, - }; - set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, &fio); + do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio); } void write_data_page(struct page *page, struct dnode_of_data *dn, -- cgit v1.2.3-70-g09d2 From b1c57c1caa753cec299e62bb4272da0e85a01ef0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 8 Jan 2014 13:45:08 +0900 Subject: f2fs: add a sysfs entry to control max_victim_search Previously during SSR and GC, the maximum number of retrials to find a victim segment was hard-coded by MAX_VICTIM_SEARCH, 4096 by default. 
This number makes an effect on IO locality, when SSR mode is activated, which results in performance fluctuation on some low-end devices. If max_victim_search = 4, the victim will be searched like below. ("D" represents a dirty segment, and "*" indicates a selected victim segment.) D1 D2 D3 D4 D5 D6 D7 D8 D9 [ * ] [ * ] [ * ] [ ....] This patch adds a sysfs entry to control the number dynamically through: /sys/fs/f2fs/$dev/max_victim_search Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/gc.c | 4 ++-- fs/f2fs/gc.h | 2 +- fs/f2fs/super.c | 6 ++++++ 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b69f190fb19..5f7dc4f6eb0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -451,6 +451,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; + /* * for stat information. * one is for the LFS mode, and the other is for the SSR mode. diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 599f546d042..9117ccaf254 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -163,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > MAX_VICTIM_SEARCH) - p->max_search = MAX_VICTIM_SEARCH; + if (p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 507056d2220..5d5eb6047bf 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -20,7 +20,7 @@ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ /* Search max. number of dirty segments to select a victim segment */ -#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f16da92a789..b070f3010ce 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -74,6 +74,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ }; struct f2fs_attr { @@ -91,6 +92,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == F2FS_SBI) + return (unsigned char *)sbi; return NULL; } @@ -180,6 +183,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -191,6 +195,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_small_discards), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), + ATTR_LIST(max_victim_search), NULL, }; @@ -775,6 +780,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; for (i = 0; i < NR_COUNT_TYPE; i++) 
atomic_set(&sbi->nr_pages[i], 0); -- cgit v1.2.3-70-g09d2 From 22b5a6c0c0cd5eb524d31c949d113c6683e37ec9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 8 Jan 2014 11:05:29 +0000 Subject: GFS2: For exhash conversion, only one block is needed For most cases, only a single new block is needed when we reach the point of converting from stuffed to exhash directory. The exception being when the file name is so long that it will not fit within the new leaf block. So this patch adds a simple test for that situation so that we do not need to request the full reservation size in this case. Potentially we could calculate more accurately the value to use in other cases too, but that is much more complicated to do and it is doubtful that the benefit would outweigh the extra cost in code complexity. Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index d5988aafaa7..b2e5ebfb4bb 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -2035,7 +2035,9 @@ out: int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, struct gfs2_diradd *da) { + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf); struct gfs2_dirent *dent; struct buffer_head *bh; @@ -2046,6 +2048,9 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); if (!dent) { da->nr_blocks = sdp->sd_max_dirres; + if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && + (GFS2_DIRENT_SIZE(name->len) < extra)) + da->nr_blocks = 1; return 0; } if (IS_ERR(dent)) -- cgit v1.2.3-70-g09d2 From 01bcb0dedbcd77a5ebb4f2dfd7d3196e3ad5cea5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 8 Jan 2014 12:14:57 +0000 Subject: GFS2: Add hints to directory leaf blocks This patch adds four new fields to directory leaf blocks. The intent is not to use them in the kernel itself, although perhaps we may be able to use them as hints at some later date, but instead to provide more information for debug/fsck use. One new field adds a pointer to the inode to which the leaf belongs. This can be useful if the pointer to the leaf block has become corrupt, as it will allow us to know which inode this block should be associated with. This field is set when the leaf is created and never changed over its lifetime. The second field is a "distance from the hash table" field. The meaning is as follows: 0 = An old leaf in which this value has not been set 1 = This leaf is pointed to directly from the hash table 2+ = This leaf is part of a chain, pointed to by another leaf block, the value gives the position in the chain. The third and fourth fields combine to give a time stamp of the most recent directory insertion or deletion from this leaf block. The time stamp is not updated when a new leaf block is chained from the current one. The code is currently written such that the timestamp on the dir inode will match that of the leaf block for the most recent insertion/deletion. For backwards compatibility, any of these new fields which is zero should be considered to be "unknown". 
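Since the new hint fields are stored big-endian on disk and zero means "unknown", a future debug/fsck consumer could decode them roughly as follows. This is a hypothetical kernel-style sketch using the field names added by the patch below; the helper itself is invented and not part of the change:

/* Hypothetical helper: report the gfs2_leaf hint fields, treating zero as
 * "unknown" for leaves written before this change. */
static void gfs2_leaf_show_hints(const struct gfs2_leaf *lf)
{
	u64 dino = be64_to_cpu(lf->lf_inode);
	u32 dist = be32_to_cpu(lf->lf_dist);
	u64 sec = be64_to_cpu(lf->lf_sec);
	u32 nsec = be32_to_cpu(lf->lf_nsec);

	if (dino)
		pr_info("leaf owned by dinode %llu\n", (unsigned long long)dino);
	if (dist)
		pr_info("leaf is %u hop(s) from the hash table\n", dist);
	if (sec || nsec)
		pr_info("last insert/delete at %llu.%09u\n",
			(unsigned long long)sec, nsec);
}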
Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 40 +++++++++++++++++++++++++++++++++++++--- include/uapi/linux/gfs2_ondisk.h | 11 ++++++++++- 2 files changed, 47 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index b2e5ebfb4bb..fa32655449c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "" }; + struct timespec tv = CURRENT_TIME; error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) @@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, leaf->lf_entries = 0; leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); leaf->lf_next = 0; - memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + leaf->lf_inode = cpu_to_be64(ip->i_no_addr); + leaf->lf_dist = cpu_to_be32(1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2)); dent = (struct gfs2_dirent *)(leaf+1); gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); *pbh = bh; @@ -1612,11 +1617,31 @@ out: return ret; } +/** + * dir_new_leaf - Add a new leaf onto hash chain + * @inode: The directory + * @name: The name we are adding + * + * This adds a new dir leaf onto an existing leaf when there is not + * enough space to add a new dir entry. This is a last resort after + * we've expanded the hash table to max size and also split existing + * leaf blocks, so it will only occur for very large directories. + * + * The dist parameter is set to 1 for leaf blocks directly attached + * to the hash table, 2 for one layer of indirection, 3 for two layers + * etc. We are thus able to tell the difference between an old leaf + * with dist set to zero (i.e. "don't know") and a new one where we + * set this information for debug/fsck purposes. 
+ * + * Returns: 0 on success, or -ve on error + */ + static int dir_new_leaf(struct inode *inode, const struct qstr *name) { struct buffer_head *bh, *obh; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_leaf *leaf, *oleaf; + u32 dist = 1; int error; u32 index; u64 bn; @@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) if (error) return error; do { + dist++; oleaf = (struct gfs2_leaf *)obh->b_data; bn = be64_to_cpu(oleaf->lf_next); if (!bn) @@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) brelse(obh); return -ENOSPC; } + leaf->lf_dist = cpu_to_be32(dist); oleaf->lf_next = cpu_to_be64(bh->b_blocknr); brelse(bh); brelse(obh); @@ -1679,6 +1706,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *bh = da->bh; struct gfs2_dirent *dent = da->dent; + struct timespec tv; struct gfs2_leaf *leaf; int error; @@ -1693,15 +1721,18 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + tv = CURRENT_TIME; if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } da->dent = NULL; da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1752,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; + struct timespec tv = CURRENT_TIME; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1777,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!entries) gfs2_consist_inode(dip); leaf->lf_entries = cpu_to_be16(--entries); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } brelse(bh); if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h index b2de1f9a88d..0f24c07aed5 100644 --- a/include/uapi/linux/gfs2_ondisk.h +++ b/include/uapi/linux/gfs2_ondisk.h @@ -319,7 +319,16 @@ struct gfs2_leaf { __be32 lf_dirent_format; /* Format of the dirents */ __be64 lf_next; /* Next leaf, if overflow */ - __u8 lf_reserved[64]; + union { + __u8 lf_reserved[64]; + struct { + __be64 lf_inode; /* Dir inode number */ + __be32 lf_dist; /* Dist from inode on chain */ + __be32 lf_nsec; /* Last ins/del usecs */ + __be64 lf_sec; /* Last ins/del in secs */ + __u8 lf_reserved2[40]; + }; + }; }; /* -- cgit v1.2.3-70-g09d2 From 95d4403889acbd98e06d41a255df76452210996a Mon Sep 17 00:00:00 2001 From: Muthukumar Ratty Date: Wed, 8 Jan 2014 09:39:49 -0700 Subject: block: Warn and free bio if bi_end_io is not set In bio_endio if bio doesn't have bi_end_io (should be an error case), we set bio to NULL and continue silently without freeing the bio. 
It would be good to have a WARN and free the bio to avoid memory leak. Signed-off-by: Muthukumar Ratty Signed-off-by: Jens Axboe --- fs/bio.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 75c49a38223..9156bd1f151 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1770,6 +1770,15 @@ void bio_endio(struct bio *bio, int error) } else { if (bio->bi_end_io) bio->bi_end_io(bio, error); + else { + char dev_name[BDEVNAME_SIZE]; + + WARN(1, "bio_endio: bio for %s without endio\n", + bio->bi_bdev ? bdevname(bio->bi_bdev, + dev_name) : "(unknown)"); + bio_put(bio); + } + bio = NULL; } } -- cgit v1.2.3-70-g09d2 From 3554116d3aae25353713f3d0131d86ae6c1e5674 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 8 Jan 2014 09:49:01 -0500 Subject: nfsd4: simplify xdr encoding of nfsv4 names We can simplify the idmapping code if it does its own encoding and returns nfs errors. Signed-off-by: J. Bruce Fields --- fs/nfsd/acl.h | 2 +- fs/nfsd/idmap.h | 4 ++-- fs/nfsd/nfs4acl.c | 20 +++++++++++++------- fs/nfsd/nfs4idmap.c | 50 +++++++++++++++++++++++++++++++++----------------- fs/nfsd/nfs4xdr.c | 52 ++++++---------------------------------------------- 5 files changed, 55 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 8b186a4955c..afd3e0eae64 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -43,7 +43,7 @@ struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); -int nfs4_acl_write_who(int who, char *p); +__be32 nfs4_acl_write_who(int who, __be32 **p, int *len); #define NFS4_ACL_TYPE_DEFAULT 0x01 #define NFS4_ACL_DIR 0x02 diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index bf95f6b817a..66e58db0193 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net) __be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); __be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); -int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *); +__be32 nfsd4_encode_user(struct svc_rqst *, kuid_t, __be32 **, int *); +__be32 nfsd4_encode_group(struct svc_rqst *, kgid_t, __be32 **, int *); #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 8a50b3c1809..eea24c9a561 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -37,6 +37,7 @@ #include #include #include +#include "nfsd.h" #include "acl.h" @@ -848,18 +849,23 @@ nfs4_acl_get_whotype(char *p, u32 len) return NFS4_ACL_WHO_NAMED; } -int -nfs4_acl_write_who(int who, char *p) +__be32 nfs4_acl_write_who(int who, __be32 **p, int *len) { int i; + int bytes; for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { - if (s2t_map[i].type == who) { - memcpy(p, s2t_map[i].string, s2t_map[i].stringlen); - return s2t_map[i].stringlen; - } + if (s2t_map[i].type != who) + continue; + bytes = 4 + (XDR_QUADLEN(s2t_map[i].stringlen) << 2); + if (bytes > *len) + return nfserr_resource; + *p = xdr_encode_opaque(*p, s2t_map[i].string, + s2t_map[i].stringlen); + *len -= bytes; + return 0; } - BUG(); + WARN_ON_ONCE(1); return -1; } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 4832fd819f8..c0dfde68742 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -551,27 +551,46 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return 0; } -static int -idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) +static 
__be32 encode_ascii_id(u32 id, __be32 **p, int *buflen) +{ + char buf[11]; + int len; + int bytes; + + len = sprintf(buf, "%u", id); + bytes = 4 + (XDR_QUADLEN(len) << 2); + if (bytes > *buflen) + return nfserr_resource; + *p = xdr_encode_opaque(*p, buf, len); + *buflen -= bytes; + return 0; +} + +static __be32 idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) { struct ent *item, key = { .id = id, .type = type, }; int ret; + int bytes; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) - return sprintf(name, "%u", id); + return encode_ascii_id(id, p, buflen); if (ret) - return ret; + return nfserrno(ret); ret = strlen(item->name); - BUG_ON(ret > IDMAP_NAMESZ); - memcpy(name, item->name, ret); + WARN_ON_ONCE(ret > IDMAP_NAMESZ); + bytes = 4 + (XDR_QUADLEN(ret) << 2); + if (bytes > *buflen) + return nfserr_resource; + *p = xdr_encode_opaque(*p, item->name, ret); + *buflen -= bytes; cache_put(&item->h, nn->idtoname_cache); - return ret; + return 0; } static bool @@ -603,12 +622,11 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u return idmap_name_to_id(rqstp, type, name, namelen, id); } -static int -do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) +static __be32 encode_name_from_id(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) - return sprintf(name, "%u", id); - return idmap_id_to_name(rqstp, type, id, name); + return encode_ascii_id(id, p, buflen); + return idmap_id_to_name(rqstp, type, id, p, buflen); } __be32 @@ -637,16 +655,14 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, return status; } -int -nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name) +__be32 nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t uid, __be32 **p, int *buflen) { u32 id = from_kuid(&init_user_ns, uid); - return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); + return encode_name_from_id(rqstp, IDMAP_TYPE_USER, id, p, buflen); } -int -nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name) +__be32 nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t gid, __be32 **p, int *buflen) { u32 id = from_kgid(&init_user_ns, gid); - return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); + return encode_name_from_id(rqstp, IDMAP_TYPE_GROUP, id, p, buflen); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 67b44963e8a..8198ecf3c03 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1965,56 +1965,16 @@ static u32 nfs4_file_type(umode_t mode) }; } -static __be32 -nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid, - __be32 **p, int *buflen) -{ - int status; - - if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4) - return nfserr_resource; - if (whotype != NFS4_ACL_WHO_NAMED) - status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); - else if (gid_valid(gid)) - status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1)); - else - status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1)); - if (status < 0) - return nfserrno(status); - *p = xdr_encode_opaque(*p, NULL, status); - *buflen -= (XDR_QUADLEN(status) << 2) + 4; - BUG_ON(*buflen < 0); - return 0; -} - -static inline __be32 -nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen) -{ - return 
nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID, - p, buflen); -} - -static inline __be32 -nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen) -{ - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group, - p, buflen); -} - static inline __be32 nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, __be32 **p, int *buflen) { - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; - - if (ace->whotype == NFS4_ACL_WHO_NAMED) { - if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - gid = ace->who_gid; - else - uid = ace->who_uid; - } - return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen); + if (ace->whotype != NFS4_ACL_WHO_NAMED) + return nfs4_acl_write_who(ace->whotype, p, buflen); + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + return nfsd4_encode_group(rqstp, ace->who_gid, p, buflen); + else + return nfsd4_encode_user(rqstp, ace->who_uid, p, buflen); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ -- cgit v1.2.3-70-g09d2 From 1331107f8a8611f3520046f50c44af9b2102bfd2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2014 14:14:22 -0700 Subject: Revert "block: Warn and free bio if bi_end_io is not set" This reverts commit 95d4403889acbd98e06d41a255df76452210996a. The patch is broken for on-stack bios, amongst other things. --- fs/bio.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 9156bd1f151..75c49a38223 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1770,15 +1770,6 @@ void bio_endio(struct bio *bio, int error) } else { if (bio->bi_end_io) bio->bi_end_io(bio, error); - else { - char dev_name[BDEVNAME_SIZE]; - - WARN(1, "bio_endio: bio for %s without endio\n", - bio->bi_bdev ? bdevname(bio->bi_bdev, - dev_name) : "(unknown)"); - bio_put(bio); - } - bio = NULL; } } -- cgit v1.2.3-70-g09d2 From c7b22bb19a24fef1a851a41e5c0657c0c4a41550 Mon Sep 17 00:00:00 2001 From: Muthu Kumar Date: Wed, 8 Jan 2014 14:19:52 -0700 Subject: btrfs: fix missing increment of bi_remaining In btrfs_end_bio(), we increment bi_remaining if is_orig_bio. If not, we restore the orig_bio but failed to increment bi_remaining for orig_bio, which triggers a BUG_ON later when bio_endio is called. Fix is to increment bi_remaining when we restore the orig bio as well. Reported-by: Fengguang Wu CC: Kent Overstreet Signed-off-by: Muthukumar Ratty Reviewed-by: Chris Mason Tested-by: Fengguang Wu Signed-off-by: Jens Axboe --- fs/btrfs/volumes.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 37972d5db73..54d2685a307 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5297,9 +5297,14 @@ static void btrfs_end_bio(struct bio *bio, int err) if (!is_orig_bio) { bio_put(bio); bio = bbio->orig_bio; - } else { - atomic_inc(&bio->bi_remaining); } + + /* + * We have original bio now. So increment bi_remaining to + * account for it in endio + */ + atomic_inc(&bio->bi_remaining); + bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; -- cgit v1.2.3-70-g09d2 From 39849d6946a84861a517c09ca0b691ce6d98c7a6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 9 Jan 2014 16:34:04 +0000 Subject: GFS2: Add initialization for address space in super block Spotted by Andy Price. 
This should fix the odd messages from lockdep caused by 70d4ee94b370c5ef54d0870600f16bd92d18013c Signed-off-by: Steven Whitehouse Cc: Andrew Price --- fs/gfs2/ops_fstype.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 94aab31477b..3b820b672d1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -102,6 +102,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping = &sdp->sd_aspace; + address_space_init_once(mapping); mapping->a_ops = &gfs2_meta_aops; mapping->host = sb->s_bdev->bd_inode; mapping->flags = 0; -- cgit v1.2.3-70-g09d2 From 6f96b3063cdd473c68664a190524ed966ac0cd92 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Tue, 7 Jan 2014 16:53:34 +0800 Subject: xfs: Calling destroy_work_on_stack() to pair with INIT_WORK_ONSTACK() In case CONFIG_DEBUG_OBJECTS_WORK is defined, it is needed to call destroy_work_on_stack() which frees the debug object to pair with INIT_WORK_ONSTACK(). Signed-off-by: Liu, Chuansheng Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap_util.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5887e41c032..3f534e0862b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -287,6 +287,7 @@ xfs_bmapi_allocate( INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); queue_work(xfs_alloc_wq, &args->work); wait_for_completion(&done); + destroy_work_on_stack(&args->work); return args->result; } -- cgit v1.2.3-70-g09d2 From fb89b45cdfdc8bdab93986f1bc1474e313295c31 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 10 Jan 2014 13:44:09 +0100 Subject: 9P: introduction of a new cache=mmap model. - Add cache=mmap option - Make mmap read-write while keeping it as synchronous as possible - Build writeback fid on mmap creation if it is writable Signed-off-by: Dominique Martinet Signed-off-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 9 +++- fs/9p/v9fs.h | 1 + fs/9p/v9fs_vfs.h | 2 + fs/9p/vfs_addr.c | 7 +++ fs/9p/vfs_file.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/9p/vfs_inode.c | 22 +++++--- fs/9p/vfs_inode_dotl.c | 9 ++-- fs/9p/vfs_super.c | 8 +-- 8 files changed, 179 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 08f2e1e9a7e..14da82564f4 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -56,7 +56,7 @@ enum { /* Options that take no arguments */ Opt_nodevmap, /* Cache options */ - Opt_cache_loose, Opt_fscache, + Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, /* Error token */ @@ -74,6 +74,7 @@ static const match_table_t tokens = { {Opt_cache, "cache=%s"}, {Opt_cache_loose, "loose"}, {Opt_fscache, "fscache"}, + {Opt_mmap, "mmap"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, @@ -91,6 +92,9 @@ static int get_cache_mode(char *s) } else if (!strcmp(s, "fscache")) { version = CACHE_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "mmap")) { + version = CACHE_MMAP; + p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); } else if (!strcmp(s, "none")) { version = CACHE_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); @@ -220,6 +224,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_fscache: v9ses->cache = CACHE_FSCACHE; break; + case Opt_mmap: + v9ses->cache = CACHE_MMAP; + break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = match_strdup(&args[0]); diff 
--git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a8e127c8962..099c7712631 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -64,6 +64,7 @@ enum p9_session_flags { enum p9_cache_modes { CACHE_NONE, + CACHE_MMAP, CACHE_LOOSE, CACHE_FSCACHE, }; diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index dc95a252523..b83ebfbf3fd 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -50,6 +50,8 @@ extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; extern const struct file_operations v9fs_cached_file_operations; extern const struct file_operations v9fs_cached_file_operations_dotl; +extern const struct file_operations v9fs_mmap_file_operations; +extern const struct file_operations v9fs_mmap_file_operations_dotl; extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 9ff073f4090..c71e88602ff 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -202,6 +202,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) { int retval; + p9_debug(P9_DEBUG_VFS, "page %p\n", page); + retval = v9fs_vfs_writepage_locked(page); if (retval < 0) { if (retval == -EAGAIN) { @@ -282,6 +284,9 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = mapping->host; + + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + v9inode = V9FS_I(inode); start: page = grab_cache_page_write_begin(mapping, index, flags); @@ -312,6 +317,8 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct inode *inode = page->mapping->host; + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + if (unlikely(copied < len)) { /* * zero out the rest of the area diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 27782bb7f4f..a16b0ff497c 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -45,6 +45,7 @@ #include "cache.h" static const struct vm_operations_struct v9fs_file_vm_ops; +static const struct vm_operations_struct v9fs_mmap_file_vm_ops; /** * v9fs_file_open - open a file (or directory) @@ -87,7 +88,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) file->private_data = fid; mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((file->f_flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -105,7 +107,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode->writeback_fid = (void *) fid; } mutex_unlock(&v9inode->v_mutex); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); return 0; out_error: @@ -579,17 +581,55 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, } static int -v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) +v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) { int retval; - retval = generic_file_mmap(file, vma); + + retval = generic_file_mmap(filp, vma); if (!retval) vma->vm_ops = &v9fs_file_vm_ops; return retval; } +static int +v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int retval; + struct inode *inode; + struct v9fs_inode *v9inode; + struct p9_fid *fid; + + inode = file_inode(filp); + v9inode = V9FS_I(inode); + 
mutex_lock(&v9inode->v_mutex); + if (!v9inode->writeback_fid && + (vma->vm_flags & VM_WRITE)) { + /* + * clone a fid and add it to writeback_fid + * we do it during mmap instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + fid = v9fs_writeback_fid(filp->f_path.dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + mutex_unlock(&v9inode->v_mutex); + return retval; + } + v9inode->writeback_fid = (void *) fid; + } + mutex_unlock(&v9inode->v_mutex); + + retval = generic_file_mmap(filp, vma); + if (!retval) + vma->vm_ops = &v9fs_mmap_file_vm_ops; + + return retval; +} + static int v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -658,6 +698,22 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, return do_sync_read(filp, data, count, offset); } +/** + * v9fs_mmap_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count, + loff_t *offset) +{ + /* TODO: Check if there are dirty pages */ + return v9fs_file_read(filp, data, count, offset); +} + static ssize_t v9fs_direct_write(struct file *filp, const char __user * data, size_t count, loff_t *offsetp) @@ -728,12 +784,65 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, return do_sync_write(filp, data, count, offset); } + +/** + * v9fs_mmap_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_mmap_file_write(struct file *filp, const char __user *data, + size_t count, loff_t *offset) +{ + /* + * TODO: invalidate mmaps on filp's inode between + * offset and offset+count + */ + return v9fs_file_write(filp, data, count, offset); +} + +static void v9fs_mmap_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode; + + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = WB_SYNC_ALL, + .range_start = vma->vm_pgoff * PAGE_SIZE, + /* absolute end, byte at end included */ + .range_end = vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1), + }; + + + p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); + + inode = file_inode(vma->vm_file); + + if (!mapping_cap_writeback_dirty(inode->i_mapping)) + wbc.nr_to_write = 0; + + might_sleep(); + sync_inode(inode, &wbc); +} + + static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; +static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { + .close = v9fs_mmap_vm_close, + .fault = filemap_fault, + .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, +}; + const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, @@ -784,3 +893,26 @@ const struct file_operations v9fs_file_operations_dotl = { .mmap = generic_file_readonly_mmap, .fsync = v9fs_file_fsync_dotl, }; + +const struct file_operations v9fs_mmap_file_operations = { + .llseek = generic_file_llseek, + .read = v9fs_mmap_file_read, + .write = v9fs_mmap_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct 
file_operations v9fs_mmap_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_mmap_file_read, + .write = v9fs_mmap_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index af7d531bdec..bb7991c7e5c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -299,15 +299,22 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFREG: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) inode->i_fop = &v9fs_cached_file_operations_dotl; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations_dotl; else inode->i_fop = &v9fs_file_operations_dotl; } else { inode->i_op = &v9fs_file_inode_operations; - if (v9ses->cache) - inode->i_fop = &v9fs_cached_file_operations; + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) + inode->i_fop = + &v9fs_cached_file_operations; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations; else inode->i_fop = &v9fs_file_operations; } @@ -810,7 +817,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, * unlink. For cached mode create calls request for new * inode. But with cache disabled, lookup should do this. */ - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); @@ -876,7 +883,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_invalidate_inode_attr(dir); v9inode = V9FS_I(dentry->d_inode); mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -899,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, goto error; file->private_data = fid; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(dentry->d_inode, file); *opened |= FILE_CREATED; @@ -1477,7 +1485,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) */ i_size = inode->i_size; v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode->i_size = i_size; spin_unlock(&inode->i_lock); out: diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 98013068f35..59dc8e87647 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -330,7 +330,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -353,7 +354,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) goto err_clunk_old_fid; file->private_data = ofid; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); *opened |= FILE_CREATED; out: @@ -710,7 
+711,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache) { + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { @@ -965,7 +966,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) */ i_size = inode->i_size; v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode->i_size = i_size; spin_unlock(&inode->i_lock); out: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 2756dcd5de6..0afd0382822 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -144,7 +144,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } v9fs_fill_super(sb, v9ses, flags, data); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) sb->s_d_op = &v9fs_cached_dentry_operations; else sb->s_d_op = &v9fs_dentry_operations; @@ -282,7 +282,7 @@ static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) return generic_drop_inode(inode); /* * in case of non cached mode always drop the @@ -325,10 +325,12 @@ static int v9fs_write_inode_dotl(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n", + __func__, inode, v9inode->writeback_fid); if (!v9inode->writeback_fid) return 0; + ret = p9_client_fsync(v9inode->writeback_fid, 0); if (ret < 0) { __mark_inode_dirty(inode, I_DIRTY_DATASYNC); -- cgit v1.2.3-70-g09d2 From d92d2e6bd72b653f9811e0c9c46307c743b3fc58 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:18 -0500 Subject: kernfs: fix get_active failure handling in kernfs_seq_*() When kernfs_seq_start() fails to obtain an active reference, it returns ERR_PTR(-ENODEV). kernfs_seq_stop() is then invoked with the error pointer value; however, it still proceeds to invoke kernfs_put_active() on the node leading to unbalanced put. If kernfs_seq_stop() is called even after active ref failure, it should skip invocation of @ops->seq_stop() and put_active. Unfortunately, this is a bit complicated because active ref failure isn't the only thing which may fail with ERR_PTR(-ENODEV). @ops->seq_start/next() may also fail with the error value and kernfs_seq_stop() doesn't have a way to tell apart those failures. Work it around by factoring out the active part of kernfs_seq_stop() into kernfs_seq_stop_active() and invoking it directly if @ops->seq_start/next() fail with ERR_PTR(-ENODEV) and updating kernfs_seq_stop() to skip kernfs_seq_stop_active() on ERR_PTR(-ENODEV). This is a bit nasty but ensures that the active put is skipped iff get_active failed in kernfs_seq_start(). 
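For code that supplies its own seq_file callbacks through kernfs_ops, the practical rule after this change is that ->seq_stop() may be handed whatever ->seq_start()/->seq_next() returned, including NULL or an error pointer, so it must not assume a live cursor. A hypothetical iterator over a static table, with all names invented and not taken from this patch:

static const char *const example_rows[] = { "one", "two", "three" };

static void *example_seq_start(struct seq_file *sf, loff_t *ppos)
{
	if (*ppos >= ARRAY_SIZE(example_rows))
		return NULL;	/* normal end of iteration */
	return (void *)&example_rows[*ppos];
}

static void *example_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
{
	++*ppos;
	return example_seq_start(sf, ppos);
}

static void example_seq_stop(struct seq_file *sf, void *v)
{
	/* v may be NULL or an ERR_PTR() from a failed start/next; nothing
	 * to release here, so simply tolerate it. */
}

static int example_seq_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%s\n", *(const char *const *)v);
	return 0;
}

static const struct kernfs_ops example_kernfs_ops = {
	.seq_start = example_seq_start,
	.seq_next = example_seq_next,
	.seq_stop = example_seq_stop,
	.seq_show = example_seq_show,
};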
Signed-off-by: Tejun Heo Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 316604cc3a1..bdd38854ef6 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -54,6 +54,38 @@ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) return kn->attr.ops; } +/* + * As kernfs_seq_stop() is also called after kernfs_seq_start() or + * kernfs_seq_next() failure, it needs to distinguish whether it's stopping + * a seq_file iteration which is fully initialized with an active reference + * or an aborted kernfs_seq_start() due to get_active failure. The + * position pointer is the only context for each seq_file iteration and + * thus the stop condition should be encoded in it. As the return value is + * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable + * choice to indicate get_active failure. + * + * Unfortunately, this is complicated due to the optional custom seq_file + * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() + * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or + * custom seq_file operations and thus can't decide whether put_active + * should be performed or not only on ERR_PTR(-ENODEV). + * + * This is worked around by factoring out the custom seq_stop() and + * put_active part into kernfs_seq_stop_active(), skipping it from + * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after + * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures + * that kernfs_seq_stop_active() is skipped only after get_active failure. + */ +static void kernfs_seq_stop_active(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + kernfs_put_active(of->kn); +} + static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; @@ -69,7 +101,11 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) ops = kernfs_ops(of->kn); if (ops->seq_start) { - return ops->seq_start(sf, ppos); + void *next = ops->seq_start(sf, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; } else { /* * The same behavior and code as single_open(). 
Returns @@ -85,7 +121,11 @@ static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { - return ops->seq_next(sf, v, ppos); + void *next = ops->seq_next(sf, v, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; } else { /* * The same behavior and code as single_open(), always @@ -99,12 +139,9 @@ static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; - const struct kernfs_ops *ops = kernfs_ops(of->kn); - if (ops->seq_stop) - ops->seq_stop(sf, v); - - kernfs_put_active(of->kn); + if (v != ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } -- cgit v1.2.3-70-g09d2 From ea1c472dfeada211a0100daa7976e8e8e779b858 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:19 -0500 Subject: kernfs: replace kernfs_node->u.completion with kernfs_root->deactivate_waitq kernfs_node->u.completion is used to notify deactivation completion from kernfs_put_active() to kernfs_deactivate(). We now allow multiple racing removals of the same node and the current removal scheme is no longer correct - kernfs_remove() invocation may return before the node is properly deactivated if it races against another removal. The removal path will be restructured to address the issue. To help such restructure which requires supporting multiple waiters, this patch replaces kernfs_node->u.completion with kernfs_root->deactivate_waitq. This makes deactivation event notifications share a per-root waitqueue_head; however, the wait path is quite cold and this will also allow shaving one pointer off kernfs_node. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 27 +++++++++++---------------- include/linux/kernfs.h | 4 ++-- 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 510b5062ef3..ed62de6cdf8 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -8,6 +8,7 @@ * This file is released under the GPLv2. */ +#include #include #include #include @@ -151,6 +152,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) */ void kernfs_put_active(struct kernfs_node *kn) { + struct kernfs_root *root = kernfs_root(kn); int v; if (unlikely(!kn)) @@ -162,11 +164,7 @@ void kernfs_put_active(struct kernfs_node *kn) if (likely(v != KN_DEACTIVATED_BIAS)) return; - /* - * atomic_dec_return() is a mb(), we'll always see the updated - * kn->u.completion. - */ - complete(kn->u.completion); + wake_up_all(&root->deactivate_waitq); } /** @@ -177,26 +175,22 @@ void kernfs_put_active(struct kernfs_node *kn) */ static void kernfs_deactivate(struct kernfs_node *kn) { - DECLARE_COMPLETION_ONSTACK(wait); - int v; + struct kernfs_root *root = kernfs_root(kn); BUG_ON(!(kn->flags & KERNFS_REMOVED)); if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) return; - kn->u.completion = (void *)&wait; - rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); - /* atomic_add_return() is a mb(), put_active() will always see - * the updated kn->u.completion. 
- */ - v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); - if (v != KN_DEACTIVATED_BIAS) { + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + + if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); - wait_for_completion(&wait); - } + + wait_event(root->deactivate_waitq, + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, 1, _RET_IP_); @@ -613,6 +607,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) root->dir_ops = kdops; root->kn = kn; + init_waitqueue_head(&root->deactivate_waitq); return root; } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d2c439db4ef..232f1a63238 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include struct file; struct iattr; @@ -91,7 +91,6 @@ struct kernfs_node { struct rb_node rb; union { - struct completion *completion; struct kernfs_node *removed_list; } u; @@ -132,6 +131,7 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct ida ino_ida; struct kernfs_dir_ops *dir_ops; + wait_queue_head_t deactivate_waitq; }; struct kernfs_open_file { -- cgit v1.2.3-70-g09d2 From a69d001cfc712b96ec9d7ba44d6285702a38dabf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:20 -0500 Subject: kernfs: remove KERNFS_ACTIVE_REF and add kernfs_lockdep() There currently are two mechanisms gating active ref lockdep annotations - KERNFS_LOCKDEP flag and KERNFS_ACTIVE_REF type mask. The former disables lockdep annotations in kernfs_get/put_active() while the latter disables all of kernfs_deactivate(). While KERNFS_ACTIVE_REF also behaves as an optimization to skip the deactivation step for non-file nodes, the benefit is marginal and it needlessly diverges code paths. Let's drop KERNFS_ACTIVE_REF and use KERNFS_LOCKDEP in kernfs_deactivate() too. While at it, add a test helper kernfs_lockdep() to test KERNFS_LOCKDEP flag so that it's more convenient and the related code can be compiled out when not enabled. 
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 31 ++++++++++++++++++++----------- include/linux/kernfs.h | 1 - 2 files changed, 20 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index ed62de6cdf8..1c9130a3304 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -22,6 +22,15 @@ DEFINE_MUTEX(kernfs_mutex); #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) +static bool kernfs_lockdep(struct kernfs_node *kn) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + return kn->flags & KERNFS_LOCKDEP; +#else + return false; +#endif +} + /** * kernfs_name_hash * @name: Null terminated string to hash @@ -138,7 +147,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) if (!atomic_inc_unless_negative(&kn->active)) return NULL; - if (kn->flags & KERNFS_LOCKDEP) + if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } @@ -158,7 +167,7 @@ void kernfs_put_active(struct kernfs_node *kn) if (unlikely(!kn)) return; - if (kn->flags & KERNFS_LOCKDEP) + if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, 1, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) @@ -179,21 +188,21 @@ static void kernfs_deactivate(struct kernfs_node *kn) BUG_ON(!(kn->flags & KERNFS_REMOVED)); - if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) - return; - - rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); - atomic_add(KN_DEACTIVATED_BIAS, &kn->active); - if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) - lock_contended(&kn->dep_map, _RET_IP_); + if (kernfs_lockdep(kn)) { + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) + lock_contended(&kn->dep_map, _RET_IP_); + } wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); - lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); + if (kernfs_lockdep(kn)) { + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); + } } /** diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 232f1a63238..42ad32ff22f 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -34,7 +34,6 @@ enum kernfs_node_type { }; #define KERNFS_TYPE_MASK 0x000f -#define KERNFS_ACTIVE_REF KERNFS_FILE #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { -- cgit v1.2.3-70-g09d2 From ae34372eb8408b3d07e870f1939f99007a730d28 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:21 -0500 Subject: kernfs: remove KERNFS_REMOVED KERNFS_REMOVED is used to mark half-initialized and dying nodes so that they don't show up in lookups and deny adding new nodes under or renaming it; however, its role overlaps those of deactivation and removal from rbtree. It's necessary to deny addition of new children while removal is in progress; however, this role considerably intersects with deactivation - KERNFS_REMOVED prevents new children while deactivation prevents new file operations. There's no reason to have them separate making things more complex than necessary. KERNFS_REMOVED is also used to decide whether a node is still visible to vfs layer, which is rather redundant as equivalent determination can be made by testing whether the node is on its parent's children rbtree or not. This patch removes KERNFS_REMOVED. * Instead of KERNFS_REMOVED, each node now starts its life deactivated. This means that we now use both atomic_add() and atomic_sub() on KN_DEACTIVATED_BIAS, which is INT_MIN. 
The compiler generates an overflow warnings when negating INT_MIN as the negation can't be represented as a positive number. Nothing is actually broken but let's bump BIAS by one to avoid the warnings for archs which negates the subtrahend.. * KERNFS_REMOVED tests in add and rename paths are replaced with kernfs_get/put_active() of the target nodes. Due to the way the add path is structured now, active ref handling is done in the callers of kernfs_add_one(). This will be consolidated up later. * kernfs_remove_one() is updated to deactivate instead of setting KERNFS_REMOVED. This removes deactivation from kernfs_deactivate(), which is now renamed to kernfs_drain(). * kernfs_dop_revalidate() now tests RB_EMPTY_NODE(&kn->rb) instead of KERNFS_REMOVED and KERNFS_REMOVED test in kernfs_dir_pos() is dropped. A node which is removed from the children rbtree is not included in the iteration in the first place. This means that a node may be visible through vfs a bit longer - it's now also visible after deactivation until the actual removal. This slightly enlarged window difference doesn't make any difference to the userland. * Sanity check on KERNFS_REMOVED in kernfs_put() is replaced with checks on the active ref. * Some comment style updates in the affected area. v2: Reordered before removal path restructuring. kernfs_active() dropped and kernfs_get/put_active() used instead. RB_EMPTY_NODE() used in the lookup paths. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 79 +++++++++++++++++++++++++-------------------- fs/kernfs/file.c | 10 ++++-- fs/kernfs/kernfs-internal.h | 3 +- fs/kernfs/symlink.c | 10 ++++-- include/linux/kernfs.h | 1 - 5 files changed, 60 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1c9130a3304..7f8afc1d08f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -127,6 +127,7 @@ static void kernfs_unlink_sibling(struct kernfs_node *kn) kn->parent->dir.subdirs--; rb_erase(&kn->rb, &kn->parent->dir.children); + RB_CLEAR_NODE(&kn->rb); } /** @@ -177,18 +178,16 @@ void kernfs_put_active(struct kernfs_node *kn) } /** - * kernfs_deactivate - deactivate kernfs_node - * @kn: kernfs_node to deactivate + * kernfs_drain - drain kernfs_node + * @kn: kernfs_node to drain * - * Deny new active references and drain existing ones. + * Drain existing usages. */ -static void kernfs_deactivate(struct kernfs_node *kn) +static void kernfs_drain(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); - BUG_ON(!(kn->flags & KERNFS_REMOVED)); - - atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + WARN_ON_ONCE(atomic_read(&kn->active) >= 0); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -233,13 +232,15 @@ void kernfs_put(struct kernfs_node *kn) return; root = kernfs_root(kn); repeat: - /* Moving/renaming is always done while holding reference. + /* + * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kn->parent; - WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", - parent ? parent->name : "", kn->name); + WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, + "kernfs_put: %s/%s: released with incorrect active_ref %d\n", + parent ? 
parent->name : "", kn->name, atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); @@ -281,8 +282,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kn = dentry->d_fsdata; mutex_lock(&kernfs_mutex); - /* The kernfs node has been deleted */ - if (kn->flags & KERNFS_REMOVED) + /* Force fresh lookup if removed */ + if (kn->parent && RB_EMPTY_NODE(&kn->rb)) goto out_bad; /* The kernfs node has been moved? */ @@ -350,11 +351,12 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, kn->ino = ret; atomic_set(&kn->count, 1); - atomic_set(&kn->active, 0); + atomic_set(&kn->active, KN_DEACTIVATED_BIAS); + RB_CLEAR_NODE(&kn->rb); kn->name = name; kn->mode = mode; - kn->flags = flags | KERNFS_REMOVED; + kn->flags = flags; return kn; @@ -413,6 +415,8 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, struct kernfs_iattrs *ps_iattr; int ret; + WARN_ON_ONCE(atomic_read(&parent->active) < 0); + if (has_ns != (bool)kn->ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, kn->name); @@ -422,9 +426,6 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, if (kernfs_type(parent) != KERNFS_DIR) return -EINVAL; - if (parent->flags & KERNFS_REMOVED) - return -ENOENT; - kn->hash = kernfs_name_hash(kn->name, kn->ns); kn->parent = parent; kernfs_get(parent); @@ -441,8 +442,7 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, } /* Mark the entry added into directory tree */ - kn->flags &= ~KERNFS_REMOVED; - + atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); return 0; } @@ -470,7 +470,7 @@ static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, * Removal can be called multiple times on the same node. Only the * first invocation is effective and puts the base ref. 
*/ - if (kn->flags & KERNFS_REMOVED) + if (atomic_read(&kn->active) < 0) return; if (kn->parent) { @@ -484,7 +484,7 @@ static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, } } - kn->flags |= KERNFS_REMOVED; + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); kn->u.removed_list = acxt->removed; acxt->removed = kn; } @@ -512,7 +512,7 @@ void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) acxt->removed = kn->u.removed_list; - kernfs_deactivate(kn); + kernfs_drain(kn); kernfs_unmap_bin_file(kn); kernfs_put(kn); } @@ -610,7 +610,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) return ERR_PTR(-ENOMEM); } - kn->flags &= ~KERNFS_REMOVED; + atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); kn->priv = priv; kn->dir.root = root; @@ -662,9 +662,13 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, kn->priv = priv; /* link in */ - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); + rc = -ENOENT; + if (kernfs_get_active(parent)) { + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); + kernfs_put_active(parent); + } if (!rc) return kn; @@ -899,27 +903,29 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, { int error; - mutex_lock(&kernfs_mutex); - error = -ENOENT; - if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) + if (!kernfs_get_active(new_parent)) goto out; + if (!kernfs_get_active(kn)) + goto out_put_new_parent; + + mutex_lock(&kernfs_mutex); error = 0; if ((kn->parent == new_parent) && (kn->ns == new_ns) && (strcmp(kn->name, new_name) == 0)) - goto out; /* nothing to rename */ + goto out_unlock; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) - goto out; + goto out_unlock; /* rename kernfs_node */ if (strcmp(kn->name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) - goto out; + goto out_unlock; if (kn->flags & KERNFS_STATIC_NAME) kn->flags &= ~KERNFS_STATIC_NAME; @@ -941,8 +947,12 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kernfs_link_sibling(kn); error = 0; - out: +out_unlock: mutex_unlock(&kernfs_mutex); + kernfs_put_active(kn); +out_put_new_parent: + kernfs_put_active(new_parent); +out: return error; } @@ -962,8 +972,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { - int valid = !(pos->flags & KERNFS_REMOVED) && - pos->parent == parent && hash == pos->hash; + int valid = pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index bdd38854ef6..231a171f48b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -856,9 +856,13 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); + rc = -ENOENT; + if (kernfs_get_active(parent)) { + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); + kernfs_put_active(parent); + } if (rc) { kernfs_put(kn); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index c6ba5bc37a9..57a93f4d645 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,7 +26,8 @@ struct kernfs_iattrs { struct simple_xattrs xattrs; }; -#define 
KN_DEACTIVATED_BIAS INT_MIN +/* +1 to avoid triggering overflow warning when negating it */ +#define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index a03e26036ef..b2c106ca343 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -40,9 +40,13 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ - kernfs_addrm_start(&acxt); - error = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); + error = -ENOENT; + if (kernfs_get_active(parent)) { + kernfs_addrm_start(&acxt); + error = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); + kernfs_put_active(parent); + } if (!error) return kn; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 42ad32ff22f..289d4f639ad 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -37,7 +37,6 @@ enum kernfs_node_type { #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { - KERNFS_REMOVED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, -- cgit v1.2.3-70-g09d2 From 45a140e587f3d32d8d424ed940dffb61e1739047 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:22 -0500 Subject: kernfs: restructure removal path to fix possible premature return The recursive nature of kernfs_remove() means that, even if kernfs_remove() is not allowed to be called multiple times on the same node, there may be race conditions between removal of parent and its descendants. While we can claim that kernfs_remove() shouldn't be called on one of the descendants while the removal of an ancestor is in progress, such rule is unnecessarily restrictive and very difficult to enforce. It's better to simply allow invoking kernfs_remove() as the caller sees fit as long as the caller ensures that the node is accessible. The current behavior in such situations is broken. Whoever enters removal path first takes the node off the hierarchy and then deactivates. Following removers either return as soon as it notices that it's not the first one or can't even find the target node as it has already been removed from the hierarchy. In both cases, the following removers may finish prematurely while the nodes which should be removed and drained are still being processed by the first one. This patch restructures so that multiple removers, whether through recursion or direction invocation, always follow the following rules. * When there are multiple concurrent removers, only one puts the base ref. * Regardless of which one puts the base ref, all removers are blocked until the target node is fully deactivated and removed. To achieve the above, removal path now first deactivates the subtree, drains it and then unlinks one-by-one. __kernfs_deactivate() is called directly from __kernfs_removal() and drops and regrabs kernfs_mutex for each descendant to drain active refs. As this means that multiple removers can enter __kernfs_deactivate() for the same node, the function is updated so that it can handle multiple deactivators of the same node - only one actually deactivates but all wait till drain completion. The restructured removal path guarantees that a removed node gets unlinked only after the node is deactivated and drained. 
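To make the drain-and-restart logic described above concrete, here is a rough pthread-based model (editorial illustration only; all names are invented, and the real code uses kernfs_mutex plus a waitqueue rather than a condition variable):

	#include <pthread.h>
	#include <stdbool.h>

	struct node { int active; int bias; };	/* fully drained once active == bias */

	static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  drain_wq  = PTHREAD_COND_INITIALIZER;

	/*
	 * Called with tree_lock held.  Returns true if the lock was dropped
	 * while waiting, telling the caller to restart its subtree walk
	 * because the tree may have changed in between.
	 */
	static bool drain(struct node *n)
	{
		bool waited = false;

		while (n->active != n->bias) {
			waited = true;
			pthread_cond_wait(&drain_wq, &tree_lock);	/* drops tree_lock */
		}
		return waited;
	}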
Combined with proper multiple deactivator handling, this guarantees that any invocation of kernfs_remove() returns only after the node itself and all its descendants are deactivated, drained and removed. v2: Draining separated into a separate loop (used to be in the same loop as unlink) and done from __kernfs_deactivate(). This is to allow exposing deactivation as a separate interface later. Root node removal was broken in v1 patch. Fixed. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 139 ++++++++++++++++++++++++++++++------------------- include/linux/kernfs.h | 1 + 2 files changed, 87 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7f8afc1d08f..e565ec096ae 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -181,14 +181,38 @@ void kernfs_put_active(struct kernfs_node *kn) * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * - * Drain existing usages. + * Drain existing usages of @kn. Mutiple removers may invoke this function + * concurrently on @kn and all will return after draining is complete. + * Returns %true if drain is performed and kernfs_mutex was temporarily + * released. %false if @kn was already drained and no operation was + * necessary. + * + * The caller is responsible for ensuring @kn stays pinned while this + * function is in progress even if it gets removed by someone else. */ -static void kernfs_drain(struct kernfs_node *kn) +static bool kernfs_drain(struct kernfs_node *kn) + __releases(&kernfs_mutex) __acquires(&kernfs_mutex) { struct kernfs_root *root = kernfs_root(kn); + lockdep_assert_held(&kernfs_mutex); WARN_ON_ONCE(atomic_read(&kn->active) >= 0); + /* + * We want to go through the active ref lockdep annotation at least + * once for all node removals, but the lockdep annotation can't be + * nested inside kernfs_mutex and deactivation can't make forward + * progress if we keep dropping the mutex. Use JUST_ACTIVATED to + * force the slow path once for each deactivation if lockdep is + * enabled. + */ + if ((!kernfs_lockdep(kn) || !(kn->flags & KERNFS_JUST_DEACTIVATED)) && + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) + return false; + + kn->flags &= ~KERNFS_JUST_DEACTIVATED; + mutex_unlock(&kernfs_mutex); + if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) @@ -202,6 +226,9 @@ static void kernfs_drain(struct kernfs_node *kn) lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, 1, _RET_IP_); } + + mutex_lock(&kernfs_mutex); + return true; } /** @@ -446,49 +473,6 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, return 0; } -/** - * kernfs_remove_one - remove kernfs_node from parent - * @acxt: addrm context to use - * @kn: kernfs_node to be removed - * - * Mark @kn removed and drop nlink of parent inode if @kn is a - * directory. @kn is unlinked from the children list. - * - * This function should be called between calls to - * kernfs_addrm_start() and kernfs_addrm_finish() and should be - * passed the same @acxt as passed to kernfs_addrm_start(). - * - * LOCKING: - * Determined by kernfs_addrm_start(). - */ -static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, - struct kernfs_node *kn) -{ - struct kernfs_iattrs *ps_iattr; - - /* - * Removal can be called multiple times on the same node. Only the - * first invocation is effective and puts the base ref. 
- */ - if (atomic_read(&kn->active) < 0) - return; - - if (kn->parent) { - kernfs_unlink_sibling(kn); - - /* Update timestamps on the parent */ - ps_iattr = kn->parent->iattr; - if (ps_iattr) { - ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; - ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; - } - } - - atomic_add(KN_DEACTIVATED_BIAS, &kn->active); - kn->u.removed_list = acxt->removed; - acxt->removed = kn; -} - /** * kernfs_addrm_finish - finish up kernfs_node add/remove * @acxt: addrm context to finish up @@ -512,7 +496,6 @@ void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) acxt->removed = kn->u.removed_list; - kernfs_drain(kn); kernfs_unmap_bin_file(kn); kernfs_put(kn); } @@ -822,23 +805,73 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return pos->parent; } +static void __kernfs_deactivate(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + lockdep_assert_held(&kernfs_mutex); + + /* prevent any new usage under @kn by deactivating all nodes */ + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) { + if (atomic_read(&pos->active) >= 0) { + atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + pos->flags |= KERNFS_JUST_DEACTIVATED; + } + } + + /* + * Drain the subtree. If kernfs_drain() blocked to drain, which is + * indicated by %true return, it temporarily released kernfs_mutex + * and the rbtree might have been modified inbetween breaking our + * future walk. Restart the walk after each %true return. + */ + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) { + bool drained; + + kernfs_get(pos); + drained = kernfs_drain(pos); + kernfs_put(pos); + if (drained) + pos = NULL; + } +} + static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) { - struct kernfs_node *pos, *next; + struct kernfs_node *pos; + + lockdep_assert_held(&kernfs_mutex); if (!kn) return; pr_debug("kernfs %s: removing\n", kn->name); - next = NULL; + __kernfs_deactivate(kn); + + /* unlink the subtree node-by-node */ do { - pos = next; - next = kernfs_next_descendant_post(pos, kn); - if (pos) - kernfs_remove_one(acxt, pos); - } while (next); + struct kernfs_iattrs *ps_iattr; + + pos = kernfs_leftmost_descendant(kn); + + if (pos->parent) { + kernfs_unlink_sibling(pos); + + /* update timestamps on the parent */ + ps_iattr = pos->parent->iattr; + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + } + + pos->u.removed_list = acxt->removed; + acxt->removed = pos; + } while (pos != kn); } /** diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 289d4f639ad..61bd7ae6b8e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -37,6 +37,7 @@ enum kernfs_node_type { #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { + KERNFS_JUST_DEACTIVATED = 0x0010, /* used to aid lockdep annotation */ KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, -- cgit v1.2.3-70-g09d2 From f601f9a2bf7dc1f7ee18feece4c4e2fc6845d6c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:23 -0500 Subject: kernfs: invoke kernfs_unmap_bin_file() directly from __kernfs_remove() kernfs_unmap_bin_file() is supposed to unmap all memory mappings of the target file before kernfs_remove() finishes; however, it currently is being called from kernfs_addrm_finish() and has the same race problem as the original implementation of deactivation when there are multiple removers - only the remover which snatches the node to its 
addrm_cxt->removed list is guaranteed to wait for its completion before returning. It can be fixed by moving kernfs_unmap_bin_file() invocation from kernfs_addrm_finish() to __kernfs_remove(). The function may be called multiple times but that shouldn't do any harm. We end up dropping kernfs_mutex in the removal loop and the node may be removed inbetween by someone else. kernfs_unlink_sibling() is updated to test whether the node has already been removed and return accordingly. __kernfs_remove() in turn performs post-unlinking cleanup only if it actually unlinked the node. KERNFS_HAS_MMAP test is moved out of the unmap function into __kernfs_remove() so that we don't unlock kernfs_mutex unnecessarily. While at it, drop the now meaningless "bin" qualifier from the function name. v2: Rewritten to fit the v2 restructuring of removal path. HAS_MMAP test relocated. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 42 +++++++++++++++++++++++++++++++++--------- fs/kernfs/file.c | 5 +---- fs/kernfs/kernfs-internal.h | 2 +- 3 files changed, 35 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e565ec096ae..f878e4f2efe 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -121,13 +121,17 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * Locking: * mutex_lock(kernfs_mutex) */ -static void kernfs_unlink_sibling(struct kernfs_node *kn) +static bool kernfs_unlink_sibling(struct kernfs_node *kn) { + if (RB_EMPTY_NODE(&kn->rb)) + return false; + if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); + return true; } /** @@ -496,7 +500,6 @@ void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) acxt->removed = kn->u.removed_list; - kernfs_unmap_bin_file(kn); kernfs_put(kn); } } @@ -854,23 +857,44 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, /* unlink the subtree node-by-node */ do { - struct kernfs_iattrs *ps_iattr; - pos = kernfs_leftmost_descendant(kn); - if (pos->parent) { - kernfs_unlink_sibling(pos); + /* + * We're gonna release kernfs_mutex to unmap bin files, + * Make sure @pos doesn't go away inbetween. + */ + kernfs_get(pos); + + /* + * This must be come before unlinking; otherwise, when + * there are multiple removers, some may finish before + * unmapping is complete. + */ + if (pos->flags & KERNFS_HAS_MMAP) { + mutex_unlock(&kernfs_mutex); + kernfs_unmap_file(pos); + mutex_lock(&kernfs_mutex); + } + + /* + * kernfs_unlink_sibling() succeeds once per node. Use it + * to decide who's responsible for cleanups. + */ + if (!pos->parent || kernfs_unlink_sibling(pos)) { + struct kernfs_iattrs *ps_iattr = + pos->parent ? 
pos->parent->iattr : NULL; /* update timestamps on the parent */ - ps_iattr = pos->parent->iattr; if (ps_iattr) { ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; } + + pos->u.removed_list = acxt->removed; + acxt->removed = pos; } - pos->u.removed_list = acxt->removed; - acxt->removed = pos; + kernfs_put(pos); } while (pos != kn); } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 231a171f48b..404ffd2f27b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -700,14 +700,11 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) return 0; } -void kernfs_unmap_bin_file(struct kernfs_node *kn) +void kernfs_unmap_file(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; - if (!(kn->flags & KERNFS_HAS_MMAP)) - return; - spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 57a93f4d645..e9ec38c8607 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -113,7 +113,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, */ extern const struct file_operations kernfs_file_fops; -void kernfs_unmap_bin_file(struct kernfs_node *kn); +void kernfs_unmap_file(struct kernfs_node *kn); /* * symlink.c -- cgit v1.2.3-70-g09d2 From 99177a34110889a8f2c36420c34e3bcc9bfd8a70 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:24 -0500 Subject: kernfs: remove kernfs_addrm_cxt kernfs_addrm_cxt and the accompanying kernfs_addrm_start/finish() were added because there were operations which should be performed outside kernfs_mutex after adding and removing kernfs_nodes. The necessary operations were recorded in kernfs_addrm_cxt and performed by kernfs_addrm_finish(); however, after the recent changes which relocated deactivation and unmapping so that they're performed directly during removal, the only operation kernfs_addrm_finish() performs is kernfs_put(), which can be moved inside the removal path too. This patch moves the kernfs_put() of the base ref to __kernfs_remove() and remove kernfs_addrm_cxt and kernfs_addrm_start/finish(). * kernfs_add_one() is updated to grab and release the parent's active ref and kernfs_mutex itself. kernfs_get/put_active() and kernfs_addrm_start/finish() invocations around it are removed from all users. * __kernfs_remove() puts an unlinked node directly instead of chaining it to kernfs_addrm_cxt. Its callers are updated to grab and release kernfs_mutex instead of calling kernfs_addrm_start/finish() around it. v2: Updated to fit the v2 restructuring of removal path. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 114 ++++++++++---------------------------------- fs/kernfs/file.c | 10 +--- fs/kernfs/kernfs-internal.h | 12 +---- fs/kernfs/symlink.c | 10 +--- include/linux/kernfs.h | 4 -- 5 files changed, 29 insertions(+), 121 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index f878e4f2efe..770d687ee9f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -398,29 +398,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, return NULL; } -/** - * kernfs_addrm_start - prepare for kernfs_node add/remove - * @acxt: pointer to kernfs_addrm_cxt to be used - * - * This function is called when the caller is about to add or remove - * kernfs_node. This function acquires kernfs_mutex. @acxt is used - * to keep and pass context to other addrm functions. 
- * - * LOCKING: - * Kernel thread context (may sleep). kernfs_mutex is locked on - * return. - */ -void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) - __acquires(kernfs_mutex) -{ - memset(acxt, 0, sizeof(*acxt)); - - mutex_lock(&kernfs_mutex); -} - /** * kernfs_add_one - add kernfs_node to parent without warning - * @acxt: addrm context to use * @kn: kernfs_node to be added * @parent: the parent kernfs_node to add @kn to * @@ -428,34 +407,29 @@ void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) * parent inode if @kn is a directory and link into the children list * of the parent. * - * This function should be called between calls to - * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed - * the same @acxt as passed to kernfs_addrm_start(). - * - * LOCKING: - * Determined by kernfs_addrm_start(). - * * RETURNS: * 0 on success, -EEXIST if entry with the given name already * exists. */ -int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, - struct kernfs_node *parent) +int kernfs_add_one(struct kernfs_node *kn, struct kernfs_node *parent) { - bool has_ns = kernfs_ns_enabled(parent); struct kernfs_iattrs *ps_iattr; + bool has_ns; int ret; - WARN_ON_ONCE(atomic_read(&parent->active) < 0); + if (!kernfs_get_active(parent)) + return -ENOENT; - if (has_ns != (bool)kn->ns) { - WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", parent->name, kn->name); - return -EINVAL; - } + mutex_lock(&kernfs_mutex); + + ret = -EINVAL; + has_ns = kernfs_ns_enabled(parent); + if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name)) + goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) - return -EINVAL; + goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); kn->parent = parent; @@ -463,7 +437,7 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, ret = kernfs_link_sibling(kn); if (ret) - return ret; + goto out_unlock; /* Update timestamps on the parent */ ps_iattr = parent->iattr; @@ -474,34 +448,11 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, /* Mark the entry added into directory tree */ atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); - return 0; -} - -/** - * kernfs_addrm_finish - finish up kernfs_node add/remove - * @acxt: addrm context to finish up - * - * Finish up kernfs_node add/remove. Resources acquired by - * kernfs_addrm_start() are released and removed kernfs_nodes are - * cleaned up. - * - * LOCKING: - * kernfs_mutex is released. 
- */ -void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) - __releases(kernfs_mutex) -{ - /* release resources acquired by kernfs_addrm_start() */ + ret = 0; +out_unlock: mutex_unlock(&kernfs_mutex); - - /* kill removed kernfs_nodes */ - while (acxt->removed) { - struct kernfs_node *kn = acxt->removed; - - acxt->removed = kn->u.removed_list; - - kernfs_put(kn); - } + kernfs_put_active(parent); + return ret; } /** @@ -633,7 +584,6 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, void *priv, const void *ns) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; int rc; @@ -648,14 +598,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, kn->priv = priv; /* link in */ - rc = -ENOENT; - if (kernfs_get_active(parent)) { - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); - kernfs_put_active(parent); - } - + rc = kernfs_add_one(kn, parent); if (!rc) return kn; @@ -841,8 +784,7 @@ static void __kernfs_deactivate(struct kernfs_node *kn) } } -static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, - struct kernfs_node *kn) +static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; @@ -890,8 +832,7 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; } - pos->u.removed_list = acxt->removed; - acxt->removed = pos; + kernfs_put(pos); } kernfs_put(pos); @@ -906,11 +847,9 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, */ void kernfs_remove(struct kernfs_node *kn) { - struct kernfs_addrm_cxt acxt; - - kernfs_addrm_start(&acxt); - __kernfs_remove(&acxt, kn); - kernfs_addrm_finish(&acxt); + mutex_lock(&kernfs_mutex); + __kernfs_remove(kn); + mutex_unlock(&kernfs_mutex); } /** @@ -925,7 +864,6 @@ void kernfs_remove(struct kernfs_node *kn) int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; if (!parent) { @@ -934,13 +872,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - kernfs_addrm_start(&acxt); + mutex_lock(&kernfs_mutex); kn = kernfs_find_ns(parent, name, ns); if (kn) - __kernfs_remove(&acxt, kn); + __kernfs_remove(kn); - kernfs_addrm_finish(&acxt); + mutex_unlock(&kernfs_mutex); if (kn) return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 404ffd2f27b..ffe1bebf919 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -817,7 +817,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, bool name_is_static, struct lock_class_key *key) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; unsigned flags; int rc; @@ -853,14 +852,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; - rc = -ENOENT; - if (kernfs_get_active(parent)) { - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); - kernfs_put_active(parent); - } - + rc = kernfs_add_one(kn, parent); if (rc) { kernfs_put(kn); return ERR_PTR(rc); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index e9ec38c8607..4bc57848076 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -45,13 +45,6 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) return kn->dir.root; } -/* - * Context structure to be used while adding/removing nodes. 
- */ -struct kernfs_addrm_cxt { - struct kernfs_node *removed; -}; - /* * mount.c */ @@ -101,10 +94,7 @@ extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); -void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); -int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, - struct kernfs_node *parent); -void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); +int kernfs_add_one(struct kernfs_node *kn, struct kernfs_node *parent); struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, unsigned flags); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index b2c106ca343..3a939c263ed 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -27,7 +27,6 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, struct kernfs_node *target) { struct kernfs_node *kn; - struct kernfs_addrm_cxt acxt; int error; kn = kernfs_new_node(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, @@ -40,14 +39,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ - error = -ENOENT; - if (kernfs_get_active(parent)) { - kernfs_addrm_start(&acxt); - error = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); - kernfs_put_active(parent); - } - + error = kernfs_add_one(kn, parent); if (!error) return kn; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 61bd7ae6b8e..9b5a4bb88c6 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -89,10 +89,6 @@ struct kernfs_node { struct rb_node rb; - union { - struct kernfs_node *removed_list; - } u; - const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ union { -- cgit v1.2.3-70-g09d2 From 895a068a524e134900b9d98b519309b7aae7bbb1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:25 -0500 Subject: kernfs: make kernfs_get_active() block if the node is deactivated but not removed Currently, kernfs_get_active() fails if the target node is deactivated. This is fine as a node always gets removed after deactivation; however, we're gonna add reactivation so the assumption won't hold. It'd be incorrect for kernfs_get_active() to fail for a node which was deactivated only temporarily. This patch makes kernfs_get_active() block if the node is deactivated but not removed. If the node gets reactivated (not yet implemented), it will be retried and succeed. If the node gets removed, it will be woken up and fail. Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 770d687ee9f..37dd6408f5f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -149,12 +149,25 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) if (unlikely(!kn)) return NULL; - if (!atomic_inc_unless_negative(&kn->active)) - return NULL; - if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); - return kn; + + /* + * Try to obtain an active ref. If @kn is deactivated, we block + * till either it's reactivated or killed. 
+ */ + do { + if (atomic_inc_unless_negative(&kn->active)) + return kn; + + wait_event(kernfs_root(kn)->deactivate_waitq, + atomic_read(&kn->active) >= 0 || + RB_EMPTY_NODE(&kn->rb)); + } while (!RB_EMPTY_NODE(&kn->rb)); + + if (kernfs_lockdep(kn)) + rwsem_release(&kn->dep_map, 1, _RET_IP_); + return NULL; } /** @@ -786,6 +799,7 @@ static void __kernfs_deactivate(struct kernfs_node *kn) static void __kernfs_remove(struct kernfs_node *kn) { + struct kernfs_root *root = kernfs_root(kn); struct kernfs_node *pos; lockdep_assert_held(&kernfs_mutex); @@ -837,6 +851,9 @@ static void __kernfs_remove(struct kernfs_node *kn) kernfs_put(pos); } while (pos != kn); + + /* some nodes killed, kick get_active waiters */ + wake_up_all(&root->deactivate_waitq); } /** -- cgit v1.2.3-70-g09d2 From 9f010c2ad5194a4b682e747984477850fabd03be Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:26 -0500 Subject: kernfs: implement kernfs_{de|re}activate[_self]() This patch implements four functions to manipulate deactivation state - deactivate, reactivate and the _self suffixed pair. A new fields kernfs_node->deact_depth is added so that concurrent and nested deactivations are handled properly. kernfs_node->hash is moved so that it's paired with the new field so that it doesn't increase the size of kernfs_node. A kernfs user's lock would normally nest inside active ref but during removal the user may want to perform kernfs_remove() while holding the said lock, which would introduce a reverse locking dependency. This function can be used to break such reverse dependency by allowing deactivation step to performed separately outside user's critical section. This will also be used implement kernfs_remove_self(). Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/kernfs.h | 7 ++- 2 files changed, 123 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 37dd6408f5f..1aeb57969bf 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -396,6 +396,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); + kn->deact_depth = 1; RB_CLEAR_NODE(&kn->rb); kn->name = name; @@ -461,6 +462,7 @@ int kernfs_add_one(struct kernfs_node *kn, struct kernfs_node *parent) /* Mark the entry added into directory tree */ atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); + kn->deact_depth--; ret = 0; out_unlock: mutex_unlock(&kernfs_mutex); @@ -561,6 +563,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) } atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); + kn->deact_depth--; kn->priv = priv; kn->dir.root = root; @@ -773,7 +776,8 @@ static void __kernfs_deactivate(struct kernfs_node *kn) /* prevent any new usage under @kn by deactivating all nodes */ pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (atomic_read(&pos->active) >= 0) { + if (!pos->deact_depth++) { + WARN_ON_ONCE(atomic_read(&pos->active) < 0); atomic_add(KN_DEACTIVATED_BIAS, &pos->active); pos->flags |= KERNFS_JUST_DEACTIVATED; } @@ -797,6 +801,118 @@ static void __kernfs_deactivate(struct kernfs_node *kn) } } +static void __kernfs_reactivate(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + lockdep_assert_held(&kernfs_mutex); + + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) { + if (!--pos->deact_depth) { + 
WARN_ON_ONCE(atomic_read(&pos->active) >= 0); + atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); + } + WARN_ON_ONCE(pos->deact_depth < 0); + } + + /* some nodes reactivated, kick get_active waiters */ + wake_up_all(&kernfs_root(kn)->deactivate_waitq); +} + +static void __kernfs_deactivate_self(struct kernfs_node *kn) +{ + /* + * Take out ourself out of the active ref dependency chain and + * deactivate. If we're called without an active ref, lockdep will + * complain. + */ + kernfs_put_active(kn); + __kernfs_deactivate(kn); +} + +static void __kernfs_reactivate_self(struct kernfs_node *kn) +{ + __kernfs_reactivate(kn); + /* + * Restore active ref dropped by deactivate_self() so that it's + * balanced on return. put_active() will soon be called on @kn, so + * this can't break anything regardless of @kn's state. + */ + atomic_inc(&kn->active); + if (kernfs_lockdep(kn)) + rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); +} + +/** + * kernfs_deactivate - deactivate subtree of a node + * @kn: kernfs_node to deactivate subtree of + * + * Deactivate the subtree of @kn. On return, there's no active operation + * going on under @kn and creation or renaming of a node under @kn is + * blocked until @kn is reactivated or removed. This function can be + * called multiple times and nests properly. Each invocation should be + * paired with kernfs_reactivate(). + * + * For a kernfs user which uses simple locking, the subsystem lock would + * nest inside active reference. This becomes problematic if the user + * tries to remove nodes while holding the subystem lock as it would create + * a reverse locking dependency from the subsystem lock to active ref. + * This function can be used to break such reverse dependency. The user + * can call this function outside the subsystem lock and then proceed to + * invoke kernfs_remove() while holding the subsystem lock without + * introducing such reverse dependency. + */ +void kernfs_deactivate(struct kernfs_node *kn) +{ + mutex_lock(&kernfs_mutex); + __kernfs_deactivate(kn); + mutex_unlock(&kernfs_mutex); +} + +/** + * kernfs_reactivate - reactivate subtree of a node + * @kn: kernfs_node to reactivate subtree of + * + * Undo kernfs_deactivate(). + */ +void kernfs_reactivate(struct kernfs_node *kn) +{ + mutex_lock(&kernfs_mutex); + __kernfs_reactivate(kn); + mutex_unlock(&kernfs_mutex); +} + +/** + * kernfs_deactivate_self - deactivate subtree of a node from its own method + * @kn: the self kernfs_node to deactivate subtree of + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. Once this function + * is called, @kn may be removed by someone else while the enclosing method + * is in progress. Other than that, this function is equivalent to + * kernfs_deactivate() and should be paired with kernfs_reactivate_self(). + */ +void kernfs_deactivate_self(struct kernfs_node *kn) +{ + mutex_lock(&kernfs_mutex); + __kernfs_deactivate_self(kn); + mutex_unlock(&kernfs_mutex); +} + +/** + * kernfs_reactivate_self - reactivate subtree of a node from its own method + * @kn: the self kernfs_node to reactivate subtree of + * + * Undo kernfs_deactivate_self(). 
+ */ +void kernfs_reactivate_self(struct kernfs_node *kn) +{ + mutex_lock(&kernfs_mutex); + __kernfs_reactivate_self(kn); + mutex_unlock(&kernfs_mutex); +} + static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 9b5a4bb88c6..ac869302705 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -80,6 +80,8 @@ struct kernfs_elem_attr { struct kernfs_node { atomic_t count; atomic_t active; + int deact_depth; + unsigned int hash; /* ns + name hash */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -90,7 +92,6 @@ struct kernfs_node { struct rb_node rb; const void *ns; /* namespace tag */ - unsigned int hash; /* ns + name hash */ union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; @@ -233,6 +234,10 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); +void kernfs_deactivate(struct kernfs_node *kn); +void kernfs_reactivate(struct kernfs_node *kn); +void kernfs_deactivate_self(struct kernfs_node *kn); +void kernfs_reactivate_self(struct kernfs_node *kn); void kernfs_remove(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); -- cgit v1.2.3-70-g09d2 From 1ae06819c77cff1ea2833c94f8c093fe8a5c79db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:27 -0500 Subject: kernfs, sysfs, driver-core: implement kernfs_remove_self() and its wrappers Sometimes it's necessary to implement a node which wants to delete nodes including itself. This isn't straightforward because of kernfs active reference. While a file operation is in progress, an active reference is held and kernfs_remove() waits for all such references to drain before completing. For a self-deleting node, this is a deadlock as kernfs_remove() ends up waiting for an active reference that itself is sitting on top of. This currently is worked around in the sysfs layer using sysfs_schedule_callback() which makes such removals asynchronous. While it works, it's rather cumbersome and inherently breaks synchronicity of the operation - the file operation which triggered the operation may complete before the removal is finished (or even started) and the removal may fail asynchronously. If a removal operation is immmediately followed by another operation which expects the specific name to be available (e.g. removal followed by rename onto the same name), there's no way to make the latter operation reliable. The thing is there's no inherent reason for this to be asynchrnous. All that's necessary to do this synchronous is a dedicated operation which drops its own active ref and deactivates self. This patch implements kernfs_remove_self() and its wrappers in sysfs and driver core. kernfs_remove_self() is to be called from one of the file operations, drops the active ref and deactivates using __kernfs_deactivate_self(), removes the self node, and restores active ref to the dead node using __kernfs_reactivate_self() so that the ref is balanced afterwards. __kernfs_remove() is updated so that it takes an early exit if the target node is already fully removed so that the active ref restored by kernfs_remove_self() after removal doesn't confuse the deactivation path. This makes implementing self-deleting nodes very easy. 
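The arbitration between concurrent self-removers can be pictured with a small pthread sketch (editorial illustration; the names are invented, and the real implementation additionally drops kernfs_mutex while the winner performs the removal):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t lock    = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  done_wq = PTHREAD_COND_INITIALIZER;
	static bool claimed, finished;		/* play the roles of SUICIDAL / SUICIDED */

	static bool remove_self(void (*do_remove)(void))
	{
		bool won;

		pthread_mutex_lock(&lock);
		won = !claimed;
		claimed = true;			/* the first caller claims the removal */
		if (won) {
			do_remove();		/* simplified: done under the lock here */
			finished = true;
			pthread_cond_broadcast(&done_wq);
		} else {
			while (!finished)	/* losers wait for the winner to finish */
				pthread_cond_wait(&done_wq, &lock);
		}
		pthread_mutex_unlock(&lock);
		return won;			/* only the winner proceeds with teardown */
	}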
The normal removal path doesn't even need to be changed to use kernfs_remove_self() for the self-deleting node. The method can invoke kernfs_remove_self() on itself before proceeding the normal removal path. kernfs_remove() invoked on the node by the normal deletion path will simply be ignored. This will replace sysfs_schedule_callback(). A subtle feature of sysfs_schedule_callback() is that it collapses multiple invocations - even if multiple removals are triggered, the removal callback is run only once. An equivalent effect can be achieved by testing the return value of kernfs_remove_self() - only the one which gets %true return value should proceed with actual deletion. All other instances of kernfs_remove_self() will wait till the enclosing kernfs operation which invoked the winning instance of kernfs_remove_self() finishes and then return %false. This trivially makes all users of kernfs_remove_self() automatically show correct synchronous behavior even when there are multiple concurrent operations - all "echo 1 > delete" instances will finish only after the whole operation is completed by one of the instances. v2: For !CONFIG_SYSFS, dummy version kernfs_remove_self() was missing and sysfs_remove_file_self() had incorrect return type. Fix it. Reported by kbuild test bot. v3: Updated to use __kernfs_{de|re}activate_self(). Signed-off-by: Tejun Heo Cc: Alan Stern Cc: kbuild test robot Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 17 ++++++++++++ fs/kernfs/dir.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/file.c | 23 ++++++++++++++++ include/linux/device.h | 2 ++ include/linux/kernfs.h | 6 +++++ include/linux/sysfs.h | 7 +++++ 6 files changed, 127 insertions(+) (limited to 'fs') diff --git a/drivers/base/core.c b/drivers/base/core.c index 2b567177ef7..9db57afcf81 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -570,6 +570,23 @@ void device_remove_file(struct device *dev, } EXPORT_SYMBOL_GPL(device_remove_file); +/** + * device_remove_file_self - remove sysfs attribute file from its own method. + * @dev: device. + * @attr: device attribute descriptor. + * + * See kernfs_remove_self() for details. + */ +bool device_remove_file_self(struct device *dev, + const struct device_attribute *attr) +{ + if (dev) + return sysfs_remove_file_self(&dev->kobj, &attr->attr); + else + return false; +} +EXPORT_SYMBOL_GPL(device_remove_file_self); + /** * device_create_bin_file - create sysfs binary attribute file for device. * @dev: device. diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1aeb57969bf..a8028be6cdb 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -985,6 +985,78 @@ void kernfs_remove(struct kernfs_node *kn) mutex_unlock(&kernfs_mutex); } +/** + * kernfs_remove_self - remove a kernfs_node from its own method + * @kn: the self kernfs_node to remove + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. This can be used to + * implement a file operation which deletes itself. + * + * For example, the "delete" file for a sysfs device directory can be + * implemented by invoking kernfs_remove_self() on the "delete" file + * itself. This function breaks the circular dependency of trying to + * deactivate self while holding an active ref itself. It isn't necessary + * to modify the usual removal path to use kernfs_remove_self(). The + * "delete" implementation can simply invoke kernfs_remove_self() on self + * before proceeding with the usual removal path. 
kernfs will ignore later + * kernfs_remove() on self. + * + * kernfs_remove_self() can be called multiple times concurrently on the + * same kernfs_node. Only the first one actually performs removal and + * returns %true. All others will wait until the kernfs operation which + * won self-removal finishes and return %false. Note that the losers wait + * for the completion of not only the winning kernfs_remove_self() but also + * the whole kernfs_ops which won the arbitration. This can be used to + * guarantee, for example, all concurrent writes to a "delete" file to + * finish only after the whole operation is complete. + */ +bool kernfs_remove_self(struct kernfs_node *kn) +{ + bool ret; + + mutex_lock(&kernfs_mutex); + __kernfs_deactivate_self(kn); + + /* + * SUICIDAL is used to arbitrate among competing invocations. Only + * the first one will actually perform removal. When the removal + * is complete, SUICIDED is set and the active ref is restored + * while holding kernfs_mutex. The ones which lost arbitration + * waits for SUICDED && drained which can happen only after the + * enclosing kernfs operation which executed the winning instance + * of kernfs_remove_self() finished. + */ + if (!(kn->flags & KERNFS_SUICIDAL)) { + kn->flags |= KERNFS_SUICIDAL; + __kernfs_remove(kn); + kn->flags |= KERNFS_SUICIDED; + ret = true; + } else { + wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; + DEFINE_WAIT(wait); + + while (true) { + prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); + + if ((kn->flags & KERNFS_SUICIDED) && + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) + break; + + mutex_unlock(&kernfs_mutex); + schedule(); + mutex_lock(&kernfs_mutex); + } + finish_wait(waitq, &wait); + WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); + ret = false; + } + + __kernfs_reactivate_self(kn); + mutex_unlock(&kernfs_mutex); + return ret; +} + /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 810cf6e613e..1b8b91b67fd 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -372,6 +372,29 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); +/** + * sysfs_remove_file_self - remove an object attribute from its own method + * @kobj: object we're acting for + * @attr: attribute descriptor + * + * See kernfs_remove_self() for details. 
+ */ +bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) +{ + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; + bool ret; + + kn = kernfs_find_and_get(parent, attr->name); + if (WARN_ON_ONCE(!kn)) + return false; + + ret = kernfs_remove_self(kn); + + kernfs_put(kn); + return ret; +} + void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) { int i; diff --git a/include/linux/device.h b/include/linux/device.h index 952b01033c3..1ff3f169751 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -560,6 +560,8 @@ extern int device_create_file(struct device *device, const struct device_attribute *entry); extern void device_remove_file(struct device *dev, const struct device_attribute *attr); +extern bool device_remove_file_self(struct device *dev, + const struct device_attribute *attr); extern int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); extern void device_remove_bin_file(struct device *dev, diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index ac869302705..0b7b7cc352e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -43,6 +43,8 @@ enum kernfs_node_flag { KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_STATIC_NAME = 0x0200, + KERNFS_SUICIDAL = 0x0400, + KERNFS_SUICIDED = 0x0800, }; /* type-specific structures for kernfs_node union members */ @@ -239,6 +241,7 @@ void kernfs_reactivate(struct kernfs_node *kn); void kernfs_deactivate_self(struct kernfs_node *kn); void kernfs_reactivate_self(struct kernfs_node *kn); void kernfs_remove(struct kernfs_node *kn); +bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, @@ -296,6 +299,9 @@ kernfs_create_link(struct kernfs_node *parent, const char *name, static inline void kernfs_remove(struct kernfs_node *kn) { } +static inline bool kernfs_remove_self(struct kernfs_node *kn) +{ return false; } + static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 30b2ebee643..bd96c603ab6 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -198,6 +198,7 @@ int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); +bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, @@ -301,6 +302,12 @@ static inline void sysfs_remove_file_ns(struct kobject *kobj, { } +static inline bool sysfs_remove_file_self(struct kobject *kobj, + const struct attribute *attr) +{ + return false; +} + static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr) { -- cgit v1.2.3-70-g09d2 From d1ba277e79889085a2faec3b68b91ce89c63f888 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:31 -0500 Subject: sysfs, driver-core: remove unused {sysfs|device}_schedule_callback_owner() All device_schedule_callback_owner() users are converted to use device_remove_file_self(). Remove now unused {sysfs|device}_schedule_callback_owner(). 
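As a usage sketch of the new synchronous interface (not taken from this patch; the attribute name and the teardown helper are hypothetical), a driver's "delete" attribute can now remove itself directly from its own store method instead of scheduling a callback:

	#include <linux/device.h>

	static ssize_t delete_store(struct device *dev, struct device_attribute *attr,
				    const char *buf, size_t count)
	{
		/*
		 * Only the instance that wins the self-removal arbitration
		 * performs the rest of the teardown; concurrent writers simply
		 * return once the removal has completed.
		 */
		if (device_remove_file_self(dev, attr))
			my_device_teardown(dev);	/* hypothetical driver helper */
		return count;
	}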
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 33 ------------------ fs/sysfs/file.c | 92 -------------------------------------------------- include/linux/device.h | 11 +----- include/linux/sysfs.h | 9 ----- 4 files changed, 1 insertion(+), 144 deletions(-) (limited to 'fs') diff --git a/drivers/base/core.c b/drivers/base/core.c index 9db57afcf81..4195364f9fd 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -615,39 +615,6 @@ void device_remove_bin_file(struct device *dev, } EXPORT_SYMBOL_GPL(device_remove_bin_file); -/** - * device_schedule_callback_owner - helper to schedule a callback for a device - * @dev: device. - * @func: callback function to invoke later. - * @owner: module owning the callback routine - * - * Attribute methods must not unregister themselves or their parent device - * (which would amount to the same thing). Attempts to do so will deadlock, - * since unregistration is mutually exclusive with driver callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @dev as its - * argument in the workqueue's process context. @dev will be pinned until - * @func returns. - * - * This routine is usually called via the inline device_schedule_callback(), - * which automatically sets @owner to THIS_MODULE. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available. - * - * NOTE: This routine won't work if CONFIG_SYSFS isn't set! It uses an - * underlying sysfs routine (since it is intended for use by attribute - * methods), and if sysfs isn't available you'll get nothing but -ENOSYS. - */ -int device_schedule_callback_owner(struct device *dev, - void (*func)(struct device *), struct module *owner) -{ - return sysfs_schedule_callback(&dev->kobj, - (void (*)(void *)) func, dev, owner); -} -EXPORT_SYMBOL_GPL(device_schedule_callback_owner); - static void klist_children_get(struct klist_node *n) { struct device_private *p = to_device_private_parent(n); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b8b91b67fd..28cc1acd543 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -453,95 +453,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). 
Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. - */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/include/linux/device.h b/include/linux/device.h index 1ff3f169751..fb1ba13f766 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -566,12 +566,6 @@ extern int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); extern void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); -extern int device_schedule_callback_owner(struct device *dev, - void (*func)(struct device *dev), struct module *owner); - -/* This is a macro to avoid include problems with THIS_MODULE */ -#define device_schedule_callback(dev, func) \ - device_schedule_callback_owner(dev, func, THIS_MODULE) /* device resource management */ typedef void (*dr_release_t)(struct device *dev, void *res); @@ -931,10 +925,7 @@ extern int device_online(struct device *dev); extern struct device *__root_device_register(const char *name, struct module *owner); -/* - * This is a macro to avoid include problems with THIS_MODULE, - * just as per what is done for device_schedule_callback() above. 
- */ +/* This is a macro to avoid include problems with THIS_MODULE */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index bd96c603ab6..14df05415af 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -178,9 +178,6 @@ struct sysfs_ops { #ifdef CONFIG_SYSFS -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner); - int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, @@ -249,12 +246,6 @@ int __must_check sysfs_init(void); #else /* CONFIG_SYSFS */ -static inline int sysfs_schedule_callback(struct kobject *kobj, - void (*func)(void *), void *data, struct module *owner) -{ - return -ENOSYS; -} - static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; -- cgit v1.2.3-70-g09d2 From d7092ae2973f20a39fee786c47e5edf18ced088f Mon Sep 17 00:00:00 2001 From: jon ernst Date: Sat, 11 Jan 2014 13:26:56 -0500 Subject: ext4: delete "set but not used" variables Signed-off-by: Jon Ernst Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu --- fs/ext4/inline.c | 2 -- fs/ext4/ioctl.c | 6 +----- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index ed29e720e88..82edf5b9335 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1841,7 +1841,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle, { int error; struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -1850,7 +1849,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle, return error; raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); if (EXT4_XATTR_LEN(entry->e_name_len) + diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 60589b60e9b..6bea80614d7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -101,9 +101,8 @@ static long swap_inode_boot_loader(struct super_block *sb, handle_t *handle; int err; struct inode *inode_bl; - struct ext4_inode_info *ei; struct ext4_inode_info *ei_bl; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(sb); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { err = -EINVAL; @@ -115,9 +114,6 @@ static long swap_inode_boot_loader(struct super_block *sb, goto swap_boot_out; } - sbi = EXT4_SB(sb); - ei = EXT4_I(inode); - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); if (IS_ERR(inode_bl)) { err = PTR_ERR(inode_bl); -- cgit v1.2.3-70-g09d2 From 88533f990c616cf50c2fe585ea03f75c806a293d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 11 Jan 2014 18:23:23 -0500 Subject: kernfs: remove unnecessary NULL check in __kernfs_remove() 895a068a524e ("kernfs: make kernfs_get_active() block if the node is deactivated but not removed") added "struct kernfs_root *root = kernfs_root(kn);" at the head of the function; however, the parameter @kn is checked for later implying that the function may be called with NULL. This means that we may end up invoking kernfs_root() with NULL which will oops. None of the existing users invokes removal with NULL @kn, so this bug doesn't actually trigger. We can relocate kernfs_root() invocation after NULL check; however, allowing NULL param tends to cause more confusion than actually helping anything. 
As there's no existing user, let's remove the spurious NULL check. This bug was detected by smatch. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a8028be6cdb..4076e8a7c26 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -920,9 +920,6 @@ static void __kernfs_remove(struct kernfs_node *kn) lockdep_assert_held(&kernfs_mutex); - if (!kn) - return; - pr_debug("kernfs %s: removing\n", kn->name); __kernfs_deactivate(kn); -- cgit v1.2.3-70-g09d2 From d8c951c313ed1d7144b55c0d56f7c53220044dda Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Jan 2014 12:08:11 -0500 Subject: NFSv4.1: Don't trust attributes if a pNFS LAYOUTCOMMIT is outstanding If a LAYOUTCOMMIT is outstanding, then chances are that the metadata server may still be returning incorrect values for the change attribute, ctime, mtime and/or size. Just ignore those attributes for now, and wait for the LAYOUTCOMMIT rpc call to finish. Reported-by: shaobingqing Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 19 +++++++++++++++++-- fs/nfs/nfs4proc.c | 5 ++--- fs/nfs/pnfs.h | 16 ++++++++++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5feec233895..c63e1522446 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1283,12 +1283,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } +/* + * Don't trust the change_attribute, mtime, ctime or size if + * a pnfs LAYOUTCOMMIT is outstanding + */ +static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, + struct nfs_fattr *fattr) +{ + if (pnfs_layoutcommit_outstanding(inode)) + fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | + NFS_ATTR_FATTR_MTIME | + NFS_ATTR_FATTR_CTIME | + NFS_ATTR_FATTR_SIZE); +} + static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); + nfs_inode_attrs_handle_layoutcommit(inode, fattr); + if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else @@ -1518,8 +1534,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? 
*/ - if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || - new_isize > cur_isize) { + if ((nfsi->npages == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 15052b81df4..f4908eb40a2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7780,10 +7780,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; - break; case 0: - nfs_post_op_update_inode_force_wcc(data->args.inode, - data->res.fattr); break; default: if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { @@ -7798,6 +7795,8 @@ static void nfs4_layoutcommit_release(void *calldata) struct nfs4_layoutcommit_data *data = calldata; pnfs_cleanup_layoutcommit(data); + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a4f41810a7f..02379390977 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -359,6 +359,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) PNFS_LAYOUTRET_ON_SETATTR; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 || + test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0; +} + static inline int pnfs_return_layout(struct inode *ino) { struct nfs_inode *nfsi = NFS_I(ino); @@ -515,6 +524,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; -- cgit v1.2.3-70-g09d2 From 71244d9bdf185e5bba1473254241f9f65d4dd0d8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Jan 2014 13:34:36 -0500 Subject: NFSv4.1: Fix a race in nfs4_write_inode nfs4_write_inode() must not be allowed to exit until the layoutcommit is done. That means that both NFS_INO_LAYOUTCOMMIT and NFS_INO_LAYOUTCOMMITTING have to be cleared. 
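For readers unfamiliar with the bit-lock idiom this fix leans on, a minimal conceptual sketch follows (the helper names here are invented for illustration; the actual implementation is in the diff below). One task wins the NFS_INO_LAYOUTCOMMITTING bit and performs the commit; everyone else either backs off (async writeback) or sleeps until the winner clears the bit and issues a wakeup:

static int layoutcommit_gate(struct nfs_inode *nfsi, bool sync)
{
	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
		if (!sync)
			return -EAGAIN;	/* someone else owns the commit */
		/* sync caller: sleep until the owner clears the bit, then take it */
		return wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
					nfs_wait_bit_killable, TASK_KILLABLE);
	}
	return 0;	/* we now own the commit */
}

static void layoutcommit_ungate(struct nfs_inode *nfsi)
{
	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
	smp_mb__after_clear_bit();
	wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
}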
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4super.c | 14 +++--------- fs/nfs/pnfs.c | 67 ++++++++++++++++++++++++++++-------------------------- 2 files changed, 38 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 65ab0a0ca1c..808f2957441 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret = nfs_write_inode(inode, wbc); - if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { - int status; - bool sync = true; - - if (wbc->sync_mode == WB_SYNC_NONE) - sync = false; - - status = pnfs_layoutcommit_inode(inode, sync); - if (status < 0) - return status; - } + if (ret == 0) + ret = pnfs_layoutcommit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL); return ret; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d75d938d36c..4755858e37a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1790,6 +1790,15 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) } EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ + unsigned long *bitlock = &NFS_I(inode)->flags; + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + /* * There can be multiple RW segments. */ @@ -1807,7 +1816,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) { struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(inode)->flags; /* Matched by references in pnfs_set_layoutcommit */ list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { @@ -1815,9 +1823,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis pnfs_put_lseg(lseg); } - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + pnfs_clear_layoutcommitting(inode); } void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) @@ -1881,43 +1887,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; - int status = 0; + int status; - dprintk("--> %s inode %lu\n", __func__, inode->i_ino); - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + if (!pnfs_layoutcommit_outstanding(inode)) return 0; - /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) { - status = -ENOMEM; - goto out; - } - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) - goto out_free; + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + status = -EAGAIN; if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { - if (!sync) { - status = -EAGAIN; - goto out_free; - } - status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, - nfs_wait_bit_killable, TASK_KILLABLE); + if (!sync) + goto out; + status = wait_on_bit_lock(&nfsi->flags, + NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, + TASK_KILLABLE); if (status) - goto out_free; + goto out; } - INIT_LIST_HEAD(&data->lseg_list); + status = -ENOMEM; + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + goto clear_layoutcommitting; + + status = 0; spin_lock(&inode->i_lock); - if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - 
clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); - spin_unlock(&inode->i_lock); - wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); - goto out_free; - } + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_unlock; + INIT_LIST_HEAD(&data->lseg_list); pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; @@ -1940,8 +1940,11 @@ out: mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; -out_free: +out_unlock: + spin_unlock(&inode->i_lock); kfree(data); +clear_layoutcommitting: + pnfs_clear_layoutcommitting(inode); goto out; } -- cgit v1.2.3-70-g09d2 From ce9b499c9f58d7f3f680413f3ab5407f4e647ba2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 13:50:31 -0800 Subject: Revert "kernfs: remove unnecessary NULL check in __kernfs_remove()" This reverts commit 88533f990c616cf50c2fe585ea03f75c806a293d. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 4076e8a7c26..a8028be6cdb 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -920,6 +920,9 @@ static void __kernfs_remove(struct kernfs_node *kn) lockdep_assert_held(&kernfs_mutex); + if (!kn) + return; + pr_debug("kernfs %s: removing\n", kn->name); __kernfs_deactivate(kn); -- cgit v1.2.3-70-g09d2 From a30f82b7ebc87cdec3ef48303278f02970086118 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 13:51:36 -0800 Subject: Revert "sysfs, driver-core: remove unused {sysfs|device}_schedule_callback_owner()" This reverts commit d1ba277e79889085a2faec3b68b91ce89c63f888. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 33 ++++++++++++++++++ fs/sysfs/file.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/device.h | 11 +++++- include/linux/sysfs.h | 9 +++++ 4 files changed, 144 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/drivers/base/core.c b/drivers/base/core.c index 4195364f9fd..9db57afcf81 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -615,6 +615,39 @@ void device_remove_bin_file(struct device *dev, } EXPORT_SYMBOL_GPL(device_remove_bin_file); +/** + * device_schedule_callback_owner - helper to schedule a callback for a device + * @dev: device. + * @func: callback function to invoke later. + * @owner: module owning the callback routine + * + * Attribute methods must not unregister themselves or their parent device + * (which would amount to the same thing). Attempts to do so will deadlock, + * since unregistration is mutually exclusive with driver callbacks. 
+ * + * Instead methods can call this routine, which will attempt to allocate + * and schedule a workqueue request to call back @func with @dev as its + * argument in the workqueue's process context. @dev will be pinned until + * @func returns. + * + * This routine is usually called via the inline device_schedule_callback(), + * which automatically sets @owner to THIS_MODULE. + * + * Returns 0 if the request was submitted, -ENOMEM if storage could not + * be allocated, -ENODEV if a reference to @owner isn't available. + * + * NOTE: This routine won't work if CONFIG_SYSFS isn't set! It uses an + * underlying sysfs routine (since it is intended for use by attribute + * methods), and if sysfs isn't available you'll get nothing but -ENOSYS. + */ +int device_schedule_callback_owner(struct device *dev, + void (*func)(struct device *), struct module *owner) +{ + return sysfs_schedule_callback(&dev->kobj, + (void (*)(void *)) func, dev, owner); +} +EXPORT_SYMBOL_GPL(device_schedule_callback_owner); + static void klist_children_get(struct klist_node *n) { struct device_private *p = to_device_private_parent(n); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 28cc1acd543..1b8b91b67fd 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -453,3 +453,95 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); + +struct sysfs_schedule_callback_struct { + struct list_head workq_list; + struct kobject *kobj; + void (*func)(void *); + void *data; + struct module *owner; + struct work_struct work; +}; + +static struct workqueue_struct *sysfs_workqueue; +static DEFINE_MUTEX(sysfs_workq_mutex); +static LIST_HEAD(sysfs_workq); +static void sysfs_schedule_callback_work(struct work_struct *work) +{ + struct sysfs_schedule_callback_struct *ss = container_of(work, + struct sysfs_schedule_callback_struct, work); + + (ss->func)(ss->data); + kobject_put(ss->kobj); + module_put(ss->owner); + mutex_lock(&sysfs_workq_mutex); + list_del(&ss->workq_list); + mutex_unlock(&sysfs_workq_mutex); + kfree(ss); +} + +/** + * sysfs_schedule_callback - helper to schedule a callback for a kobject + * @kobj: object we're acting for. + * @func: callback function to invoke later. + * @data: argument to pass to @func. + * @owner: module owning the callback code + * + * sysfs attribute methods must not unregister themselves or their parent + * kobject (which would amount to the same thing). Attempts to do so will + * deadlock, since unregistration is mutually exclusive with driver + * callbacks. + * + * Instead methods can call this routine, which will attempt to allocate + * and schedule a workqueue request to call back @func with @data as its + * argument in the workqueue's process context. @kobj will be pinned + * until @func returns. + * + * Returns 0 if the request was submitted, -ENOMEM if storage could not + * be allocated, -ENODEV if a reference to @owner isn't available, + * -EAGAIN if a callback has already been scheduled for @kobj. 
+ */ +int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), + void *data, struct module *owner) +{ + struct sysfs_schedule_callback_struct *ss, *tmp; + + if (!try_module_get(owner)) + return -ENODEV; + + mutex_lock(&sysfs_workq_mutex); + list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) + if (ss->kobj == kobj) { + module_put(owner); + mutex_unlock(&sysfs_workq_mutex); + return -EAGAIN; + } + mutex_unlock(&sysfs_workq_mutex); + + if (sysfs_workqueue == NULL) { + sysfs_workqueue = create_singlethread_workqueue("sysfsd"); + if (sysfs_workqueue == NULL) { + module_put(owner); + return -ENOMEM; + } + } + + ss = kmalloc(sizeof(*ss), GFP_KERNEL); + if (!ss) { + module_put(owner); + return -ENOMEM; + } + kobject_get(kobj); + ss->kobj = kobj; + ss->func = func; + ss->data = data; + ss->owner = owner; + INIT_WORK(&ss->work, sysfs_schedule_callback_work); + INIT_LIST_HEAD(&ss->workq_list); + mutex_lock(&sysfs_workq_mutex); + list_add_tail(&ss->workq_list, &sysfs_workq); + mutex_unlock(&sysfs_workq_mutex); + queue_work(sysfs_workqueue, &ss->work); + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/include/linux/device.h b/include/linux/device.h index fb1ba13f766..1ff3f169751 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -566,6 +566,12 @@ extern int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); extern void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); +extern int device_schedule_callback_owner(struct device *dev, + void (*func)(struct device *dev), struct module *owner); + +/* This is a macro to avoid include problems with THIS_MODULE */ +#define device_schedule_callback(dev, func) \ + device_schedule_callback_owner(dev, func, THIS_MODULE) /* device resource management */ typedef void (*dr_release_t)(struct device *dev, void *res); @@ -925,7 +931,10 @@ extern int device_online(struct device *dev); extern struct device *__root_device_register(const char *name, struct module *owner); -/* This is a macro to avoid include problems with THIS_MODULE */ +/* + * This is a macro to avoid include problems with THIS_MODULE, + * just as per what is done for device_schedule_callback() above. + */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 14df05415af..bd96c603ab6 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -178,6 +178,9 @@ struct sysfs_ops { #ifdef CONFIG_SYSFS +int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), + void *data, struct module *owner); + int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, @@ -246,6 +249,12 @@ int __must_check sysfs_init(void); #else /* CONFIG_SYSFS */ +static inline int sysfs_schedule_callback(struct kobject *kobj, + void (*func)(void *), void *data, struct module *owner) +{ + return -ENOSYS; +} + static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; -- cgit v1.2.3-70-g09d2 From a9f138b0e537de55933335d580ebd38c2bc53c47 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:05:13 -0800 Subject: Revert "kernfs, sysfs, driver-core: implement kernfs_remove_self() and its wrappers" This reverts commit 1ae06819c77cff1ea2833c94f8c093fe8a5c79db. 
Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Cc: Alan Stern Cc: kbuild test robot Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 17 ------------ fs/kernfs/dir.c | 72 -------------------------------------------------- fs/sysfs/file.c | 23 ---------------- include/linux/device.h | 2 -- include/linux/kernfs.h | 6 ----- include/linux/sysfs.h | 7 ----- 6 files changed, 127 deletions(-) (limited to 'fs') diff --git a/drivers/base/core.c b/drivers/base/core.c index 9db57afcf81..2b567177ef7 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -570,23 +570,6 @@ void device_remove_file(struct device *dev, } EXPORT_SYMBOL_GPL(device_remove_file); -/** - * device_remove_file_self - remove sysfs attribute file from its own method. - * @dev: device. - * @attr: device attribute descriptor. - * - * See kernfs_remove_self() for details. - */ -bool device_remove_file_self(struct device *dev, - const struct device_attribute *attr) -{ - if (dev) - return sysfs_remove_file_self(&dev->kobj, &attr->attr); - else - return false; -} -EXPORT_SYMBOL_GPL(device_remove_file_self); - /** * device_create_bin_file - create sysfs binary attribute file for device. * @dev: device. diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a8028be6cdb..1aeb57969bf 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -985,78 +985,6 @@ void kernfs_remove(struct kernfs_node *kn) mutex_unlock(&kernfs_mutex); } -/** - * kernfs_remove_self - remove a kernfs_node from its own method - * @kn: the self kernfs_node to remove - * - * The caller must be running off of a kernfs operation which is invoked - * with an active reference - e.g. one of kernfs_ops. This can be used to - * implement a file operation which deletes itself. - * - * For example, the "delete" file for a sysfs device directory can be - * implemented by invoking kernfs_remove_self() on the "delete" file - * itself. This function breaks the circular dependency of trying to - * deactivate self while holding an active ref itself. It isn't necessary - * to modify the usual removal path to use kernfs_remove_self(). The - * "delete" implementation can simply invoke kernfs_remove_self() on self - * before proceeding with the usual removal path. kernfs will ignore later - * kernfs_remove() on self. - * - * kernfs_remove_self() can be called multiple times concurrently on the - * same kernfs_node. Only the first one actually performs removal and - * returns %true. All others will wait until the kernfs operation which - * won self-removal finishes and return %false. Note that the losers wait - * for the completion of not only the winning kernfs_remove_self() but also - * the whole kernfs_ops which won the arbitration. This can be used to - * guarantee, for example, all concurrent writes to a "delete" file to - * finish only after the whole operation is complete. - */ -bool kernfs_remove_self(struct kernfs_node *kn) -{ - bool ret; - - mutex_lock(&kernfs_mutex); - __kernfs_deactivate_self(kn); - - /* - * SUICIDAL is used to arbitrate among competing invocations. Only - * the first one will actually perform removal. When the removal - * is complete, SUICIDED is set and the active ref is restored - * while holding kernfs_mutex. 
The ones which lost arbitration - * waits for SUICDED && drained which can happen only after the - * enclosing kernfs operation which executed the winning instance - * of kernfs_remove_self() finished. - */ - if (!(kn->flags & KERNFS_SUICIDAL)) { - kn->flags |= KERNFS_SUICIDAL; - __kernfs_remove(kn); - kn->flags |= KERNFS_SUICIDED; - ret = true; - } else { - wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; - DEFINE_WAIT(wait); - - while (true) { - prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); - - if ((kn->flags & KERNFS_SUICIDED) && - atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) - break; - - mutex_unlock(&kernfs_mutex); - schedule(); - mutex_lock(&kernfs_mutex); - } - finish_wait(waitq, &wait); - WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); - ret = false; - } - - __kernfs_reactivate_self(kn); - mutex_unlock(&kernfs_mutex); - return ret; -} - /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b8b91b67fd..810cf6e613e 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -372,29 +372,6 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); -/** - * sysfs_remove_file_self - remove an object attribute from its own method - * @kobj: object we're acting for - * @attr: attribute descriptor - * - * See kernfs_remove_self() for details. - */ -bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) -{ - struct kernfs_node *parent = kobj->sd; - struct kernfs_node *kn; - bool ret; - - kn = kernfs_find_and_get(parent, attr->name); - if (WARN_ON_ONCE(!kn)) - return false; - - ret = kernfs_remove_self(kn); - - kernfs_put(kn); - return ret; -} - void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) { int i; diff --git a/include/linux/device.h b/include/linux/device.h index 1ff3f169751..952b01033c3 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -560,8 +560,6 @@ extern int device_create_file(struct device *device, const struct device_attribute *entry); extern void device_remove_file(struct device *dev, const struct device_attribute *attr); -extern bool device_remove_file_self(struct device *dev, - const struct device_attribute *attr); extern int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); extern void device_remove_bin_file(struct device *dev, diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 0b7b7cc352e..ac869302705 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -43,8 +43,6 @@ enum kernfs_node_flag { KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_STATIC_NAME = 0x0200, - KERNFS_SUICIDAL = 0x0400, - KERNFS_SUICIDED = 0x0800, }; /* type-specific structures for kernfs_node union members */ @@ -241,7 +239,6 @@ void kernfs_reactivate(struct kernfs_node *kn); void kernfs_deactivate_self(struct kernfs_node *kn); void kernfs_reactivate_self(struct kernfs_node *kn); void kernfs_remove(struct kernfs_node *kn); -bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, @@ -299,9 +296,6 @@ kernfs_create_link(struct kernfs_node *parent, const char *name, static inline void kernfs_remove(struct kernfs_node *kn) { } -static inline bool kernfs_remove_self(struct kernfs_node *kn) -{ return false; } - static inline 
int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index bd96c603ab6..30b2ebee643 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -198,7 +198,6 @@ int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); -bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, @@ -302,12 +301,6 @@ static inline void sysfs_remove_file_ns(struct kobject *kobj, { } -static inline bool sysfs_remove_file_self(struct kobject *kobj, - const struct attribute *attr) -{ - return false; -} - static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr) { -- cgit v1.2.3-70-g09d2 From 9b0925a6ff64a33be45497e3c798bfee8790b102 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:09:38 -0800 Subject: Revert "kernfs: implement kernfs_{de|re}activate[_self]()" This reverts commit 9f010c2ad5194a4b682e747984477850fabd03be. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 118 +------------------------------------------------ include/linux/kernfs.h | 7 +-- 2 files changed, 2 insertions(+), 123 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1aeb57969bf..37dd6408f5f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -396,7 +396,6 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); - kn->deact_depth = 1; RB_CLEAR_NODE(&kn->rb); kn->name = name; @@ -462,7 +461,6 @@ int kernfs_add_one(struct kernfs_node *kn, struct kernfs_node *parent) /* Mark the entry added into directory tree */ atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); - kn->deact_depth--; ret = 0; out_unlock: mutex_unlock(&kernfs_mutex); @@ -563,7 +561,6 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) } atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); - kn->deact_depth--; kn->priv = priv; kn->dir.root = root; @@ -776,8 +773,7 @@ static void __kernfs_deactivate(struct kernfs_node *kn) /* prevent any new usage under @kn by deactivating all nodes */ pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (!pos->deact_depth++) { - WARN_ON_ONCE(atomic_read(&pos->active) < 0); + if (atomic_read(&pos->active) >= 0) { atomic_add(KN_DEACTIVATED_BIAS, &pos->active); pos->flags |= KERNFS_JUST_DEACTIVATED; } @@ -801,118 +797,6 @@ static void __kernfs_deactivate(struct kernfs_node *kn) } } -static void __kernfs_reactivate(struct kernfs_node *kn) -{ - struct kernfs_node *pos; - - lockdep_assert_held(&kernfs_mutex); - - pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (!--pos->deact_depth) { - WARN_ON_ONCE(atomic_read(&pos->active) >= 0); - atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); - } - WARN_ON_ONCE(pos->deact_depth < 0); - } - 
- /* some nodes reactivated, kick get_active waiters */ - wake_up_all(&kernfs_root(kn)->deactivate_waitq); -} - -static void __kernfs_deactivate_self(struct kernfs_node *kn) -{ - /* - * Take out ourself out of the active ref dependency chain and - * deactivate. If we're called without an active ref, lockdep will - * complain. - */ - kernfs_put_active(kn); - __kernfs_deactivate(kn); -} - -static void __kernfs_reactivate_self(struct kernfs_node *kn) -{ - __kernfs_reactivate(kn); - /* - * Restore active ref dropped by deactivate_self() so that it's - * balanced on return. put_active() will soon be called on @kn, so - * this can't break anything regardless of @kn's state. - */ - atomic_inc(&kn->active); - if (kernfs_lockdep(kn)) - rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); -} - -/** - * kernfs_deactivate - deactivate subtree of a node - * @kn: kernfs_node to deactivate subtree of - * - * Deactivate the subtree of @kn. On return, there's no active operation - * going on under @kn and creation or renaming of a node under @kn is - * blocked until @kn is reactivated or removed. This function can be - * called multiple times and nests properly. Each invocation should be - * paired with kernfs_reactivate(). - * - * For a kernfs user which uses simple locking, the subsystem lock would - * nest inside active reference. This becomes problematic if the user - * tries to remove nodes while holding the subystem lock as it would create - * a reverse locking dependency from the subsystem lock to active ref. - * This function can be used to break such reverse dependency. The user - * can call this function outside the subsystem lock and then proceed to - * invoke kernfs_remove() while holding the subsystem lock without - * introducing such reverse dependency. - */ -void kernfs_deactivate(struct kernfs_node *kn) -{ - mutex_lock(&kernfs_mutex); - __kernfs_deactivate(kn); - mutex_unlock(&kernfs_mutex); -} - -/** - * kernfs_reactivate - reactivate subtree of a node - * @kn: kernfs_node to reactivate subtree of - * - * Undo kernfs_deactivate(). - */ -void kernfs_reactivate(struct kernfs_node *kn) -{ - mutex_lock(&kernfs_mutex); - __kernfs_reactivate(kn); - mutex_unlock(&kernfs_mutex); -} - -/** - * kernfs_deactivate_self - deactivate subtree of a node from its own method - * @kn: the self kernfs_node to deactivate subtree of - * - * The caller must be running off of a kernfs operation which is invoked - * with an active reference - e.g. one of kernfs_ops. Once this function - * is called, @kn may be removed by someone else while the enclosing method - * is in progress. Other than that, this function is equivalent to - * kernfs_deactivate() and should be paired with kernfs_reactivate_self(). - */ -void kernfs_deactivate_self(struct kernfs_node *kn) -{ - mutex_lock(&kernfs_mutex); - __kernfs_deactivate_self(kn); - mutex_unlock(&kernfs_mutex); -} - -/** - * kernfs_reactivate_self - reactivate subtree of a node from its own method - * @kn: the self kernfs_node to reactivate subtree of - * - * Undo kernfs_deactivate_self(). 
- */ -void kernfs_reactivate_self(struct kernfs_node *kn) -{ - mutex_lock(&kernfs_mutex); - __kernfs_reactivate_self(kn); - mutex_unlock(&kernfs_mutex); -} - static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index ac869302705..9b5a4bb88c6 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -80,8 +80,6 @@ struct kernfs_elem_attr { struct kernfs_node { atomic_t count; atomic_t active; - int deact_depth; - unsigned int hash; /* ns + name hash */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -92,6 +90,7 @@ struct kernfs_node { struct rb_node rb; const void *ns; /* namespace tag */ + unsigned int hash; /* ns + name hash */ union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; @@ -234,10 +233,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); -void kernfs_deactivate(struct kernfs_node *kn); -void kernfs_reactivate(struct kernfs_node *kn); -void kernfs_deactivate_self(struct kernfs_node *kn); -void kernfs_reactivate_self(struct kernfs_node *kn); void kernfs_remove(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); -- cgit v1.2.3-70-g09d2 From f4b3e631b39db31f7375cce0b5e4111d14cde511 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:13:39 -0800 Subject: Revert "kernfs: make kernfs_get_active() block if the node is deactivated but not removed" This reverts commit 895a068a524e134900b9d98b519309b7aae7bbb1. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 37dd6408f5f..770d687ee9f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -149,25 +149,12 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) if (unlikely(!kn)) return NULL; - if (kernfs_lockdep(kn)) - rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); - - /* - * Try to obtain an active ref. If @kn is deactivated, we block - * till either it's reactivated or killed. 
- */ - do { - if (atomic_inc_unless_negative(&kn->active)) - return kn; - - wait_event(kernfs_root(kn)->deactivate_waitq, - atomic_read(&kn->active) >= 0 || - RB_EMPTY_NODE(&kn->rb)); - } while (!RB_EMPTY_NODE(&kn->rb)); + if (!atomic_inc_unless_negative(&kn->active)) + return NULL; if (kernfs_lockdep(kn)) - rwsem_release(&kn->dep_map, 1, _RET_IP_); - return NULL; + rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); + return kn; } /** @@ -799,7 +786,6 @@ static void __kernfs_deactivate(struct kernfs_node *kn) static void __kernfs_remove(struct kernfs_node *kn) { - struct kernfs_root *root = kernfs_root(kn); struct kernfs_node *pos; lockdep_assert_held(&kernfs_mutex); @@ -851,9 +837,6 @@ static void __kernfs_remove(struct kernfs_node *kn) kernfs_put(pos); } while (pos != kn); - - /* some nodes killed, kick get_active waiters */ - wake_up_all(&root->deactivate_waitq); } /** -- cgit v1.2.3-70-g09d2 From 7653fe9d6cddc3fc5e4220608079006d8ac0054c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:20:56 -0800 Subject: Revert "kernfs: remove kernfs_addrm_cxt" This reverts commit 99177a34110889a8f2c36420c34e3bcc9bfd8a70. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 114 ++++++++++++++++++++++++++++++++++---------- fs/kernfs/file.c | 10 +++- fs/kernfs/kernfs-internal.h | 12 ++++- fs/kernfs/symlink.c | 10 +++- include/linux/kernfs.h | 4 ++ 5 files changed, 121 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 770d687ee9f..f878e4f2efe 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -398,8 +398,29 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, return NULL; } +/** + * kernfs_addrm_start - prepare for kernfs_node add/remove + * @acxt: pointer to kernfs_addrm_cxt to be used + * + * This function is called when the caller is about to add or remove + * kernfs_node. This function acquires kernfs_mutex. @acxt is used + * to keep and pass context to other addrm functions. + * + * LOCKING: + * Kernel thread context (may sleep). kernfs_mutex is locked on + * return. + */ +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) + __acquires(kernfs_mutex) +{ + memset(acxt, 0, sizeof(*acxt)); + + mutex_lock(&kernfs_mutex); +} + /** * kernfs_add_one - add kernfs_node to parent without warning + * @acxt: addrm context to use * @kn: kernfs_node to be added * @parent: the parent kernfs_node to add @kn to * @@ -407,29 +428,34 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, * parent inode if @kn is a directory and link into the children list * of the parent. * + * This function should be called between calls to + * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed + * the same @acxt as passed to kernfs_addrm_start(). + * + * LOCKING: + * Determined by kernfs_addrm_start(). + * * RETURNS: * 0 on success, -EEXIST if entry with the given name already * exists. 
*/ -int kernfs_add_one(struct kernfs_node *kn, struct kernfs_node *parent) +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, + struct kernfs_node *parent) { + bool has_ns = kernfs_ns_enabled(parent); struct kernfs_iattrs *ps_iattr; - bool has_ns; int ret; - if (!kernfs_get_active(parent)) - return -ENOENT; + WARN_ON_ONCE(atomic_read(&parent->active) < 0); - mutex_lock(&kernfs_mutex); - - ret = -EINVAL; - has_ns = kernfs_ns_enabled(parent); - if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", parent->name, kn->name)) - goto out_unlock; + if (has_ns != (bool)kn->ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name); + return -EINVAL; + } if (kernfs_type(parent) != KERNFS_DIR) - goto out_unlock; + return -EINVAL; kn->hash = kernfs_name_hash(kn->name, kn->ns); kn->parent = parent; @@ -437,7 +463,7 @@ int kernfs_add_one(struct kernfs_node *kn, struct kernfs_node *parent) ret = kernfs_link_sibling(kn); if (ret) - goto out_unlock; + return ret; /* Update timestamps on the parent */ ps_iattr = parent->iattr; @@ -448,11 +474,34 @@ int kernfs_add_one(struct kernfs_node *kn, struct kernfs_node *parent) /* Mark the entry added into directory tree */ atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); - ret = 0; -out_unlock: + return 0; +} + +/** + * kernfs_addrm_finish - finish up kernfs_node add/remove + * @acxt: addrm context to finish up + * + * Finish up kernfs_node add/remove. Resources acquired by + * kernfs_addrm_start() are released and removed kernfs_nodes are + * cleaned up. + * + * LOCKING: + * kernfs_mutex is released. + */ +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) + __releases(kernfs_mutex) +{ + /* release resources acquired by kernfs_addrm_start() */ mutex_unlock(&kernfs_mutex); - kernfs_put_active(parent); - return ret; + + /* kill removed kernfs_nodes */ + while (acxt->removed) { + struct kernfs_node *kn = acxt->removed; + + acxt->removed = kn->u.removed_list; + + kernfs_put(kn); + } } /** @@ -584,6 +633,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, void *priv, const void *ns) { + struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; int rc; @@ -598,7 +648,14 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, kn->priv = priv; /* link in */ - rc = kernfs_add_one(kn, parent); + rc = -ENOENT; + if (kernfs_get_active(parent)) { + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); + kernfs_put_active(parent); + } + if (!rc) return kn; @@ -784,7 +841,8 @@ static void __kernfs_deactivate(struct kernfs_node *kn) } } -static void __kernfs_remove(struct kernfs_node *kn) +static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) { struct kernfs_node *pos; @@ -832,7 +890,8 @@ static void __kernfs_remove(struct kernfs_node *kn) ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; } - kernfs_put(pos); + pos->u.removed_list = acxt->removed; + acxt->removed = pos; } kernfs_put(pos); @@ -847,9 +906,11 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - mutex_lock(&kernfs_mutex); - __kernfs_remove(kn); - mutex_unlock(&kernfs_mutex); + struct kernfs_addrm_cxt acxt; + + kernfs_addrm_start(&acxt); + __kernfs_remove(&acxt, kn); + kernfs_addrm_finish(&acxt); } /** @@ -864,6 +925,7 @@ void kernfs_remove(struct kernfs_node *kn) int 
kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { + struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; if (!parent) { @@ -872,13 +934,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - mutex_lock(&kernfs_mutex); + kernfs_addrm_start(&acxt); kn = kernfs_find_ns(parent, name, ns); if (kn) - __kernfs_remove(kn); + __kernfs_remove(&acxt, kn); - mutex_unlock(&kernfs_mutex); + kernfs_addrm_finish(&acxt); if (kn) return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ffe1bebf919..404ffd2f27b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -817,6 +817,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, bool name_is_static, struct lock_class_key *key) { + struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; unsigned flags; int rc; @@ -852,7 +853,14 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; - rc = kernfs_add_one(kn, parent); + rc = -ENOENT; + if (kernfs_get_active(parent)) { + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); + kernfs_put_active(parent); + } + if (rc) { kernfs_put(kn); return ERR_PTR(rc); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 4bc57848076..e9ec38c8607 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -45,6 +45,13 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) return kn->dir.root; } +/* + * Context structure to be used while adding/removing nodes. + */ +struct kernfs_addrm_cxt { + struct kernfs_node *removed; +}; + /* * mount.c */ @@ -94,7 +101,10 @@ extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); -int kernfs_add_one(struct kernfs_node *kn, struct kernfs_node *parent); +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, + struct kernfs_node *parent); +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, unsigned flags); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 3a939c263ed..b2c106ca343 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -27,6 +27,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, struct kernfs_node *target) { struct kernfs_node *kn; + struct kernfs_addrm_cxt acxt; int error; kn = kernfs_new_node(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, @@ -39,7 +40,14 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ - error = kernfs_add_one(kn, parent); + error = -ENOENT; + if (kernfs_get_active(parent)) { + kernfs_addrm_start(&acxt); + error = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); + kernfs_put_active(parent); + } + if (!error) return kn; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 9b5a4bb88c6..61bd7ae6b8e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -89,6 +89,10 @@ struct kernfs_node { struct rb_node rb; + union { + struct kernfs_node *removed_list; + } u; + const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ union { -- cgit v1.2.3-70-g09d2 From 55f6e30d0a6a8975cc0831e8a4a3715b815b6a2f Mon Sep 17 
00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:27:16 -0800 Subject: Revert "kernfs: invoke kernfs_unmap_bin_file() directly from __kernfs_remove()" This reverts commit f601f9a2bf7dc1f7ee18feece4c4e2fc6845d6c4. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 42 +++++++++--------------------------------- fs/kernfs/file.c | 5 ++++- fs/kernfs/kernfs-internal.h | 2 +- 3 files changed, 14 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index f878e4f2efe..e565ec096ae 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -121,17 +121,13 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * Locking: * mutex_lock(kernfs_mutex) */ -static bool kernfs_unlink_sibling(struct kernfs_node *kn) +static void kernfs_unlink_sibling(struct kernfs_node *kn) { - if (RB_EMPTY_NODE(&kn->rb)) - return false; - if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); - return true; } /** @@ -500,6 +496,7 @@ void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) acxt->removed = kn->u.removed_list; + kernfs_unmap_bin_file(kn); kernfs_put(kn); } } @@ -857,44 +854,23 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, /* unlink the subtree node-by-node */ do { - pos = kernfs_leftmost_descendant(kn); - - /* - * We're gonna release kernfs_mutex to unmap bin files, - * Make sure @pos doesn't go away inbetween. - */ - kernfs_get(pos); + struct kernfs_iattrs *ps_iattr; - /* - * This must be come before unlinking; otherwise, when - * there are multiple removers, some may finish before - * unmapping is complete. - */ - if (pos->flags & KERNFS_HAS_MMAP) { - mutex_unlock(&kernfs_mutex); - kernfs_unmap_file(pos); - mutex_lock(&kernfs_mutex); - } + pos = kernfs_leftmost_descendant(kn); - /* - * kernfs_unlink_sibling() succeeds once per node. Use it - * to decide who's responsible for cleanups. - */ - if (!pos->parent || kernfs_unlink_sibling(pos)) { - struct kernfs_iattrs *ps_iattr = - pos->parent ? 
pos->parent->iattr : NULL; + if (pos->parent) { + kernfs_unlink_sibling(pos); /* update timestamps on the parent */ + ps_iattr = pos->parent->iattr; if (ps_iattr) { ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; } - - pos->u.removed_list = acxt->removed; - acxt->removed = pos; } - kernfs_put(pos); + pos->u.removed_list = acxt->removed; + acxt->removed = pos; } while (pos != kn); } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 404ffd2f27b..231a171f48b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -700,11 +700,14 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) return 0; } -void kernfs_unmap_file(struct kernfs_node *kn) +void kernfs_unmap_bin_file(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; + if (!(kn->flags & KERNFS_HAS_MMAP)) + return; + spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index e9ec38c8607..57a93f4d645 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -113,7 +113,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, */ extern const struct file_operations kernfs_file_fops; -void kernfs_unmap_file(struct kernfs_node *kn); +void kernfs_unmap_bin_file(struct kernfs_node *kn); /* * symlink.c -- cgit v1.2.3-70-g09d2 From 78b19bae0813bd6f921ca58490196abd101297bd Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 13 Jan 2014 16:54:45 -0500 Subject: nfs4.1: properly handle ENOTSUP in SECINFO_NO_NAME Don't check for -NFS4ERR_NOTSUPP, it's already been mapped to -ENOTSUPP by nfs4_stat_to_errno. This allows the client to mount v4.1 servers that don't support SECINFO_NO_NAME by falling back to the "guess and check" method of nfs4_find_root_sec. Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org # 3.1+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f4908eb40a2..a9eaaaa436c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7919,7 +7919,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, switch (err) { case 0: case -NFS4ERR_WRONGSEC: - case -NFS4ERR_NOTSUPP: + case -ENOTSUPP: goto out; default: err = nfs4_handle_exception(server, err, &exception); @@ -7953,7 +7953,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, * Fall back on "guess and check" method if * the server doesn't support SECINFO_NO_NAME */ - if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { + if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { err = nfs4_find_root_sec(server, fhandle, info); goto out_freepage; } -- cgit v1.2.3-70-g09d2 From 9811cd57f4c6b5b60ec104de68a88303717e3106 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:28 -0800 Subject: nfs: fix size updates for aio writes nfs_file_direct_write only updates the inode size if it succeeded and returned the number of bytes written. But in the AIO case nfs_direct_wait turns the return value into -EIOCBQUEUED and we skip the size update. Instead the aio completion path should updated it, which this patch does. The implementation is a little hacky because there is no obvious way to find out we are called for a write in nfs_direct_complete. 
Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 049b3408fb9..ce52a9774ec 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -222,12 +222,23 @@ out: * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq) +static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { + struct inode *inode = dreq->inode; + if (dreq->iocb) { + loff_t pos = dreq->iocb->ki_pos + dreq->count; long res = (long) dreq->error; if (!res) res = (long) dreq->count; + + if (write) { + spin_lock(&inode->i_lock); + if (i_size_read(inode) < pos) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + aio_complete(dreq->iocb, res, 0); } complete_all(&dreq->completion); @@ -272,7 +283,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); hdr->release(hdr); } @@ -434,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); return 0; } @@ -594,7 +605,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) break; default: nfs_inode_dio_write_done(dreq->inode); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } } @@ -611,7 +622,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { nfs_inode_dio_write_done(inode); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } #endif -- cgit v1.2.3-70-g09d2 From 2a009ec98cce440c0992fc9a2353e96cdb0b048b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:29 -0800 Subject: nfs: defer inode_dio_done call until size update is done We need to have the I/O fully finished before telling the truncate code that we are done. 
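As a sketch of the required ordering only (the real code is in the diff below), the completion path must publish the new size and invalidate the page cache before dropping i_dio_count, because truncate waits for i_dio_count to drain and must observe the final state:

	spin_lock(&inode->i_lock);
	if (i_size_read(inode) < pos)
		i_size_write(inode, pos);
	spin_unlock(&inode->i_lock);

	nfs_zap_mapping(inode, inode->i_mapping);
	inode_dio_done(inode);	/* only now may a waiting truncate proceed */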
Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ce52a9774ec..75ed2a90b0f 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -226,21 +226,27 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { struct inode *inode = dreq->inode; - if (dreq->iocb) { + if (dreq->iocb && write) { loff_t pos = dreq->iocb->ki_pos + dreq->count; + + spin_lock(&inode->i_lock); + if (i_size_read(inode) < pos) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + if (write) { + nfs_zap_mapping(inode, inode->i_mapping); + inode_dio_done(inode); + } + + if (dreq->iocb) { long res = (long) dreq->error; if (!res) res = (long) dreq->count; - - if (write) { - spin_lock(&inode->i_lock); - if (i_size_read(inode) < pos) - i_size_write(inode, pos); - spin_unlock(&inode->i_lock); - } - aio_complete(dreq->iocb, res, 0); } + complete_all(&dreq->completion); nfs_direct_req_release(dreq); @@ -483,12 +489,6 @@ out: return result; } -static void nfs_inode_dio_write_done(struct inode *inode) -{ - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); -} - #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { @@ -604,7 +604,6 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_inode_dio_write_done(dreq->inode); nfs_direct_complete(dreq, true); } } @@ -621,7 +620,6 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { - nfs_inode_dio_write_done(inode); nfs_direct_complete(dreq, true); } #endif -- cgit v1.2.3-70-g09d2 From 1f90ee27461e31a1c18e5d819f6ea6f5c7304b16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:30 -0800 Subject: nfs: increment i_dio_count for reads, too i_dio_count is used to protect dio access against truncate. We want to make sure there are no dio reads pending either when doing a truncate. I suspect on plain NFS things might work even without this, but once we use a pnfs layout driver that access backing devices directly things will go bad without the proper synchronization. 
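Conceptually (an illustrative sketch, not the patch itself), the read path now brackets the outstanding I/O with the same counter that truncate drains, mirroring what the write path already does; any early-exit path has to drop the reference it took:

	atomic_inc(&inode->i_dio_count);	/* before issuing the first read */
	/* ... schedule the direct reads ... */
	if (requested_bytes == 0)
		inode_dio_done(inode);		/* nothing was queued: drop it here */
	/* otherwise the completion path calls inode_dio_done() */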
Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 75ed2a90b0f..6c232107e83 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -235,10 +235,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) spin_unlock(&inode->i_lock); } - if (write) { + if (write) nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); - } + + inode_dio_done(inode); if (dreq->iocb) { long res = (long) dreq->error; @@ -419,6 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; ssize_t result = -EINVAL; size_t requested_bytes = 0; unsigned long seg; @@ -427,6 +428,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; + atomic_inc(&inode->i_dio_count); for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; @@ -446,6 +448,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { + inode_dio_done(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } -- cgit v1.2.3-70-g09d2 From 14a3ec79437252d922bae574ecbf0c0584c330f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:31 -0800 Subject: nfs: merge nfs_direct_read into nfs_file_direct_read Simple code cleanup to prepare for later fixes. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 107 ++++++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6c232107e83..f8b0a81c340 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -458,14 +458,55 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, return 0; } -static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers into which to read data + * @nr_segs: size of iov vector + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. 
+ */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, bool uio) { - ssize_t result = -ENOMEM; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; + ssize_t result = -EINVAL; + size_t count; + + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", + file, count, (long long) pos); + + result = 0; + if (!count) + goto out; + + result = nfs_sync_mapping(mapping); + if (result) + goto out; + + task_io_account_read(count); + result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) goto out; @@ -484,8 +525,11 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, NFS_I(inode)->read_io += iov_length(iov, nr_segs); result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); - if (!result) + if (!result) { result = nfs_direct_wait(dreq); + if (result > 0) + iocb->ki_pos = pos + result; + } out_release: nfs_direct_req_release(dreq); out: @@ -888,59 +932,6 @@ out: return result; } -/** - * nfs_file_direct_read - file direct read operation for NFS files - * @iocb: target I/O control block - * @iov: vector of user buffers into which to read data - * @nr_segs: size of iov vector - * @pos: byte offset in file where reading starts - * - * We use this function for direct reads instead of calling - * generic_file_aio_read() in order to avoid gfar's check to see if - * the request starts before the end of the file. For that check - * to work, we must generate a GETATTR before each direct read, and - * even then there is a window between the GETATTR and the subsequent - * READ where the file size could change. Our preference is simply - * to do all reads the application wants, and the server will take - * care of managing the end of file boundary. - * - * This function also eliminates unnecessarily updating the file's - * atime locally, as the NFS server sets the file's atime, and this - * client must read the updated atime from the server back into its - * cache. - */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) -{ - ssize_t retval = -EINVAL; - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - size_t count; - - count = iov_length(iov, nr_segs); - nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - - dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", - file, count, (long long) pos); - - retval = 0; - if (!count) - goto out; - - retval = nfs_sync_mapping(mapping); - if (retval) - goto out; - - task_io_account_read(count); - - retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); - if (retval > 0) - iocb->ki_pos = pos + retval; - -out: - return retval; -} - /** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block -- cgit v1.2.3-70-g09d2 From 22cd1bf1480133518b3e5568466759ffde649b35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:32 -0800 Subject: nfs: merge nfs_direct_write into nfs_file_direct_write Simple code cleanup to prepare for later fixes. 
Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 91 ++++++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f8b0a81c340..cbfbd17eae8 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -898,40 +898,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, return 0; } -static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - size_t count, bool uio) -{ - ssize_t result = -ENOMEM; - struct inode *inode = iocb->ki_filp->f_mapping->host; - struct nfs_direct_req *dreq; - struct nfs_lock_context *l_ctx; - - dreq = nfs_direct_req_alloc(); - if (!dreq) - goto out; - - dreq->inode = inode; - dreq->bytes_left = count; - dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - l_ctx = nfs_get_lock_context(dreq->ctx); - if (IS_ERR(l_ctx)) { - result = PTR_ERR(l_ctx); - goto out_release; - } - dreq->l_ctx = l_ctx; - if (!is_sync_kiocb(iocb)) - dreq->iocb = iocb; - - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); - if (!result) - result = nfs_direct_wait(dreq); -out_release: - nfs_direct_req_release(dreq); -out: - return result; -} - /** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block @@ -957,9 +923,12 @@ out: ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, bool uio) { - ssize_t retval = -EINVAL; + ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; size_t count; count = iov_length(iov, nr_segs); @@ -968,35 +937,57 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, count, (long long) pos); - retval = generic_write_checks(file, &pos, &count, 0); - if (retval) + result = generic_write_checks(file, &pos, &count, 0); + if (result) goto out; - retval = -EINVAL; + result = -EINVAL; if ((ssize_t) count < 0) goto out; - retval = 0; + result = 0; if (!count) goto out; - retval = nfs_sync_mapping(mapping); - if (retval) + result = nfs_sync_mapping(mapping); + if (result) goto out; task_io_account_write(count); - retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); - if (retval > 0) { - struct inode *inode = mapping->host; + result = -ENOMEM; + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out; - iocb->ki_pos = pos + retval; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); + dreq->inode = inode; + dreq->bytes_left = count; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); + goto out_release; + } + dreq->l_ctx = l_ctx; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); + if (!result) { + result = nfs_direct_wait(dreq); + if (result > 0) { + struct inode *inode = mapping->host; + + iocb->ki_pos = pos + result; + spin_lock(&inode->i_lock); + if (i_size_read(inode) < iocb->ki_pos) + i_size_write(inode, iocb->ki_pos); + spin_unlock(&inode->i_lock); + } } +out_release: + 
nfs_direct_req_release(dreq); out: - return retval; + return result; } /** -- cgit v1.2.3-70-g09d2 From d0b9875d65c1abcc9d405d648660dfb919353959 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:33 -0800 Subject: nfs: take i_mutex during direct I/O reads We'll need the i_mutex to prevent i_dio_count from incrementing while truncate is waiting for it to reach zero, and to protect against having the pagecache repopulated after we flushed it. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index cbfbd17eae8..85e4e4be401 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -500,16 +500,17 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; + mutex_lock(&inode->i_mutex); result = nfs_sync_mapping(mapping); if (result) - goto out; + goto out_unlock; task_io_account_read(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out; + goto out_unlock; dreq->inode = inode; dreq->bytes_left = iov_length(iov, nr_segs); @@ -525,13 +526,22 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, NFS_I(inode)->read_io += iov_length(iov, nr_segs); result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); + + mutex_unlock(&inode->i_mutex); + if (!result) { result = nfs_direct_wait(dreq); if (result > 0) iocb->ki_pos = pos + result; } + + nfs_direct_req_release(dreq); + return result; + out_release: nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: return result; } -- cgit v1.2.3-70-g09d2 From a9ab5e840669b19aca2974e2c771a77df2876434 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:34 -0800 Subject: nfs: page cache invalidation for dio Make sure to properly invalidate the pagecache before performing direct I/O, so that no stale pages are left around. This matches what the generic direct I/O code does. Also take the i_mutex over the direct write submission to avoid the livelock vs truncate waiting for i_dio_count to decrease, and to avoid having the pagecache easily repopulated while direct I/O is in progress. Again matching the generic direct I/O code.
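Condensed from the diff that follows, the direct-write submission path ends up with roughly this shape (a sketch only, with request setup and error details trimmed; the function name is hypothetical):

static ssize_t nfs_dio_write_sketch(struct kiocb *iocb,
		struct address_space *mapping, loff_t pos, size_t count)
{
	struct inode *inode = mapping->host;
	loff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
	ssize_t result;

	mutex_lock(&inode->i_mutex);

	/* flush dirty pages, then drop the now-stale cached pages */
	result = nfs_sync_mapping(mapping);
	if (!result && mapping->nrpages)
		result = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_CACHE_SHIFT, end);
	if (result)
		goto out_unlock;

	/* ... allocate the nfs_direct_req and schedule the write here ... */

	/* the cache may have been repopulated while the write was queued */
	if (mapping->nrpages)
		invalidate_inode_pages2_range(mapping,
					pos >> PAGE_CACHE_SHIFT, end);

out_unlock:
	mutex_unlock(&inode->i_mutex);
	return result;
}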
Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 85e4e4be401..b8797ae6831 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -939,9 +939,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; + loff_t end; size_t count; count = iov_length(iov, nr_segs); + end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", @@ -958,16 +961,25 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; + mutex_lock(&inode->i_mutex); + result = nfs_sync_mapping(mapping); if (result) - goto out; + goto out_unlock; + + if (mapping->nrpages) { + result = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + if (result) + goto out_unlock; + } task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (!dreq) - goto out; + goto out_unlock; dreq->inode = inode; dreq->bytes_left = count; @@ -982,6 +994,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq->iocb = iocb; result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); + + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + + mutex_unlock(&inode->i_mutex); + if (!result) { result = nfs_direct_wait(dreq); if (result > 0) { @@ -994,8 +1014,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, spin_unlock(&inode->i_lock); } } + nfs_direct_req_release(dreq); + return result; + out_release: nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: return result; } -- cgit v1.2.3-70-g09d2 From 4f4b1b6471cf219d136776f9ff9631a07c4e92b5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:30:47 -0800 Subject: Revert "kernfs: restructure removal path to fix possible premature return" This reverts commit 45a140e587f3d32d8d424ed940dffb61e1739047. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 139 +++++++++++++++++++------------------------------ include/linux/kernfs.h | 1 - 2 files changed, 53 insertions(+), 87 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e565ec096ae..7f8afc1d08f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -181,38 +181,14 @@ void kernfs_put_active(struct kernfs_node *kn) * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * - * Drain existing usages of @kn. Mutiple removers may invoke this function - * concurrently on @kn and all will return after draining is complete. - * Returns %true if drain is performed and kernfs_mutex was temporarily - * released. %false if @kn was already drained and no operation was - * necessary. 
- * - * The caller is responsible for ensuring @kn stays pinned while this - * function is in progress even if it gets removed by someone else. + * Drain existing usages. */ -static bool kernfs_drain(struct kernfs_node *kn) - __releases(&kernfs_mutex) __acquires(&kernfs_mutex) +static void kernfs_drain(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); - lockdep_assert_held(&kernfs_mutex); WARN_ON_ONCE(atomic_read(&kn->active) >= 0); - /* - * We want to go through the active ref lockdep annotation at least - * once for all node removals, but the lockdep annotation can't be - * nested inside kernfs_mutex and deactivation can't make forward - * progress if we keep dropping the mutex. Use JUST_ACTIVATED to - * force the slow path once for each deactivation if lockdep is - * enabled. - */ - if ((!kernfs_lockdep(kn) || !(kn->flags & KERNFS_JUST_DEACTIVATED)) && - atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) - return false; - - kn->flags &= ~KERNFS_JUST_DEACTIVATED; - mutex_unlock(&kernfs_mutex); - if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) @@ -226,9 +202,6 @@ static bool kernfs_drain(struct kernfs_node *kn) lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, 1, _RET_IP_); } - - mutex_lock(&kernfs_mutex); - return true; } /** @@ -473,6 +446,49 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, return 0; } +/** + * kernfs_remove_one - remove kernfs_node from parent + * @acxt: addrm context to use + * @kn: kernfs_node to be removed + * + * Mark @kn removed and drop nlink of parent inode if @kn is a + * directory. @kn is unlinked from the children list. + * + * This function should be called between calls to + * kernfs_addrm_start() and kernfs_addrm_finish() and should be + * passed the same @acxt as passed to kernfs_addrm_start(). + * + * LOCKING: + * Determined by kernfs_addrm_start(). + */ +static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) +{ + struct kernfs_iattrs *ps_iattr; + + /* + * Removal can be called multiple times on the same node. Only the + * first invocation is effective and puts the base ref. + */ + if (atomic_read(&kn->active) < 0) + return; + + if (kn->parent) { + kernfs_unlink_sibling(kn); + + /* Update timestamps on the parent */ + ps_iattr = kn->parent->iattr; + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + } + + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + kn->u.removed_list = acxt->removed; + acxt->removed = kn; +} + /** * kernfs_addrm_finish - finish up kernfs_node add/remove * @acxt: addrm context to finish up @@ -496,6 +512,7 @@ void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) acxt->removed = kn->u.removed_list; + kernfs_drain(kn); kernfs_unmap_bin_file(kn); kernfs_put(kn); } @@ -805,73 +822,23 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return pos->parent; } -static void __kernfs_deactivate(struct kernfs_node *kn) -{ - struct kernfs_node *pos; - - lockdep_assert_held(&kernfs_mutex); - - /* prevent any new usage under @kn by deactivating all nodes */ - pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (atomic_read(&pos->active) >= 0) { - atomic_add(KN_DEACTIVATED_BIAS, &pos->active); - pos->flags |= KERNFS_JUST_DEACTIVATED; - } - } - - /* - * Drain the subtree. 
If kernfs_drain() blocked to drain, which is - * indicated by %true return, it temporarily released kernfs_mutex - * and the rbtree might have been modified inbetween breaking our - * future walk. Restart the walk after each %true return. - */ - pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) { - bool drained; - - kernfs_get(pos); - drained = kernfs_drain(pos); - kernfs_put(pos); - if (drained) - pos = NULL; - } -} - static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) { - struct kernfs_node *pos; - - lockdep_assert_held(&kernfs_mutex); + struct kernfs_node *pos, *next; if (!kn) return; pr_debug("kernfs %s: removing\n", kn->name); - __kernfs_deactivate(kn); - - /* unlink the subtree node-by-node */ + next = NULL; do { - struct kernfs_iattrs *ps_iattr; - - pos = kernfs_leftmost_descendant(kn); - - if (pos->parent) { - kernfs_unlink_sibling(pos); - - /* update timestamps on the parent */ - ps_iattr = pos->parent->iattr; - if (ps_iattr) { - ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; - ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; - } - } - - pos->u.removed_list = acxt->removed; - acxt->removed = pos; - } while (pos != kn); + pos = next; + next = kernfs_next_descendant_post(pos, kn); + if (pos) + kernfs_remove_one(acxt, pos); + } while (next); } /** diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 61bd7ae6b8e..289d4f639ad 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -37,7 +37,6 @@ enum kernfs_node_type { #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { - KERNFS_JUST_DEACTIVATED = 0x0010, /* used to aid lockdep annotation */ KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, -- cgit v1.2.3-70-g09d2 From 798c75a0d44cdbd6e3d82a6a676e6de38525b3bb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:36:03 -0800 Subject: Revert "kernfs: remove KERNFS_REMOVED" This reverts commit ae34372eb8408b3d07e870f1939f99007a730d28. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 79 ++++++++++++++++++++------------------------- fs/kernfs/file.c | 10 ++---- fs/kernfs/kernfs-internal.h | 3 +- fs/kernfs/symlink.c | 10 ++---- include/linux/kernfs.h | 1 + 5 files changed, 43 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7f8afc1d08f..1c9130a3304 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -127,7 +127,6 @@ static void kernfs_unlink_sibling(struct kernfs_node *kn) kn->parent->dir.subdirs--; rb_erase(&kn->rb, &kn->parent->dir.children); - RB_CLEAR_NODE(&kn->rb); } /** @@ -178,16 +177,18 @@ void kernfs_put_active(struct kernfs_node *kn) } /** - * kernfs_drain - drain kernfs_node - * @kn: kernfs_node to drain + * kernfs_deactivate - deactivate kernfs_node + * @kn: kernfs_node to deactivate * - * Drain existing usages. + * Deny new active references and drain existing ones. 
*/ -static void kernfs_drain(struct kernfs_node *kn) +static void kernfs_deactivate(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); - WARN_ON_ONCE(atomic_read(&kn->active) >= 0); + BUG_ON(!(kn->flags & KERNFS_REMOVED)); + + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -232,15 +233,13 @@ void kernfs_put(struct kernfs_node *kn) return; root = kernfs_root(kn); repeat: - /* - * Moving/renaming is always done while holding reference. + /* Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kn->parent; - WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, - "kernfs_put: %s/%s: released with incorrect active_ref %d\n", - parent ? parent->name : "", kn->name, atomic_read(&kn->active)); + WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", + parent ? parent->name : "", kn->name); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); @@ -282,8 +281,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kn = dentry->d_fsdata; mutex_lock(&kernfs_mutex); - /* Force fresh lookup if removed */ - if (kn->parent && RB_EMPTY_NODE(&kn->rb)) + /* The kernfs node has been deleted */ + if (kn->flags & KERNFS_REMOVED) goto out_bad; /* The kernfs node has been moved? */ @@ -351,12 +350,11 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, kn->ino = ret; atomic_set(&kn->count, 1); - atomic_set(&kn->active, KN_DEACTIVATED_BIAS); - RB_CLEAR_NODE(&kn->rb); + atomic_set(&kn->active, 0); kn->name = name; kn->mode = mode; - kn->flags = flags; + kn->flags = flags | KERNFS_REMOVED; return kn; @@ -415,8 +413,6 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, struct kernfs_iattrs *ps_iattr; int ret; - WARN_ON_ONCE(atomic_read(&parent->active) < 0); - if (has_ns != (bool)kn->ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, kn->name); @@ -426,6 +422,9 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, if (kernfs_type(parent) != KERNFS_DIR) return -EINVAL; + if (parent->flags & KERNFS_REMOVED) + return -ENOENT; + kn->hash = kernfs_name_hash(kn->name, kn->ns); kn->parent = parent; kernfs_get(parent); @@ -442,7 +441,8 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, } /* Mark the entry added into directory tree */ - atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); + kn->flags &= ~KERNFS_REMOVED; + return 0; } @@ -470,7 +470,7 @@ static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, * Removal can be called multiple times on the same node. Only the * first invocation is effective and puts the base ref. 
*/ - if (atomic_read(&kn->active) < 0) + if (kn->flags & KERNFS_REMOVED) return; if (kn->parent) { @@ -484,7 +484,7 @@ static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, } } - atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + kn->flags |= KERNFS_REMOVED; kn->u.removed_list = acxt->removed; acxt->removed = kn; } @@ -512,7 +512,7 @@ void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) acxt->removed = kn->u.removed_list; - kernfs_drain(kn); + kernfs_deactivate(kn); kernfs_unmap_bin_file(kn); kernfs_put(kn); } @@ -610,7 +610,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) return ERR_PTR(-ENOMEM); } - atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); + kn->flags &= ~KERNFS_REMOVED; kn->priv = priv; kn->dir.root = root; @@ -662,13 +662,9 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, kn->priv = priv; /* link in */ - rc = -ENOENT; - if (kernfs_get_active(parent)) { - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); - kernfs_put_active(parent); - } + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); if (!rc) return kn; @@ -903,29 +899,27 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, { int error; + mutex_lock(&kernfs_mutex); + error = -ENOENT; - if (!kernfs_get_active(new_parent)) + if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) goto out; - if (!kernfs_get_active(kn)) - goto out_put_new_parent; - - mutex_lock(&kernfs_mutex); error = 0; if ((kn->parent == new_parent) && (kn->ns == new_ns) && (strcmp(kn->name, new_name) == 0)) - goto out_unlock; /* nothing to rename */ + goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) - goto out_unlock; + goto out; /* rename kernfs_node */ if (strcmp(kn->name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) - goto out_unlock; + goto out; if (kn->flags & KERNFS_STATIC_NAME) kn->flags &= ~KERNFS_STATIC_NAME; @@ -947,12 +941,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kernfs_link_sibling(kn); error = 0; -out_unlock: + out: mutex_unlock(&kernfs_mutex); - kernfs_put_active(kn); -out_put_new_parent: - kernfs_put_active(new_parent); -out: return error; } @@ -972,7 +962,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { - int valid = pos->parent == parent && hash == pos->hash; + int valid = !(pos->flags & KERNFS_REMOVED) && + pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 231a171f48b..bdd38854ef6 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -856,13 +856,9 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; - rc = -ENOENT; - if (kernfs_get_active(parent)) { - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); - kernfs_put_active(parent); - } + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); if (rc) { kernfs_put(kn); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 57a93f4d645..c6ba5bc37a9 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,8 +26,7 @@ struct kernfs_iattrs { struct simple_xattrs xattrs; }; -/* +1 to avoid 
triggering overflow warning when negating it */ -#define KN_DEACTIVATED_BIAS (INT_MIN + 1) +#define KN_DEACTIVATED_BIAS INT_MIN /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index b2c106ca343..a03e26036ef 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -40,13 +40,9 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ - error = -ENOENT; - if (kernfs_get_active(parent)) { - kernfs_addrm_start(&acxt); - error = kernfs_add_one(&acxt, kn, parent); - kernfs_addrm_finish(&acxt); - kernfs_put_active(parent); - } + kernfs_addrm_start(&acxt); + error = kernfs_add_one(&acxt, kn, parent); + kernfs_addrm_finish(&acxt); if (!error) return kn; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 289d4f639ad..42ad32ff22f 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -37,6 +37,7 @@ enum kernfs_node_type { #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { + KERNFS_REMOVED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, -- cgit v1.2.3-70-g09d2 From 0890147fe09ff7e8275a162b1ab76ab5e3158c6d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:39:52 -0800 Subject: Revert "kernfs: remove KERNFS_ACTIVE_REF and add kernfs_lockdep()" This reverts commit a69d001cfc712b96ec9d7ba44d6285702a38dabf. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. 
Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 31 +++++++++++-------------------- include/linux/kernfs.h | 1 + 2 files changed, 12 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1c9130a3304..ed62de6cdf8 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -22,15 +22,6 @@ DEFINE_MUTEX(kernfs_mutex); #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) -static bool kernfs_lockdep(struct kernfs_node *kn) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - return kn->flags & KERNFS_LOCKDEP; -#else - return false; -#endif -} - /** * kernfs_name_hash * @name: Null terminated string to hash @@ -147,7 +138,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) if (!atomic_inc_unless_negative(&kn->active)) return NULL; - if (kernfs_lockdep(kn)) + if (kn->flags & KERNFS_LOCKDEP) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } @@ -167,7 +158,7 @@ void kernfs_put_active(struct kernfs_node *kn) if (unlikely(!kn)) return; - if (kernfs_lockdep(kn)) + if (kn->flags & KERNFS_LOCKDEP) rwsem_release(&kn->dep_map, 1, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) @@ -188,21 +179,21 @@ static void kernfs_deactivate(struct kernfs_node *kn) BUG_ON(!(kn->flags & KERNFS_REMOVED)); + if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) + return; + + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); - if (kernfs_lockdep(kn)) { - rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); - if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) - lock_contended(&kn->dep_map, _RET_IP_); - } + if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) + lock_contended(&kn->dep_map, _RET_IP_); wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); - if (kernfs_lockdep(kn)) { - lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); - } + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); } /** diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 42ad32ff22f..232f1a63238 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -34,6 +34,7 @@ enum kernfs_node_type { }; #define KERNFS_TYPE_MASK 0x000f +#define KERNFS_ACTIVE_REF KERNFS_FILE #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { -- cgit v1.2.3-70-g09d2 From 87da149343c8c93f6984c0f4b9da7651709624f7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:43:11 -0800 Subject: Revert "kernfs: replace kernfs_node->u.completion with kernfs_root->deactivate_waitq" This reverts commit ea1c472dfeada211a0100daa7976e8e8e779b858. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 27 ++++++++++++++++----------- include/linux/kernfs.h | 4 ++-- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index ed62de6cdf8..510b5062ef3 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -8,7 +8,6 @@ * This file is released under the GPLv2. 
*/ -#include #include #include #include @@ -152,7 +151,6 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) */ void kernfs_put_active(struct kernfs_node *kn) { - struct kernfs_root *root = kernfs_root(kn); int v; if (unlikely(!kn)) @@ -164,7 +162,11 @@ void kernfs_put_active(struct kernfs_node *kn) if (likely(v != KN_DEACTIVATED_BIAS)) return; - wake_up_all(&root->deactivate_waitq); + /* + * atomic_dec_return() is a mb(), we'll always see the updated + * kn->u.completion. + */ + complete(kn->u.completion); } /** @@ -175,22 +177,26 @@ void kernfs_put_active(struct kernfs_node *kn) */ static void kernfs_deactivate(struct kernfs_node *kn) { - struct kernfs_root *root = kernfs_root(kn); + DECLARE_COMPLETION_ONSTACK(wait); + int v; BUG_ON(!(kn->flags & KERNFS_REMOVED)); if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) return; - rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + kn->u.completion = (void *)&wait; - atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + /* atomic_add_return() is a mb(), put_active() will always see + * the updated kn->u.completion. + */ + v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); - if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) + if (v != KN_DEACTIVATED_BIAS) { lock_contended(&kn->dep_map, _RET_IP_); - - wait_event(root->deactivate_waitq, - atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); + wait_for_completion(&wait); + } lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, 1, _RET_IP_); @@ -607,7 +613,6 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) root->dir_ops = kdops; root->kn = kn; - init_waitqueue_head(&root->deactivate_waitq); return root; } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 232f1a63238..d2c439db4ef 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include struct file; struct iattr; @@ -91,6 +91,7 @@ struct kernfs_node { struct rb_node rb; union { + struct completion *completion; struct kernfs_node *removed_list; } u; @@ -131,7 +132,6 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct ida ino_ida; struct kernfs_dir_ops *dir_ops; - wait_queue_head_t deactivate_waitq; }; struct kernfs_open_file { -- cgit v1.2.3-70-g09d2 From 683bb2761fbf123b24aed03a1c0d5d7556ec3018 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 14:49:01 -0800 Subject: Revert "kernfs: fix get_active failure handling in kernfs_seq_*()" This reverts commit d92d2e6bd72b653f9811e0c9c46307c743b3fc58. Tejun writes: I'm sorry but can you please revert the whole series? get_active() waiting while a node is deactivated has potential to lead to deadlock and that deactivate/reactivate interface is something fundamentally flawed and that cgroup will have to work with the remove_self() like everybody else. IOW, I think the first posting was correct. 
Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 51 +++++++-------------------------------------------- 1 file changed, 7 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index bdd38854ef6..316604cc3a1 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -54,38 +54,6 @@ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) return kn->attr.ops; } -/* - * As kernfs_seq_stop() is also called after kernfs_seq_start() or - * kernfs_seq_next() failure, it needs to distinguish whether it's stopping - * a seq_file iteration which is fully initialized with an active reference - * or an aborted kernfs_seq_start() due to get_active failure. The - * position pointer is the only context for each seq_file iteration and - * thus the stop condition should be encoded in it. As the return value is - * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable - * choice to indicate get_active failure. - * - * Unfortunately, this is complicated due to the optional custom seq_file - * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() - * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or - * custom seq_file operations and thus can't decide whether put_active - * should be performed or not only on ERR_PTR(-ENODEV). - * - * This is worked around by factoring out the custom seq_stop() and - * put_active part into kernfs_seq_stop_active(), skipping it from - * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after - * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures - * that kernfs_seq_stop_active() is skipped only after get_active failure. - */ -static void kernfs_seq_stop_active(struct seq_file *sf, void *v) -{ - struct kernfs_open_file *of = sf->private; - const struct kernfs_ops *ops = kernfs_ops(of->kn); - - if (ops->seq_stop) - ops->seq_stop(sf, v); - kernfs_put_active(of->kn); -} - static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; @@ -101,11 +69,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) ops = kernfs_ops(of->kn); if (ops->seq_start) { - void *next = ops->seq_start(sf, ppos); - /* see the comment above kernfs_seq_stop_active() */ - if (next == ERR_PTR(-ENODEV)) - kernfs_seq_stop_active(sf, next); - return next; + return ops->seq_start(sf, ppos); } else { /* * The same behavior and code as single_open(). 
Returns @@ -121,11 +85,7 @@ static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { - void *next = ops->seq_next(sf, v, ppos); - /* see the comment above kernfs_seq_stop_active() */ - if (next == ERR_PTR(-ENODEV)) - kernfs_seq_stop_active(sf, next); - return next; + return ops->seq_next(sf, v, ppos); } else { /* * The same behavior and code as single_open(), always @@ -139,9 +99,12 @@ static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); - if (v != ERR_PTR(-ENODEV)) - kernfs_seq_stop_active(sf, v); + if (ops->seq_stop) + ops->seq_stop(sf, v); + + kernfs_put_active(of->kn); mutex_unlock(&of->mutex); } -- cgit v1.2.3-70-g09d2 From 5514f0aadddcdfaaaea697b60203f5402552eb7b Mon Sep 17 00:00:00 2001 From: Yuan Zhong Date: Fri, 10 Jan 2014 07:26:14 +0000 Subject: f2fs: remove the needless parameter of f2fs_wait_on_page_writeback "bool sync" parameter is never referenced in f2fs_wait_on_page_writeback. We should remove this parameter. Signed-off-by: Yuan Zhong Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 4 ++-- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 2 +- fs/f2fs/segment.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 63d190264a3..19ad0669447 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -226,7 +226,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); rn = F2FS_NODE(node_page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5f7dc4f6eb0..8466b5ea76f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1132,7 +1132,7 @@ void rewrite_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); +void f2fs_wait_on_page_writeback(struct page *, enum page_type); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9117ccaf254..ea0371e854b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -428,7 +428,7 @@ next_step: /* set page dirty and write it */ if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE, true); + f2fs_wait_on_page_writeback(node_page, NODE); set_page_dirty(node_page); } else { if (!PageWriteback(node_page)) @@ -533,7 +533,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) } else { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA); if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e0d800a1d79..31ee5b164ff 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -106,7 +106,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) set_page_writeback(page); write_data_page(page, &dn,
&new_blk_addr, &fio); update_extent_cache(new_blk_addr, &dn); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA); /* clear inline data and flag after data writeback */ zero_user_segment(ipage, INLINE_DATA_OFFSET, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 915f9a8f3ee..ffa4c6d0fab 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -175,7 +175,7 @@ void update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); ri = F2FS_INODE(node_page); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5f84639354e..a934e6f2738 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1053,7 +1053,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, } void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool sync) + enum page_type type) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); if (PageWriteback(page)) { -- cgit v1.2.3-70-g09d2 From 4531929e3922f2cdd34208d7dc1404ac06e6ced5 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 10 Jan 2014 18:09:02 +0800 Subject: f2fs: move grabbing orphan pages out of protection region Move grabbing orphan block page out of protection region, and grab all the orphan block pages ahead. Signed-off-by: Gu Zheng Reviewed-by: Chao Yu [Jaegeuk Kim: remove unnecessary code pointed by Chao Yu] Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0d78bbe79f9..fdc5a12c0ed 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -303,22 +303,25 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; - struct page *page = NULL; unsigned int nentries = 0; - unsigned short index = 1; - unsigned short orphan_blocks; + unsigned short index; + unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); + struct page *page = NULL; + struct page *pages[orphan_blocks]; struct orphan_inode_entry *orphan = NULL; - orphan_blocks = (unsigned short)((sbi->n_orphans + - (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); + for (index = 0; index < orphan_blocks; index++) + pages[index] = grab_meta_page(sbi, start_blk + index); + index = 1; mutex_lock(&sbi->orphan_inode_mutex); head = &sbi->orphan_inode_list; /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { - page = grab_meta_page(sbi, start_blk); + page = pages[index - 1]; orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); @@ -338,7 +341,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) set_page_dirty(page); f2fs_put_page(page, 1); index++; - start_blk++; nentries = 0; page = NULL; } -- cgit v1.2.3-70-g09d2 From c1ef37257229dc8903615eaf1d1abaa5da3f0686 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 10 Jan 2014 18:09:08 +0800 Subject: f2fs: move alloc new orphan node out of lock protection region Move alloc new orphan node out of lock protection region.
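The change follows the usual allocate-before-locking pattern: perform the (possibly sleeping) allocation with no lock held, then, if the locked duplicate scan loses the race, free the unused entry. A generic sketch with hypothetical names (the f2fs version in the diff below also keeps the list sorted by inode number):

struct orphan_sketch_entry {
	struct list_head list;
	u32 ino;
};

static void add_orphan_sketch(struct list_head *head, spinlock_t *lock,
			      struct kmem_cache *cache, u32 ino)
{
	struct orphan_sketch_entry *new, *e;

	/* no lock held yet, so the allocation is free to sleep or retry */
	new = kmem_cache_alloc(cache, GFP_NOFS);
	if (!new)
		return;
	new->ino = ino;

	spin_lock(lock);
	list_for_each_entry(e, head, list) {
		if (e->ino == ino) {
			/* someone beat us to it: drop the spare entry */
			spin_unlock(lock);
			kmem_cache_free(cache, new);
			return;
		}
	}
	list_add_tail(&new->list, head);
	spin_unlock(lock);
}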
Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fdc5a12c0ed..681782c715f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -219,26 +219,29 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct list_head *head, *this; struct orphan_inode_entry *new = NULL, *orphan = NULL; + new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); + new->ino = ino; + mutex_lock(&sbi->orphan_inode_mutex); head = &sbi->orphan_inode_list; list_for_each(this, head) { orphan = list_entry(this, struct orphan_inode_entry, list); - if (orphan->ino == ino) - goto out; + if (orphan->ino == ino) { + mutex_unlock(&sbi->orphan_inode_mutex); + kmem_cache_free(orphan_entry_slab, new); + return; + } + if (orphan->ino > ino) break; orphan = NULL; } - new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - new->ino = ino; - /* add new_oentry into list which is sorted by inode number */ if (orphan) list_add(&new->list, this->prev); else list_add_tail(&new->list, head); -out: mutex_unlock(&sbi->orphan_inode_mutex); } -- cgit v1.2.3-70-g09d2 From 17b692f60e93f5d417fcdbfd681b0f20f9f31ec8 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 10 Jan 2014 18:09:14 +0800 Subject: f2fs: use spinlock rather than mutex for better speed With the 2 previous changes, all the long time operations are moved out of the protection region, so here we can use spinlock rather than mutex (orphan_inode_mutex) for lower overhead. Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 24 ++++++++++++------------ fs/f2fs/f2fs.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 681782c715f..4de0345f73f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -196,22 +196,22 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) { int err = 0; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -222,12 +222,12 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); new->ino = ino; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; list_for_each(this, head) { orphan = list_entry(this, struct orphan_inode_entry, list); if (orphan->ino == ino) { - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); kmem_cache_free(orphan_entry_slab, new); return; } @@ -242,7 +242,7 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) list_add(&new->list, this->prev); else list_add_tail(&new->list, head); - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -250,7 +250,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct list_head *head; struct 
orphan_inode_entry *orphan; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { @@ -261,7 +261,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) break; } } - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -318,7 +318,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) pages[index] = grab_meta_page(sbi, start_blk + index); index = 1; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; /* loop for each orphan inode entry and write them in Jornal block */ @@ -357,7 +357,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) f2fs_put_page(page, 1); } - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -828,7 +828,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) void init_orphan_info(struct f2fs_sb_info *sbi) { - mutex_init(&sbi->orphan_inode_mutex); + spin_lock_init(&sbi->orphan_inode_lock); INIT_LIST_HEAD(&sbi->orphan_inode_list); sbi->n_orphans = 0; /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8466b5ea76f..ee304fb71e1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -412,7 +412,7 @@ struct f2fs_sb_info { /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ - struct mutex orphan_inode_mutex; /* for orphan inode list */ + spinlock_t orphan_inode_lock; /* for orphan inode list */ unsigned int n_orphans; /* # of orphan inodes */ unsigned int max_orphans; /* max orphan inodes */ -- cgit v1.2.3-70-g09d2 From 499046ab2c256c4b239cf389ff48e0c7478f3061 Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Mon, 13 Jan 2014 10:34:18 +0900 Subject: f2fs: add delimiter to separate name and value in debug phrase Support for f2fs-tools/tools/f2stat to monitor /sys/kernel/debug/f2fs/status Signed-off-by: Changman Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 711a0d4d1cb..6e4ac9a25df 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -246,13 +246,13 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); seq_printf(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - nodes %4d in %4d\n", + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents %4d in dirs:%4d\n", + seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta %4d in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs %5d > %lu\n", + seq_printf(s, " - NATs: %5d > %lu\n", si->nats, NM_WOUT_THRESHOLD); seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", si->sits, si->fnids); -- cgit v1.2.3-70-g09d2 From bb305947bdbb67325e1f949183cdd208fc2a7999 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Jan 2014 09:52:01 -0500 Subject: kernfs: fix get_active failure handling in kernfs_seq_*() When kernfs_seq_start() fails to obtain an active reference, it returns ERR_PTR(-ENODEV).
kernfs_seq_stop() is then invoked with the error pointer value; however, it still proceeds to invoke kernfs_put_active() on the node leading to unbalanced put. If kernfs_seq_stop() is called even after active ref failure, it should skip invocation of @ops->seq_stop() and put_active. Unfortunately, this is a bit complicated because active ref failure isn't the only thing which may fail with ERR_PTR(-ENODEV). @ops->seq_start/next() may also fail with the error value and kernfs_seq_stop() doesn't have a way to tell apart those failures. Work it around by factoring out the active part of kernfs_seq_stop() into kernfs_seq_stop_active() and invoking it directly if @ops->seq_start/next() fail with ERR_PTR(-ENODEV) and updating kernfs_seq_stop() to skip kernfs_seq_stop_active() on ERR_PTR(-ENODEV). This is a bit nasty but ensures that the active put is skipped iff get_active failed in kernfs_seq_start(). tj: This was originally committed as d92d2e6bd72b but got reverted by 683bb2761fbf along with other kernfs self removal patches. However, this one is an independent fix and shouldn't have been reverted together. Reinstate the change. Sorry about the mess. Signed-off-by: Tejun Heo Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 316604cc3a1..bdd38854ef6 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -54,6 +54,38 @@ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) return kn->attr.ops; } +/* + * As kernfs_seq_stop() is also called after kernfs_seq_start() or + * kernfs_seq_next() failure, it needs to distinguish whether it's stopping + * a seq_file iteration which is fully initialized with an active reference + * or an aborted kernfs_seq_start() due to get_active failure. The + * position pointer is the only context for each seq_file iteration and + * thus the stop condition should be encoded in it. As the return value is + * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable + * choice to indicate get_active failure. + * + * Unfortunately, this is complicated due to the optional custom seq_file + * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() + * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or + * custom seq_file operations and thus can't decide whether put_active + * should be performed or not only on ERR_PTR(-ENODEV). + * + * This is worked around by factoring out the custom seq_stop() and + * put_active part into kernfs_seq_stop_active(), skipping it from + * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after + * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures + * that kernfs_seq_stop_active() is skipped only after get_active failure. 
+ */ +static void kernfs_seq_stop_active(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + kernfs_put_active(of->kn); +} + static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; @@ -69,7 +101,11 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) ops = kernfs_ops(of->kn); if (ops->seq_start) { - return ops->seq_start(sf, ppos); + void *next = ops->seq_start(sf, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; } else { /* * The same behavior and code as single_open(). Returns @@ -85,7 +121,11 @@ static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { - return ops->seq_next(sf, v, ppos); + void *next = ops->seq_next(sf, v, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; } else { /* * The same behavior and code as single_open(), always @@ -99,12 +139,9 @@ static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; - const struct kernfs_ops *ops = kernfs_ops(of->kn); - if (ops->seq_stop) - ops->seq_stop(sf, v); - - kernfs_put_active(of->kn); + if (v != ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } -- cgit v1.2.3-70-g09d2 From 086352f1aa8d717eaf565074d6634c7bdd26aca0 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 14 Jan 2014 13:46:51 +0000 Subject: GFS2: No need to invalidate pages for a dio read We recently fixed the writeback of pages prior to performing direct i/o, however the initial fix was perhaps a bit heavy handed. There is no need to invalidate pages if the direct i/o is only a read, since they will be identical to what has been flushed to disk anyway. Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index cf858dad75d..49436fa7cd4 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1032,8 +1032,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); rv = filemap_write_and_wait_range(mapping, lstart, end); if (rv) - return rv; - truncate_inode_pages_range(mapping, lstart, end); + goto out; + if (rw == WRITE) + truncate_inode_pages_range(mapping, lstart, end); } rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, -- cgit v1.2.3-70-g09d2 From c754fbbb1b6bf462c6ddba48b19f20adf2335cac Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 12 Dec 2013 10:47:59 +0000 Subject: GFS2: Use RCU/hlist_bl based hash for quotas Prior to this patch, GFS2 kept all the quotas for each super block in a single linked list. This is rather slow when there are large numbers of quotas. This patch introduces a hlist_bl based hash table, similar to the one used for glocks. The initial look up of the quota is now lockless in the case where it is already cached, although we still have to take the per quota spinlock in order to bump the ref count. Either way though, this is a big improvement on what was there before. 
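The lockless fast path described above reduces to an RCU-protected walk of a single hash bucket, taking only the per-quota lockref to pin a match; roughly, as a condensed sketch of the lookup in the diff below:

static struct gfs2_quota_data *qd_lookup_sketch(const struct gfs2_sbd *sdp,
						struct kqid qid)
{
	unsigned int hash = gfs2_qd_hash(sdp, qid);
	struct gfs2_quota_data *qd;
	struct hlist_bl_node *h;

	rcu_read_lock();
	hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
		if (qid_eq(qd->qd_id, qid) && qd->qd_sbd == sdp &&
		    lockref_get_not_dead(&qd->qd_lockref)) {
			/* reference held; the entry cannot be freed under us */
			rcu_read_unlock();
			return qd;
		}
	}
	rcu_read_unlock();

	/* miss: caller falls back to allocating and inserting under the locks */
	return NULL;
}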
The qd_lock and the per super block list is preserved, for the time being. However it is intended that since this is no longer used for its original role, it should be possible to shrink the number of items on that list in due course and remove the requirement to take qd_lock in qd_get. Signed-off-by: Steven Whitehouse Cc: Abhijith Das Cc: Paul E. McKenney --- fs/gfs2/incore.h | 4 ++ fs/gfs2/main.c | 1 + fs/gfs2/quota.c | 168 +++++++++++++++++++++++++++++++++++++++---------------- fs/gfs2/quota.h | 1 + 4 files changed, 126 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a99f60c9884..59d99ec9d87 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -428,10 +428,13 @@ enum { }; struct gfs2_quota_data { + struct hlist_bl_node qd_hlist; struct list_head qd_list; struct kqid qd_id; + struct gfs2_sbd *qd_sbd; struct lockref qd_lockref; struct list_head qd_lru; + unsigned qd_hash; unsigned long qd_flags; /* QDF_... */ @@ -450,6 +453,7 @@ struct gfs2_quota_data { u64 qd_sync_gen; unsigned long qd_last_warn; + struct rcu_head qd_rcu; }; struct gfs2_trans { diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0650db2541e..c272e73063d 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void) gfs2_str2qstr(&gfs2_qdot, "."); gfs2_str2qstr(&gfs2_qdotdot, ".."); + gfs2_quota_hash_init(); error = gfs2_sys_init(); if (error) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 1b6b3675ee1..a1df01d381a 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -52,6 +52,10 @@ #include #include #include +#include +#include +#include +#include #include "gfs2.h" #include "incore.h" @@ -67,10 +71,43 @@ #include "inode.h" #include "util.h" -/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ +#define GFS2_QD_HASH_SHIFT 12 +#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) + +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ static DEFINE_SPINLOCK(qd_lock); struct list_lru gfs2_qd_lru; +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; + +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, + const struct kqid qid) +{ + unsigned int h; + + h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); + h = jhash(&qid, sizeof(struct kqid), h); + + return h & GFS2_QD_HASH_MASK; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&qd_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&qd_hash_table[hash]); +} + +static void gfs2_qd_dealloc(struct rcu_head *rcu) +{ + struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + kmem_cache_free(gfs2_quotad_cachep, qd); +} + static void gfs2_qd_dispose(struct list_head *list) { struct gfs2_quota_data *qd; @@ -87,6 +124,10 @@ static void gfs2_qd_dispose(struct list_head *list) list_del(&qd->qd_list); spin_unlock(&qd_lock); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); @@ -95,7 +136,7 @@ static void gfs2_qd_dispose(struct list_head *list) atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); } } @@ -165,83 +206,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd) return offset; } -static int 
qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; int error; qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) - return -ENOMEM; + return NULL; + qd->qd_sbd = sdp; qd->qd_lockref.count = 1; spin_lock_init(&qd->qd_lockref.lock); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); + qd->qd_hash = hash; error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; - *qdp = qd; - - return 0; + return qd; fail: kmem_cache_free(gfs2_quotad_cachep, qd); - return error; + return NULL; } -static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + struct kqid qid) { - struct gfs2_quota_data *qd = NULL, *new_qd = NULL; - int error, found; - - *qdp = NULL; + struct gfs2_quota_data *qd; + struct hlist_bl_node *h; - for (;;) { - found = 0; - spin_lock(&qd_lock); - list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qid_eq(qd->qd_id, qid) && - lockref_get_not_dead(&qd->qd_lockref)) { - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); - found = 1; - break; - } + hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { + if (!qid_eq(qd->qd_id, qid)) + continue; + if (qd->qd_sbd != sdp) + continue; + if (lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + return qd; } + } - if (!found) - qd = NULL; + return NULL; +} - if (!qd && new_qd) { - qd = new_qd; - list_add(&qd->qd_list, &sdp->sd_quota_list); - atomic_inc(&sdp->sd_quota_count); - new_qd = NULL; - } - spin_unlock(&qd_lock); +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd, *new_qd; + unsigned int hash = gfs2_qd_hash(sdp, qid); - if (qd) { - if (new_qd) { - gfs2_glock_put(new_qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, new_qd); - } - *qdp = qd; - return 0; - } + rcu_read_lock(); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + rcu_read_unlock(); - error = qd_alloc(sdp, qid, &new_qd); - if (error) - return error; + if (qd) + return 0; + + new_qd = qd_alloc(hash, sdp, qid); + if (!new_qd) + return -ENOMEM; + + spin_lock(&qd_lock); + spin_lock_bucket(hash); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + if (qd == NULL) { + *qdp = new_qd; + list_add(&new_qd->qd_list, &sdp->sd_quota_list); + hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); + atomic_inc(&sdp->sd_quota_count); + } + spin_unlock_bucket(hash); + spin_unlock(&qd_lock); + + if (qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); } + + return 0; } + static void qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; @@ -1215,6 +1268,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; + unsigned int hash; u64 dblock; u32 extlen = 0; int error; @@ -1272,8 +1326,9 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) if (!qc_change) continue; - error = qd_alloc(sdp, qc_id, &qd); - if (error) { + hash = gfs2_qd_hash(sdp, qc_id); + qd = qd_alloc(hash, sdp, qc_id); + if (qd == NULL) { brelse(bh); goto fail; } @@ -1289,6 +1344,10 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) atomic_inc(&sdp->sd_quota_count); spin_unlock(&qd_lock); + 
spin_lock_bucket(hash); + hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); + spin_unlock_bucket(hash); + found++; } @@ -1335,11 +1394,16 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) spin_unlock(&qd->qd_lockref.lock); list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ list_lru_del(&gfs2_qd_lru, &qd->qd_lru); atomic_dec(&sdp->sd_quota_count); spin_unlock(&qd_lock); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + if (!qd->qd_lockref.count) { gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); @@ -1348,7 +1412,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_glock_put(qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); spin_lock(&qd_lock); } @@ -1643,3 +1707,11 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; + +void __init gfs2_quota_hash_init(void) +{ + unsigned i; + + for(i = 0; i < GFS2_QD_HASH_SIZE; i++) + INIT_HLIST_BL_HEAD(&qd_hash_table[i]); +} diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 96e4f34a03b..55d506eb3c4 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) extern const struct quotactl_ops gfs2_quotactl_ops; extern struct shrinker gfs2_qd_shrinker; extern struct list_lru gfs2_qd_lru; +extern void __init gfs2_quota_hash_init(void); #endif /* __QUOTA_DOT_H__ */ -- cgit v1.2.3-70-g09d2 From 8ad151c2ac9aa106cb903cfd838b31561dbd7bcc Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 12 Dec 2013 11:34:09 +0000 Subject: GFS2: Only run logd and quota when mounted read/write While investigating a rather strange bit of code in the quota clean up function, I spotted that the reason for its existence was that when remounting read only, we were not stopping the quotad thread, and thus it was possible for it to still have a reference to some of the quotas in that case. This patch moves the logd and quota thread start and stop into the make_fs_rw/ro functions, so that we now stop those threads when mounted read only. This means that quotad will always be stopped before we call the quota clean up function, and we can thus dispose of the (rather hackish) code that waits for it to give up its reference on the quotas. 
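Put differently, the fix trades reference counting for ordering: if quotad is guaranteed to be stopped before gfs2_quota_cleanup() runs, cleanup can no longer race with it. A rough sketch of that pattern is below; the names are illustrative, not the gfs2 functions (the real init_threads()/kthread_stop() calls are in the patch that follows).

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

struct example_sb {
	struct task_struct *worker;	/* periodic background thread */
};

static int example_workerfn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* periodic work here */
	return 0;
}

/* Started only when the filesystem goes read/write... */
static int example_make_rw(struct example_sb *s)
{
	s->worker = kthread_run(example_workerfn, s, "example_worker");
	if (IS_ERR(s->worker))
		return PTR_ERR(s->worker);
	return 0;
}

/* ...and stopped before any read-only teardown, so the teardown code
 * never has to wait for the worker to drop references it may hold. */
static void example_make_ro(struct example_sb *s)
{
	kthread_stop(s->worker);
	/* cache/quota teardown can now proceed without the old
	 * "wait for the refcount to drop" loop */
}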
Signed-off-by: Steven Whitehouse Cc: Abhijith Das --- fs/gfs2/ops_fstype.c | 42 +----------------------------------------- fs/gfs2/quota.c | 24 ++---------------------- fs/gfs2/super.c | 43 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 41 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3b820b672d1..27648e803d1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -969,40 +969,6 @@ fail: return error; } -static int init_threads(struct gfs2_sbd *sdp, int undo) -{ - struct task_struct *p; - int error = 0; - - if (undo) - goto fail_quotad; - - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; - } - sdp->sd_logd_process = p; - - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; - } - sdp->sd_quotad_process = p; - - return 0; - - -fail_quotad: - kthread_stop(sdp->sd_quotad_process); -fail: - kthread_stop(sdp->sd_logd_process); - return error; -} - static const match_table_t nolock_tokens = { { Opt_jid, "jid=%d\n", }, { Opt_err, NULL }, @@ -1267,15 +1233,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_per_node; } - error = init_threads(sdp, DO); - if (error) - goto fail_per_node; - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_threads; + goto fail_per_node; } } @@ -1283,8 +1245,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent gfs2_online_uevent(sdp); return 0; -fail_threads: - init_threads(sdp, UNDO); fail_per_node: init_per_node(sdp, UNDO); fail_inodes: diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a1df01d381a..3287d987150 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1376,23 +1376,6 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) while (!list_empty(head)) { qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - /* - * To be removed in due course... 
we should be able to - * ensure that all refs to the qd have done by this point - * so that this rather odd test is not required - */ - spin_lock(&qd->qd_lockref.lock); - if (qd->qd_lockref.count > 1 || - (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { - spin_unlock(&qd->qd_lockref.lock); - list_move(&qd->qd_list, head); - spin_unlock(&qd_lock); - schedule(); - spin_lock(&qd_lock); - continue; - } - spin_unlock(&qd->qd_lockref.lock); - list_del(&qd->qd_list); /* Also remove if this qd exists in the reclaim list */ @@ -1404,11 +1387,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) hlist_bl_del_rcu(&qd->qd_hlist); spin_unlock_bucket(qd->qd_hash); - if (!qd->qd_lockref.count) { - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - } else - gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_glock_put(qd->qd_gl); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 35da5b19c0d..60f60f6181f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) return 0; } +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + return 0; + +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + /** * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one * @sdp: the filesystem @@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) struct gfs2_log_header_host head; int error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + error = init_threads(sdp); if (error) return error; + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + if (error) + goto fail_threads; + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) fail: t_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&t_gh); - +fail_threads: + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); return error; } @@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) struct gfs2_holder t_gh; int error; + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + flush_workqueue(gfs2_delete_workqueue); gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -857,9 +893,6 @@ restart: } spin_unlock(&sdp->sd_jindex_spin); - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); if (error) -- cgit v1.2.3-70-g09d2 From ee2411a8db49a21bc55dc124e1b434ba194c8903 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 12 Dec 2013 17:29:32 +0000 Subject: GFS2: Clean up quota slot allocation Quota slot allocation has historically used a vector of pages and a set of homegrown find/test/set/clear bit functions. Since the size of the bitmap is likely to be based on the default qc file size, thats a couple of pages at most. 
So we ought to be able to allocate that as a single chunk, with a vmalloc fallback, just in case of memory fragmentation. We are then able to use the kernel's own find/test/set/clear bit functions, rather than rolling our own. Signed-off-by: Steven Whitehouse Cc: Abhijith Das --- fs/gfs2/incore.h | 3 +- fs/gfs2/quota.c | 100 ++++++++++++++++--------------------------------------- 2 files changed, 30 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 59d99ec9d87..4b9aa5b6908 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -733,8 +733,7 @@ struct gfs2_sbd { spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; - unsigned int sd_quota_chunks; - unsigned char **sd_quota_bitmap; + unsigned long *sd_quota_bitmap; u64 sd_quota_sync_gen; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3287d987150..79be67ab860 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -315,50 +315,30 @@ static void qd_put(struct gfs2_quota_data *qd) static int slot_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - unsigned int c, o = 0, b; - unsigned char byte = 0; + struct gfs2_sbd *sdp = qd->qd_sbd; + unsigned int bit; + int error = 0; spin_lock(&qd_lock); + if (qd->qd_slot_count != 0) + goto out; - if (qd->qd_slot_count++) { - spin_unlock(&qd_lock); - return 0; + error = -ENOSPC; + bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); + if (bit < sdp->sd_quota_slots) { + set_bit(bit, sdp->sd_quota_bitmap); + qd->qd_slot = bit; +out: + qd->qd_slot_count++; } - - for (c = 0; c < sdp->sd_quota_chunks; c++) - for (o = 0; o < PAGE_SIZE; o++) { - byte = sdp->sd_quota_bitmap[c][o]; - if (byte != 0xFF) - goto found; - } - - goto fail; - -found: - for (b = 0; b < 8; b++) - if (!(byte & (1 << b))) - break; - qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; - - if (qd->qd_slot >= sdp->sd_quota_slots) - goto fail; - - sdp->sd_quota_bitmap[c][o] |= 1 << b; - spin_unlock(&qd_lock); - return 0; - -fail: - qd->qd_slot_count--; - spin_unlock(&qd_lock); - return -ENOSPC; + return error; } static void slot_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&qd_lock); gfs2_assert(sdp, qd->qd_slot_count); @@ -366,34 +346,14 @@ static void slot_hold(struct gfs2_quota_data *qd) spin_unlock(&qd_lock); } -static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value) -{ - unsigned int c, o, b = bit; - int old_value; - - c = b / (8 * PAGE_SIZE); - b %= 8 * PAGE_SIZE; - o = b / 8; - b %= 8; - - old_value = (bitmap[c][o] & (1 << b)); - gfs2_assert_withdraw(sdp, !old_value != !new_value); - - if (new_value) - bitmap[c][o] |= 1 << b; - else - bitmap[c][o] &= ~(1 << b); -} - static void slot_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&qd_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } spin_unlock(&qd_lock); @@ -1269,6 +1229,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) unsigned int x, slot = 0; unsigned int found = 0; unsigned int hash; + unsigned int bm_size; u64 dblock; u32 extlen = 0; int error; @@ -1277,20 +1238,16 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) return -EIO; sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; - sdp->sd_quota_chunks = 
DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); - + bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); + bm_size *= sizeof(unsigned long); error = -ENOMEM; - - sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, - sizeof(unsigned char *), GFP_NOFS); + sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); + if (sdp->sd_quota_bitmap == NULL) + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); if (!sdp->sd_quota_bitmap) return error; - for (x = 0; x < sdp->sd_quota_chunks; x++) { - sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); - if (!sdp->sd_quota_bitmap[x]) - goto fail; - } + memset(sdp->sd_quota_bitmap, 0, bm_size); for (x = 0; x < blocks; x++) { struct buffer_head *bh; @@ -1339,7 +1296,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) qd->qd_slot_count = 1; spin_lock(&qd_lock); - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); spin_unlock(&qd_lock); @@ -1370,7 +1327,6 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { struct list_head *head = &sdp->sd_quota_list; struct gfs2_quota_data *qd; - unsigned int x; spin_lock(&qd_lock); while (!list_empty(head)) { @@ -1401,9 +1357,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); if (sdp->sd_quota_bitmap) { - for (x = 0; x < sdp->sd_quota_chunks; x++) - kfree(sdp->sd_quota_bitmap[x]); - kfree(sdp->sd_quota_bitmap); + if (is_vmalloc_addr(sdp->sd_quota_bitmap)) + vfree(sdp->sd_quota_bitmap); + else + kfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; } } -- cgit v1.2.3-70-g09d2 From 2d9e72303d538024627fb1fe2cbde48aec12acc0 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 13 Dec 2013 11:46:28 +0000 Subject: GFS2: Move quota bitmap operations under their own lock Gradually, the global qd_lock is being used for less and less. After this patch it will only be used for the per super block list whose purpose is to allow syncing of changes back to the master quota file from the local quota changes file. Fixing up that process to make it more efficient will be the subject of a later patch, however this patch removes another barrier to doing that. 
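Combined with the previous patch, slot allocation now boils down to standard bitmap operations guarded by a dedicated spinlock. Stripped of the gfs2 specifics, the resulting shape is roughly as follows; this is an illustrative sketch with made-up names, not the code in the patch below.

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(slot_lock);	/* plays the role of sd_bitmap_lock */
static unsigned long *slot_bitmap;	/* one bit per quota slot */
static unsigned int slot_count;

/* Returns a free slot index, or -ENOSPC if the bitmap is full. */
static int slot_alloc(void)
{
	unsigned int bit;
	int ret = -ENOSPC;

	spin_lock(&slot_lock);
	bit = find_first_zero_bit(slot_bitmap, slot_count);
	if (bit < slot_count) {
		set_bit(bit, slot_bitmap);
		ret = bit;
	}
	spin_unlock(&slot_lock);
	return ret;
}

static void slot_release(unsigned int bit)
{
	spin_lock(&slot_lock);
	clear_bit(bit, slot_bitmap);
	spin_unlock(&slot_lock);
}

Because the bitmap has its own lock, slot_get()/slot_hold()/slot_put() no longer need qd_lock at all, which is what shrinks qd_lock's role to protecting the per-super-block list.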
Signed-off-by: Steven Whitehouse Cc: Abhijith Das --- fs/gfs2/incore.h | 1 + fs/gfs2/ops_fstype.c | 1 + fs/gfs2/quota.c | 16 ++++++++-------- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4b9aa5b6908..8c64e268b7e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -734,6 +734,7 @@ struct gfs2_sbd { unsigned int sd_quota_slots; unsigned long *sd_quota_bitmap; + spinlock_t sd_bitmap_lock; u64 sd_quota_sync_gen; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 27648e803d1..06a66c9624e 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_quota_wait); INIT_LIST_HEAD(&sdp->sd_trunc_list); spin_lock_init(&sdp->sd_trunc_lock); + spin_lock_init(&sdp->sd_bitmap_lock); mapping = &sdp->sd_aspace; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 79be67ab860..02a2740f246 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -76,6 +76,7 @@ #define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ +/* -> sd_bitmap_lock */ static DEFINE_SPINLOCK(qd_lock); struct list_lru gfs2_qd_lru; @@ -319,7 +320,7 @@ static int slot_get(struct gfs2_quota_data *qd) unsigned int bit; int error = 0; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); if (qd->qd_slot_count != 0) goto out; @@ -331,7 +332,7 @@ static int slot_get(struct gfs2_quota_data *qd) out: qd->qd_slot_count++; } - spin_unlock(&qd_lock); + spin_unlock(&sdp->sd_bitmap_lock); return error; } @@ -340,23 +341,23 @@ static void slot_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&qd_lock); + spin_unlock(&sdp->sd_bitmap_lock); } static void slot_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } - spin_unlock(&qd_lock); + spin_unlock(&sdp->sd_bitmap_lock); } static int bh_get(struct gfs2_quota_data *qd) @@ -434,8 +435,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, list_move_tail(&qd->qd_list, &sdp->sd_quota_list); set_bit(QDF_LOCKED, &qd->qd_flags); qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; + slot_hold(qd); return 1; } -- cgit v1.2.3-70-g09d2 From 1e3d36206bd6dfa34c85b073faba3d94ee6aba79 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 15 Jan 2014 12:57:25 +0000 Subject: GFS2: Fix kbuild test robot reported warning Well I don't get the same warning locally as the kbuild robot, but I guess this should fix the problem, anyway. 
Here is the warning: head: 2d9e72303d538024627fb1fe2cbde48aec12acc0 commit: ee2411a8db49a21bc55dc124e1b434ba194c8903 [19/20] GFS2: Clean up quota slot allocation config: make ARCH=powerpc allmodconfig All error/warnings: fs/gfs2/quota.c: In function 'gfs2_quota_init': >> fs/gfs2/quota.c:1246:3: error: implicit declaration of function '__vmalloc' [-Werror=implicit-function-declaration] sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); ^ >> fs/gfs2/quota.c:1246:24: warning: assignment makes pointer from integer without a cast [enabled by default] sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); ^ fs/gfs2/quota.c: In function 'gfs2_quota_cleanup': >> fs/gfs2/quota.c:1361:4: error: implicit declaration of function 'vfree' [-Werror=implicit-function-declaration] vfree(sdp->sd_quota_bitmap); Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 02a2740f246..8bec0e3192d 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" -- cgit v1.2.3-70-g09d2 From c33ec32692e1f2f4650f7bf5bb1108bb346b82a4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 16 Jan 2014 16:20:40 +0900 Subject: f2fs: avoid f2fs_balance_fs call during pageout This patch should resolve the following bug. ========================================================= [ INFO: possible irq lock inversion dependency detected ] 3.13.0-rc5.f2fs+ #6 Not tainted --------------------------------------------------------- kswapd0/41 just changed the state of lock: (&sbi->gc_mutex){+.+.-.}, at: [] f2fs_balance_fs+0xae/0xd0 [f2fs] but this lock took another, RECLAIM_FS-READ-unsafe lock in the past: (&sbi->cp_rwsem){++++.?} and interrupts could create inverse lock ordering between them. other info that might help us debug this: Chain exists of: &sbi->gc_mutex --> &sbi->cp_mutex --> &sbi->cp_rwsem Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sbi->cp_rwsem); local_irq_disable(); lock(&sbi->gc_mutex); lock(&sbi->cp_mutex); lock(&sbi->gc_mutex); *** DEADLOCK *** This bug is due to the f2fs_balance_fs call in f2fs_write_data_page. If f2fs_write_data_page is triggered by wbc->for_reclaim via kswapd, it should not call f2fs_balance_fs which tries to get a mutex grabbed by original syscall flow. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 19ad0669447..e57bde02e37 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -842,8 +842,10 @@ write: else if (err) goto redirty_out; - if (wbc->for_reclaim) + if (wbc->for_reclaim) { f2fs_submit_merged_bio(sbi, DATA, WRITE); + need_balance_fs = false; + } clear_cold_data(page); out: -- cgit v1.2.3-70-g09d2 From c434cbc0edda6a7a59d22b9d5d4279989d0ab804 Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Thu, 16 Jan 2014 11:58:54 +0900 Subject: f2fs: missing REQ_META and REQ_PRIO when sync_meta_pages(META_FLUSH) Doing sync_meta_pages with META_FLUSH when checkpoint, we overide rw using WRITE_FLUSH_FUA. At this time, we also should set REQ_META|REQ_PRIO. 
Signed-off-by: Changman Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e57bde02e37..bda889e2ca6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -144,7 +144,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.rw = WRITE_FLUSH_FUA; + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); mutex_unlock(&io->io_mutex); -- cgit v1.2.3-70-g09d2 From ac3beb6a5de048e7c0676f630ad2048a7b37b305 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 16 Jan 2014 10:31:13 +0000 Subject: GFS2: Don't use ENOBUFS when ENOMEM is the correct error code Al Viro has tactfully pointed out that we are using the incorrect error code in some cases. This patch fixes that, and also removes the (unused) return value for glock dumping. > * gfs2_iget() - ENOBUFS instead of ENOMEM. ENOBUFS is > "No buffer space available (POSIX.1 (XSI STREAMS option))" and since > we don't support STREAMS it's probably fair game, but... what the hell? Signed-off-by: Steven Whitehouse Cc: Al Viro --- fs/gfs2/glock.c | 29 ++++++++++------------------- fs/gfs2/glock.h | 2 +- fs/gfs2/glops.c | 6 ++---- fs/gfs2/incore.h | 2 +- fs/gfs2/inode.c | 2 +- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/rgrp.c | 5 ++--- fs/gfs2/rgrp.h | 2 +- 8 files changed, 19 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6f7a47c0525..ca0be6c69a2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) glock_hash_walk(thaw_glock, sdp); } -static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { - int ret; spin_lock(&gl->gl_spin); - ret = gfs2_dump_glock(seq, gl); + gfs2_dump_glock(seq, gl); spin_unlock(&gl->gl_spin); - return ret; } static void dump_glock_func(struct gfs2_glock *gl) @@ -1647,10 +1645,9 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) * @seq: the seq_file struct * @gh: the glock holder * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) +static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; char flags_buf[32]; @@ -1666,7 +1663,6 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) gh_owner ? gh_owner->comm : "(ended)", (void *)gh->gh_ip); rcu_read_unlock(); - return 0; } static const char *gflags2str(char *buf, const struct gfs2_glock *gl) @@ -1721,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) * example. The field's are n = number (id of the object), f = flags, * t = type, s = state, r = refcount, e = error, p = pid. 
* - * Returns: 0 on success, -ENOBUFS when we run out of space */ -int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; const struct gfs2_holder *gh; char gflags_buf[32]; - int error = 0; dtime = jiffies - gl->gl_demote_time; dtime *= 1000000/HZ; /* demote time in uSec */ @@ -1747,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) atomic_read(&gl->gl_revokes), (int)gl->gl_lockref.count, gl->gl_hold_time); - list_for_each_entry(gh, &gl->gl_holders, gh_list) { - error = dump_holder(seq, gh); - if (error) - goto out; - } + list_for_each_entry(gh, &gl->gl_holders, gh_list) + dump_holder(seq, gh); + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) - error = glops->go_dump(seq, gl); -out: - return error; + glops->go_dump(seq, gl); } static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1953,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { - return dump_glock(seq, iter_ptr); + dump_glock(seq, iter_ptr); + return 0; } static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 6647d77366b..32572f71f02 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index b792dbcc83f..3bf0631b5d5 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -437,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh) * @seq: The iterator * @ip: the inode * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) - return 0; + return; gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, (unsigned long long)i_size_read(&ip->i_inode)); - return 0; } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8c64e268b7e..cf0e34400f7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -218,7 +218,7 @@ struct gfs2_glock_operations { int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 51cf10df83e..b8956f24e3d 100644 --- 
a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, ip = GFS2_I(inode); if (!inode) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 06a66c9624e..1e712b566d7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -231,7 +231,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) page = alloc_page(GFP_NOFS); if (unlikely(!page)) - return -ENOBUFS; + return -ENOMEM; ClearPageUptodate(page); ClearPageDirty(page); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 183cf0f0052..e14e8877db9 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2120,14 +2120,14 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, * */ -int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) { struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; const struct rb_node *n; if (rgd == NULL) - return 0; + return; gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, @@ -2138,7 +2138,6 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) dump_rs(seq, trs); } spin_unlock(&rgd->rd_rsspin); - return 0; } static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3a10d2ffbbe..463ab2e95d1 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); -- cgit v1.2.3-70-g09d2 From 8b127d0494890ced4e35fb8eca6d6b57564ee1c6 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 16 Jan 2014 08:52:16 -0500 Subject: GFS2: Small cleanup This is a small cleanup to function gfs2_rgrp_go_lock so that it uses rgd instead of its more complicated twin. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index e14e8877db9..a1da2134923 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1199,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) return 0; - return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); + return gfs2_rgrp_bh_get(rgd); } /** -- cgit v1.2.3-70-g09d2 From fc12c80aa57ee90385dc90e4263ec1a66200ba76 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Jan 2014 17:42:53 -0500 Subject: ceph: trivial comment fix "disconnected" is too easily confused with "DCACHE_DISCONNECTED". I think "unhashed" is the more precise term here. Signed-off-by: J. 
Bruce Fields Reviewed-by: Sage Weil --- fs/ceph/caps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9289c6b2f1b..80dad0d491a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2350,11 +2350,11 @@ static void invalidate_aliases(struct inode *inode) d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns - * connected dentry. After calling d_invalidate(), the - * dentry become disconnected. + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. * * For directory inode, d_find_alias() can return - * disconnected dentry. But directory inode should have + * unhashed dentry. But directory inode should have * one alias at most. */ while ((dn = d_find_alias(inode))) { -- cgit v1.2.3-70-g09d2 From db4aad209bc9aefd91f0a9aeb9e37364088b39ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Jan 2014 09:58:25 -0500 Subject: kernfs: associate a new kernfs_node with its parent on creation Once created, a kernfs_node is always destroyed by kernfs_put(). Since ba7443bc656e ("sysfs, kernfs: implement kernfs_create/destroy_root()"), kernfs_put() depends on kernfs_root() to locate the ino_ida. kernfs_root() in turn depends on kernfs_node->parent being set for !dir nodes. This means that kernfs_put() of a !dir node requires its ->parent to be initialized. This leads to oops when a newly created !dir node is destroyed without going through kernfs_add_one() or after failing kernfs_add_one() before ->parent is set. kernfs_root() invoked from kernfs_put() will try to dereference NULL parent. Fix it by moving parent association to kernfs_new_node() from kernfs_add_one(). kernfs_new_node() now takes @parent instead of @root and determines the root from the parent and also sets the new node's parent properly. @parent parameter is removed from kernfs_add_one(). As there's no parent when creating the root node, __kernfs_new_node() which takes @root as before and doesn't set the parent is used in that case. This ensures that a kernfs_node in any stage in its life has its parent associated and thus can be put. 
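The upshot for callers is uniform error handling: once kernfs_new_node() has returned a node, kernfs_put() is always safe, whether or not the subsequent kernfs_add_one() succeeds. A sketch of the resulting caller pattern, condensed from the directory-creation path in the patch below (example_create_dir is a hypothetical name):

/* kernfs_put() can be called at any point after kernfs_new_node()
 * because kn->parent is already valid. */
static struct kernfs_node *example_create_dir(struct kernfs_node *parent,
					      const char *name, void *priv)
{
	struct kernfs_addrm_cxt acxt;
	struct kernfs_node *kn;
	int rc;

	kn = kernfs_new_node(parent, name, S_IFDIR | S_IRUGO | S_IXUGO,
			     KERNFS_DIR);
	if (!kn)
		return ERR_PTR(-ENOMEM);

	kn->priv = priv;

	kernfs_addrm_start(&acxt);
	rc = kernfs_add_one(&acxt, kn);	/* parent no longer passed separately */
	kernfs_addrm_finish(&acxt);

	if (rc) {
		kernfs_put(kn);		/* safe even though the add failed */
		return ERR_PTR(rc);
	}
	return kn;
}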
Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 40 ++++++++++++++++++++++++++-------------- fs/kernfs/file.c | 5 ++--- fs/kernfs/kernfs-internal.h | 8 ++++---- fs/kernfs/symlink.c | 5 ++--- 4 files changed, 34 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 510b5062ef3..5104cf5d25c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -324,8 +324,9 @@ const struct dentry_operations kernfs_dops = { .d_release = kernfs_dop_release, }; -struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, - umode_t mode, unsigned flags) +static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + const char *name, umode_t mode, + unsigned flags) { char *dup_name = NULL; struct kernfs_node *kn; @@ -362,6 +363,20 @@ struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, return NULL; } +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags) +{ + struct kernfs_node *kn; + + kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + if (kn) { + kernfs_get(parent); + kn->parent = parent; + } + return kn; +} + /** * kernfs_addrm_start - prepare for kernfs_node add/remove * @acxt: pointer to kernfs_addrm_cxt to be used @@ -386,11 +401,10 @@ void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) * kernfs_add_one - add kernfs_node to parent without warning * @acxt: addrm context to use * @kn: kernfs_node to be added - * @parent: the parent kernfs_node to add @kn to * - * Get @parent and set @kn->parent to it and increment nlink of the - * parent inode if @kn is a directory and link into the children list - * of the parent. + * The caller must already have initialized @kn->parent. This + * function increments nlink of the parent's inode if @kn is a + * directory and link into the children list of the parent. * * This function should be called between calls to * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed @@ -403,9 +417,9 @@ void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) * 0 on success, -EEXIST if entry with the given name already * exists. 
*/ -int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, - struct kernfs_node *parent) +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) { + struct kernfs_node *parent = kn->parent; bool has_ns = kernfs_ns_enabled(parent); struct kernfs_iattrs *ps_iattr; int ret; @@ -423,8 +437,6 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, return -ENOENT; kn->hash = kernfs_name_hash(kn->name, kn->ns); - kn->parent = parent; - kernfs_get(parent); ret = kernfs_link_sibling(kn); if (ret) @@ -600,7 +612,8 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) ida_init(&root->ino_ida); - kn = kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); + kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + KERNFS_DIR); if (!kn) { ida_destroy(&root->ino_ida); kfree(root); @@ -648,8 +661,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, int rc; /* allocate */ - kn = kernfs_new_node(kernfs_root(parent), name, mode | S_IFDIR, - KERNFS_DIR); + kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); @@ -659,7 +671,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, /* link in */ kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn, parent); + rc = kernfs_add_one(&acxt, kn); kernfs_addrm_finish(&acxt); if (!rc) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index bdd38854ef6..dbf397bfdff 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -829,8 +829,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, if (name_is_static) flags |= KERNFS_STATIC_NAME; - kn = kernfs_new_node(kernfs_root(parent), name, - (mode & S_IALLUGO) | S_IFREG, flags); + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); if (!kn) return ERR_PTR(-ENOMEM); @@ -857,7 +856,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, kn->flags |= KERNFS_HAS_MMAP; kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn, parent); + rc = kernfs_add_one(&acxt, kn); kernfs_addrm_finish(&acxt); if (rc) { diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index c6ba5bc37a9..eb536b76374 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -101,11 +101,11 @@ extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); -int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn, - struct kernfs_node *parent); +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn); void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); -struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name, - umode_t mode, unsigned flags); +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags); /* * file.c diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index a03e26036ef..4d457055acb 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -30,8 +30,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, struct kernfs_addrm_cxt acxt; int error; - kn = kernfs_new_node(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO, - KERNFS_LINK); + kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); @@ -41,7 +40,7 @@ struct kernfs_node 
*kernfs_create_link(struct kernfs_node *parent, kernfs_get(target); /* ref owned by symlink */ kernfs_addrm_start(&acxt); - error = kernfs_add_one(&acxt, kn, parent); + error = kernfs_add_one(&acxt, kn); kernfs_addrm_finish(&acxt); if (!error) -- cgit v1.2.3-70-g09d2 From 263b4509ec4d47e0da3e753f85a39ea12d1eff24 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 17 Jan 2014 15:12:05 -0500 Subject: nfs: always make sure page is up-to-date before extending a write to cover the entire page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should always make sure the cached page is up-to-date when we're determining whether we can extend a write to cover the full page -- even if we've received a write delegation from the server. Commit c7559663 added logic to skip this check if we have a write delegation, which can lead to data corruption such as the following scenario if client B receives a write delegation from the NFS server: Client A: # echo 123456789 > /mnt/file Client B: # echo abcdefghi >> /mnt/file # cat /mnt/file 0�D0�abcdefghi Just because we hold a write delegation doesn't mean that we've read in the entire page contents. Cc: # v3.11+ Signed-off-by: Scott Mayhew Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 77a00c66c7d..a44a87268a6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -922,19 +922,20 @@ out: * extend the write to cover the entire page in order to avoid fragmentation * inefficiencies. * - * If the file is opened for synchronous writes or if we have a write delegation - * from the server then we can just skip the rest of the checks. + * If the file is opened for synchronous writes then we can just skip the rest + * of the checks. */ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) { if (file->f_flags & O_DSYNC) return 0; + if (!nfs_write_pageuptodate(page, inode)) + return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; - if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL || - (inode->i_flock->fl_start == 0 && + if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && inode->i_flock->fl_end == OFFSET_MAX && - inode->i_flock->fl_type != F_RDLCK))) + inode->i_flock->fl_type != F_RDLCK)) return 1; return 0; } -- cgit v1.2.3-70-g09d2 From d57b9c9a999a8f4475fe73fba629c964245800ca Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Jan 2014 13:51:07 -0500 Subject: GFS2: revert "GFS2: d_splice_alias() can't return error" 0d0d110720d7960b77c03c9f2597faaff4b484ae asserts that "d_splice_alias() can't return error unless it was given an IS_ERR(inode)". That was true of the implementation of d_splice_alias, but this is really a problem with d_splice_alias: at a minimum it should be able to return -ELOOP in the case where inserting the given dentry would cause a directory loop. Signed-off-by: J. 
Bruce Fields Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b8956f24e3d..890588c7fb3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -604,6 +604,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = PTR_ERR(inode); if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); + error = PTR_ERR(d); + if (IS_ERR(d)) + goto fail_gunlock; error = 0; if (file) { if (S_ISREG(inode->i_mode)) { @@ -799,6 +802,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, } d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + iput(inode); + gfs2_glock_dq_uninit(&gh); + return d; + } if (file && S_ISREG(inode->i_mode)) error = finish_open(file, dentry, gfs2_open_common, opened); -- cgit v1.2.3-70-g09d2 From 64590daa9e0dfb3aad89e3ab9230683b76211d5b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Jan 2014 17:03:41 -0500 Subject: NFSv4.1: Handle errors correctly in nfs41_walk_client_list Both nfs41_walk_client_list and nfs40_walk_client_list expect the 'status' variable to be set to the value -NFS4ERR_STALE_CLIENTID if the loop fails to find a match. The problem is that the 'pos->cl_cons_state > NFS_CS_READY' changes the value of 'status', and sets it either to the value '0' (which indicates success), or to the value EINTR. Cc: stable@vger.kernel.org # 3.7.x: 7b1f1fd1842e6: NFSv4/4.1: Fix bugs in Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index c1b7a80b670..06e770ace07 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -498,9 +498,10 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = pos; status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + goto out; + status = -NFS4ERR_STALE_CLIENTID; + spin_lock(&nn->nfs_client_lock); } if (pos->cl_cons_state != NFS_CS_READY) continue; @@ -638,7 +639,8 @@ int nfs41_walk_client_list(struct nfs_client *new, } spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + break; + status = -NFS4ERR_STALE_CLIENTID; } if (pos->cl_cons_state != NFS_CS_READY) continue; -- cgit v1.2.3-70-g09d2 From 6c311ec6c2d9e015d454b4e3fda8008b5bebf316 Mon Sep 17 00:00:00 2001 From: Chris Fries Date: Fri, 17 Jan 2014 14:44:39 -0600 Subject: f2fs: clean checkpatch warnings Fixed a variety of trivial checkpatch warnings. The only delta should be some minor formatting on log strings that were split / too long. Signed-off-by: Chris Fries Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 2 +- fs/f2fs/dir.c | 5 +++-- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/file.c | 2 +- fs/f2fs/inode.c | 12 ++++++++---- fs/f2fs/node.c | 2 +- fs/f2fs/recovery.c | 12 ++++++------ fs/f2fs/segment.c | 6 ++++-- fs/f2fs/segment.h | 13 +++++++------ fs/f2fs/super.c | 8 +++++--- 11 files changed, 40 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bda889e2ca6..f5fac16bc6e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -792,7 +792,7 @@ static int f2fs_write_data_page(struct page *page, int err = 0; struct f2fs_io_info fio = { .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC: WRITE, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE, }; if (page->index < end_index) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6e4ac9a25df..63cb7e215e0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -245,7 +245,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d\n", si->node_blks); seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); - seq_printf(s, "\nBalancing F2FS Async:\n"); + seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f815ca0c581..cd055b6fcff 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -430,7 +430,8 @@ next: * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode) { unsigned int bit_pos; unsigned int level; @@ -631,7 +632,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); - for ( ; n < npages; n++) { + for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n); if (IS_ERR(dentry_page)) continue; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ee304fb71e1..5ab3981938d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -607,9 +607,9 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) static inline int F2FS_HAS_BLOCKS(struct inode *inode) { if (F2FS_I(inode)->i_xattr_nid) - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; else - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; } static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, @@ -1231,7 +1231,7 @@ struct f2fs_stat_info { static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) { - return (struct f2fs_stat_info*)sbi->stat_info; + return (struct f2fs_stat_info *)sbi->stat_info; } #define stat_inc_call_count(si) ((si)->call_count++) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 14511b00fff..85e91ca88d5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -202,7 +202,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + ofs; - for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { + for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); if (blkaddr == NULL_ADDR) continue; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ffa4c6d0fab..4d67ed736dc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -42,9 +42,11 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { if (ri->i_addr[0]) - inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); + inode->i_rdev = + old_decode_dev(le32_to_cpu(ri->i_addr[0])); else - inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = + new_decode_dev(le32_to_cpu(ri->i_addr[1])); } } @@ -52,11 +54,13 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = 
cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); ri->i_addr[1] = 0; } else { ri->i_addr[0] = 0; - ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); ri->i_addr[2] = 0; } } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b8c9301db52..226a05a27e3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1196,7 +1196,7 @@ static int f2fs_write_node_page(struct page *page, struct node_info ni; struct f2fs_io_info fio = { .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC: WRITE, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, }; if (unlikely(sbi->por_doing)) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 655791e518c..976a7a934db 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -95,9 +95,9 @@ out_unmap_put: kunmap(page); f2fs_put_page(page, 0); out: - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " - "ino = %x, name = %s, dir = %lx, err = %d", - ino_of_node(ipage), raw_inode->i_name, + f2fs_msg(inode->i_sb, KERN_NOTICE, + "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ipage), raw_inode->i_name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } @@ -366,9 +366,9 @@ err: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); out: - f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " - "recovered_data = %d blocks, err = %d", - inode->i_ino, recovered, err); + f2fs_msg(sbi->sb, KERN_NOTICE, + "recover_data: ino = %lx, recovered = %d blocks, err = %d", + inode->i_ino, recovered, err); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a934e6f2738..e82423fbcb9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -946,7 +946,8 @@ void write_data_page(struct page *page, struct dnode_of_data *dn, do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); } -void rewrite_data_page(struct page *page, block_t old_blkaddr, struct f2fs_io_info *fio) +void rewrite_data_page(struct page *page, block_t old_blkaddr, + struct f2fs_io_info *fio) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -1647,7 +1648,8 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) mutex_lock(&curseg->curseg_mutex); for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) == start) { + if (le32_to_cpu(segno_in_journal(sum, i)) + == start) { sit = sit_in_journal(sum, i); mutex_unlock(&curseg->curseg_mutex); goto got_it; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e9a10bdd7fe..5731682d751 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -448,8 +448,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return ((prefree_segments(sbi) / sbi->segs_per_sec) - + free_sections(sbi) < overprovision_sections(sbi)); + return (prefree_segments(sbi) / sbi->segs_per_sec) + + free_sections(sbi) < overprovision_sections(sbi); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -460,18 +460,19 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) if (unlikely(sbi->por_doing)) return false; - return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))); + return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi)); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) { - return 
(prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments); + return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { - return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); } /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b070f3010ce..1a85f83abd5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -535,7 +535,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); int i; for (i = 0; i < total_segs; i++) { @@ -816,8 +817,9 @@ retry: /* sanity checking of raw super */ if (sanity_check_raw_super(sb, *raw_super)) { brelse(*raw_super_buf); - f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " - "in %dth superblock", block + 1); + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); if (block == 0) { block++; goto retry; -- cgit v1.2.3-70-g09d2 From abf9767c823bf57837c2032f21332a6efc38a13e Mon Sep 17 00:00:00 2001 From: Christian Engelmayer Date: Sat, 11 Jan 2014 01:57:22 +0100 Subject: cifs: Fix memory leak in cifs_hardlink() Fix a potential memory leak in the cifs_hardlink() error handling path. Detected by Coverity: CID 728510, CID 728511. Signed-off-by: Christian Engelmayer Signed-off-by: Steve French --- fs/cifs/link.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 92aee08483a..28bc8ee9705 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -438,8 +438,10 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, CIFS_MOUNT_MAP_SPECIAL_CHR); else { server = tcon->ses->server; - if (!server->ops->create_hardlink) - return -ENOSYS; + if (!server->ops->create_hardlink) { + rc = -ENOSYS; + goto cifs_hl_exit; + } rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, cifs_sb); if ((rc == -EIO) || (rc == -EINVAL)) -- cgit v1.2.3-70-g09d2 From b5be1a1c4c57a092cb60c709a0491d4ecead3d58 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 25 Nov 2013 17:09:49 +0000 Subject: cifs: Rename and cleanup open_query_close_cifs_symlink() Rename open_query_close_cifs_symlink to cifs_query_mf_symlink() to make the name more consistent with other protocol version specific functions. We also pass tcon as an argument to the function. This is already available in the calling functions and we can avoid having to make an unnecessary lookup. 
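In caller terms the change looks like this (a sketch assembled from the hunks below, not additional code in the patch):

	/* before: the helper had to look the tcon up from the superblock */
	int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
					  unsigned int *pbytes_read,
					  struct cifs_sb_info *cifs_sb,
					  unsigned int xid);

	/* after: a caller such as CIFSCheckMFSymlink() passes the tcon it
	 * already holds, via the per-protocol ops table */
	rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, cifs_sb,
						      path, buf, &bytes_read);
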
Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 5 +++-- fs/cifs/cifsproto.h | 7 ++++--- fs/cifs/link.c | 37 ++++++++++++------------------------- fs/cifs/smb1ops.c | 2 +- 4 files changed, 20 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f918a998a08..fba4d1341f8 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -370,8 +370,9 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *); int (*generate_signingkey)(struct cifs_ses *); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); - int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *, - struct cifs_sb_info *, unsigned int); + int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const unsigned char *, + char *, unsigned int *); /* if we can do cache read operations */ bool (*is_read_op)(__u32); /* set oplock level for the inode */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 2c29db6a247..10b9ab1d1ae 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -496,7 +496,8 @@ void cifs_writev_complete(struct work_struct *work); struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete); void cifs_writedata_release(struct kref *refcount); -int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, - unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, - unsigned int xid); +int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_read); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 28bc8ee9705..c8e29682ccd 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -305,54 +305,41 @@ CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr) } int -open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, - unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, - unsigned int xid) +cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_read) { int rc; int oplock = 0; __u16 netfid = 0; - struct tcon_link *tlink; - struct cifs_tcon *ptcon; struct cifs_io_parms io_parms; int buf_type = CIFS_NO_BUFFER; FILE_ALL_INFO file_info; - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - ptcon = tlink_tcon(tlink); - - rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ, + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ, CREATE_NOT_DIR, &netfid, &oplock, &file_info, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) { - cifs_put_tlink(tlink); + if (rc) return rc; - } - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { - CIFSSMBClose(xid, ptcon, netfid); - cifs_put_tlink(tlink); + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) /* it's not a symlink */ - return rc; - } + goto out; io_parms.netfid = netfid; io_parms.pid = current->tgid; - io_parms.tcon = ptcon; + io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type); - CIFSSMBClose(xid, ptcon, netfid); - cifs_put_tlink(tlink); +out: + CIFSSMBClose(xid, tcon, netfid); return rc; } - int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct 
cifs_fattr *fattr, @@ -372,8 +359,8 @@ CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; if (tcon->ses->server->ops->query_mf_symlink) - rc = tcon->ses->server->ops->query_mf_symlink(path, buf, - &bytes_read, cifs_sb, xid); + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); else rc = -ENOSYS; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 5f5ba0dc2ee..099c2760225 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1009,7 +1009,7 @@ struct smb_version_operations smb1_operations = { .mand_lock = cifs_mand_lock, .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, - .query_mf_symlink = open_query_close_cifs_symlink, + .query_mf_symlink = cifs_query_mf_symlink, .is_read_op = cifs_is_read_op, }; -- cgit v1.2.3-70-g09d2 From cb084b1a9be34729bea23428c1a42f7d2f5defbc Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 25 Nov 2013 17:09:50 +0000 Subject: cifs: Rename MF symlink function names Clean up camel case in functionnames. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 4 ++-- fs/cifs/inode.c | 12 ++++++------ fs/cifs/link.c | 32 +++++++++++++++----------------- fs/cifs/readdir.c | 2 +- 4 files changed, 24 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 10b9ab1d1ae..78bb6d6ad06 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -476,8 +476,8 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); -extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); -extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, +extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr); +extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, const unsigned char *path); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 49719b8228e..6f7f57a3a46 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -383,10 +383,10 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, - full_path); + int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) - cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } if (*pinode == NULL) { @@ -800,10 +800,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, - full_path); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) - cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } if (!*inode) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index c8e29682ccd..500bc77a786 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -91,10 +91,8 @@ symlink_hash_err: } static int -CIFSParseMFSymlink(const u8 *buf, - unsigned int buf_len, - unsigned int *_link_len, - char **_link_str) +parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int 
*_link_len, + char **_link_str) { int rc; unsigned int link_len; @@ -137,7 +135,7 @@ CIFSParseMFSymlink(const u8 *buf, } static int -CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) +format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) { int rc; unsigned int link_len; @@ -181,7 +179,7 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) } static int -CIFSCreateMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, +create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, struct cifs_sb_info *cifs_sb) { @@ -202,7 +200,7 @@ CIFSCreateMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, if (!buf) return -ENOMEM; - rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); + rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); if (rc != 0) { kfree(buf); return rc; @@ -238,7 +236,7 @@ CIFSCreateMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, } static int -CIFSQueryMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, +query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **symlinkinfo, const struct nls_table *nls_codepage, int remap) { @@ -282,7 +280,7 @@ CIFSQueryMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } - rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo); + rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo); kfree(buf); if (rc != 0) return rc; @@ -291,7 +289,7 @@ CIFSQueryMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, } bool -CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr) +couldbe_mf_symlink(const struct cifs_fattr *fattr) { if (!(fattr->cf_mode & S_IFREG)) /* it's not a symlink */ @@ -341,16 +339,16 @@ out: } int -CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, - const unsigned char *path) +check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + const unsigned char *path) { int rc; u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; - if (!CIFSCouldBeMFSymlink(fattr)) + if (!couldbe_mf_symlink(fattr)) /* it's not a symlink */ return 0; @@ -370,7 +368,7 @@ CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, if (bytes_read == 0) /* not a symlink */ goto out; - rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); + rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -519,7 +517,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) * and fallback to UNIX Extensions Symlinks. */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path, + rc = query_mf_symlink(xid, tcon, full_path, &target_path, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -576,7 +574,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? 
BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, + rc = create_mf_symlink(xid, pTcon, full_path, symname, cifs_sb); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5940ecabbe6..b15862e0f68 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -749,7 +749,7 @@ static int cifs_filldir(char *find_entry, struct file *file, } if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && - CIFSCouldBeMFSymlink(&fattr)) + couldbe_mf_symlink(&fattr)) /* * trying to get the type and mode can be slow, * so just call those regular files for now, and mark -- cgit v1.2.3-70-g09d2 From 8205d1bb31af047c6893a4f9e86ed88cf5d6113d Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 25 Nov 2013 17:09:51 +0000 Subject: cifs: use protocol specific call for query_mf_symlink() We have an existing protocol specific call query_mf_symlink() created for check_mf_symlink which can also be used for query_mf_symlink(). Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/link.c | 61 +++++++++++++++++++--------------------------------------- 1 file changed, 20 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 500bc77a786..77dbd55795d 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -237,55 +237,36 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, static int query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, char **symlinkinfo, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char **symlinkinfo) { int rc; - int oplock = 0; - __u16 netfid = 0; - u8 *buf; - char *pbuf; - unsigned int bytes_read = 0; - int buf_type = CIFS_NO_BUFFER; + u8 *buf = NULL; unsigned int link_len = 0; - struct cifs_io_parms io_parms; - FILE_ALL_INFO file_info; - - rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, &file_info, - nls_codepage, remap); - if (rc != 0) - return rc; - - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { - CIFSSMBClose(xid, tcon, netfid); - /* it's not a symlink */ - return -EINVAL; - } + unsigned int bytes_read = 0; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; - pbuf = buf; - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); - CIFSSMBClose(xid, tcon, netfid); - if (rc != 0) { - kfree(buf); - return rc; + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); + else + rc = -ENOSYS; + + if (rc) + goto out; + + if (bytes_read == 0) { /* not a symlink */ + rc = -EINVAL; + goto out; } rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo); +out: kfree(buf); - if (rc != 0) - return rc; - - return 0; + return rc; } bool @@ -517,10 +498,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) * and fallback to UNIX Extensions Symlinks. 
*/ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = query_mf_symlink(xid, tcon, full_path, &target_path, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, + &target_path); if ((rc != 0) && cap_unix(tcon->ses)) rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, -- cgit v1.2.3-70-g09d2 From cbb0aba6ff3ff5b64f094f81f4d99d2323c0afcc Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 25 Nov 2013 17:09:52 +0000 Subject: cifs: Add create MFSymlinks to protocol ops struct Add a new protocol ops function create_mf_symlink and have create_mf_symlink() use it. This patchset moves the MFSymlink operations completely to the ops structure so that we only use the right protocol versions when querying or creating MFSymlinks. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 3 ++ fs/cifs/cifsproto.h | 4 +++ fs/cifs/link.c | 88 ++++++++++++++++++++++++++++------------------------- fs/cifs/smb1ops.c | 1 + 4 files changed, 54 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index fba4d1341f8..61228b7f6b6 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -373,6 +373,9 @@ struct smb_version_operations { int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); + int (*create_mf_symlink)(unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const unsigned char *, + char *, unsigned int *); /* if we can do cache read operations */ bool (*is_read_op)(__u32); /* set oplock level for the inode */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 78bb6d6ad06..e88c3b192ef 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -500,4 +500,8 @@ int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); +int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_written); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 77dbd55795d..3f259aad361 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -180,59 +180,31 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) static int create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, - const char *fromName, const char *toName, - struct cifs_sb_info *cifs_sb) + struct cifs_sb_info *cifs_sb, const char *fromName, + const char *toName) { int rc; - int oplock = 0; - int remap; - int create_options = CREATE_NOT_DIR; - __u16 netfid = 0; u8 *buf; unsigned int bytes_written = 0; - struct cifs_io_parms io_parms; - struct nls_table *nls_codepage; - - nls_codepage = cifs_sb->local_nls; - remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); - if (rc != 0) { - kfree(buf); - return rc; - } - - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - - rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, - create_options, &netfid, &oplock, NULL, - nls_codepage, remap); - if (rc != 0) { - kfree(buf); - return rc; - } - - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 
0; - io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + if (rc) + goto out; - rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0); - CIFSSMBClose(xid, tcon, netfid); - kfree(buf); - if (rc != 0) - return rc; + rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, + fromName, buf, &bytes_written); + if (rc) + goto out; if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE) - return -EIO; - - return 0; + rc = -EIO; +out: + kfree(buf); + return rc; } static int @@ -319,6 +291,39 @@ out: return rc; } +int +cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_written) +{ + int rc; + int oplock = 0; + __u16 netfid = 0; + struct cifs_io_parms io_parms; + int create_options = CREATE_NOT_DIR; + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_CREATE, GENERIC_WRITE, + create_options, &netfid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + return rc; + + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0); + CIFSSMBClose(xid, tcon, netfid); + return rc; +} + int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -553,8 +558,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = create_mf_symlink(xid, pTcon, full_path, symname, - cifs_sb); + rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 099c2760225..1470ec4fc39 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1010,6 +1010,7 @@ struct smb_version_operations smb1_operations = { .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, .query_mf_symlink = cifs_query_mf_symlink, + .create_mf_symlink = cifs_create_mf_symlink, .is_read_op = cifs_is_read_op, }; -- cgit v1.2.3-70-g09d2 From 0f8dce1cb7454f8795b73c5695a28e7a21a57ba0 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 25 Nov 2013 17:09:53 +0000 Subject: cifs: Re-order M-F Symlink code This patch makes cosmetic changes. We group similar functions together and separate out the protocol specific functions. 
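The resulting layout of fs/cifs/link.c, in outline only (reconstructed from the hunks below; no code changes):

	/* M-F Symlink Functions - Begin */
	parse_mf_symlink(), format_mf_symlink()    /* encode/decode the buffer */
	couldbe_mf_symlink()                       /* cheap check on the fattr */
	create_mf_symlink(), query_mf_symlink()    /* generic, dispatch via ops */
	check_mf_symlink()                         /* fix up the fattr for links */

	/* SMB 1.0 Protocol specific functions */
	cifs_query_mf_symlink(), cifs_create_mf_symlink()
	/* M-F Symlink Functions - End */
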
Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/link.c | 124 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 68 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 3f259aad361..5988b6060e8 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -29,6 +29,10 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" +/* + * M-F Symlink Functions - Begin + */ + #define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) #define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) #define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1)) @@ -178,6 +182,20 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) return 0; } +bool +couldbe_mf_symlink(const struct cifs_fattr *fattr) +{ + if (!(fattr->cf_mode & S_IFREG)) + /* it's not a symlink */ + return false; + + if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) + /* it's not a symlink */ + return false; + + return true; +} + static int create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *fromName, @@ -241,20 +259,60 @@ out: return rc; } -bool -couldbe_mf_symlink(const struct cifs_fattr *fattr) +int +check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + const unsigned char *path) { - if (!(fattr->cf_mode & S_IFREG)) + int rc; + u8 *buf = NULL; + unsigned int link_len = 0; + unsigned int bytes_read = 0; + + if (!couldbe_mf_symlink(fattr)) /* it's not a symlink */ - return false; + return 0; - if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); + else + rc = -ENOSYS; + + if (rc) + goto out; + + if (bytes_read == 0) /* not a symlink */ + goto out; + + rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); + if (rc == -EINVAL) { /* it's not a symlink */ - return false; + rc = 0; + goto out; + } - return true; + if (rc != 0) + goto out; + + /* it is a symlink */ + fattr->cf_eof = link_len; + fattr->cf_mode &= ~S_IFMT; + fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; + fattr->cf_dtype = DT_LNK; +out: + kfree(buf); + return rc; } +/* + * SMB 1.0 Protocol specific functions + */ + int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, @@ -324,55 +382,9 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, return rc; } -int -check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, - const unsigned char *path) -{ - int rc; - u8 *buf = NULL; - unsigned int link_len = 0; - unsigned int bytes_read = 0; - - if (!couldbe_mf_symlink(fattr)) - /* it's not a symlink */ - return 0; - - buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (tcon->ses->server->ops->query_mf_symlink) - rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, - cifs_sb, path, buf, &bytes_read); - else - rc = -ENOSYS; - - if (rc) - goto out; - - if (bytes_read == 0) /* not a symlink */ - goto out; - - rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); - if (rc == -EINVAL) { - /* it's not a symlink */ - rc = 0; - goto out; - } - - if (rc != 0) - goto out; - - /* it is a symlink */ - fattr->cf_eof = link_len; - 
fattr->cf_mode &= ~S_IFMT; - fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; - fattr->cf_dtype = DT_LNK; -out: - kfree(buf); - return rc; -} +/* + * M-F Symlink Functions - End + */ int cifs_hardlink(struct dentry *old_file, struct inode *inode, -- cgit v1.2.3-70-g09d2 From 0ecdb4f572f6ab2219a01e3af349863f6e8b45af Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Wed, 27 Nov 2013 13:27:12 +0000 Subject: cifs: move unix extension call to cifs_query_symlink() Unix extensions rigth now are only applicable to smb1 operations. Move the check and subsequent unix extension call to the smb1 specific call to query_symlink() ie. cifs_query_symlink(). Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/link.c | 5 +---- fs/cifs/smb1ops.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 5988b6060e8..38b9bf4f5a6 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -518,10 +518,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, &target_path); - if ((rc != 0) && cap_unix(tcon->ses)) - rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, - cifs_sb->local_nls); - else if (rc != 0 && server->ops->query_symlink) + if (rc != 0 && server->ops->query_symlink) rc = server->ops->query_symlink(xid, tcon, full_path, &target_path, cifs_sb); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 1470ec4fc39..988fddb7202 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -918,23 +918,31 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + /* Check for unix extensions */ + if (cap_unix(tcon->ses)) { + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, + cifs_sb->local_nls); + goto out; + } + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) - return rc; + goto out; rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path, cifs_sb->local_nls); - if (rc) { - CIFSSMBClose(xid, tcon, netfid); - return rc; - } + if (rc) + goto out_close; convert_delimiter(*target_path, '/'); +out_close: CIFSSMBClose(xid, tcon, netfid); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); +out: + if (!rc) + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); return rc; } -- cgit v1.2.3-70-g09d2 From 924e3fa48c627ad45d3be9412a93df34ab0fb482 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 2 Dec 2013 16:37:43 +0000 Subject: cifs: Add support for follow_link on dfs shares under posix extensions When using posix extensions, dfs shares in the dfs root show up as symlinks resulting in userland tools such as 'ls' calling readlink() on these shares. Since these are dfs shares, we end up returning -EREMOTE. $ ls -l /mnt ls: cannot read symbolic link /mnt/test: Object is remote total 0 lrwxrwxrwx. 1 root root 19 Nov 6 09:47 test With added follow_link() support for dfs shares, when using unix extensions, we call GET_DFS_REFERRAL to obtain the DFS referral and return the first node returned. The dfs share in the dfs root is now displayed in the following manner. $ ls -l /mnt total 0 lrwxrwxrwx. 
1 root root 19 Nov 6 09:47 test -> \vm140-31\test Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/smb1ops.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 988fddb7202..abd2cc9515c 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -907,6 +907,33 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, (__u8)type, wait, 0); } +static int +cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage) +{ +#ifdef CONFIG_CIFS_DFS_UPCALL + int rc; + unsigned int num_referrals = 0; + struct dfs_info3_param *referrals = NULL; + + rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage, + &num_referrals, &referrals, 0); + + if (!rc && num_referrals > 0) { + *symlinkinfo = kstrndup(referrals->node_name, + strlen(referrals->node_name), + GFP_KERNEL); + if (!*symlinkinfo) + rc = -ENOMEM; + free_dfs_info_array(referrals, num_referrals); + } + return rc; +#else /* No DFS support */ + return -EREMOTE; +#endif +} + static int cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, char **target_path, @@ -922,6 +949,11 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, if (cap_unix(tcon->ses)) { rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, cifs_sb->local_nls); + if (rc == -EREMOTE) + rc = cifs_unix_dfs_readlink(xid, tcon, full_path, + target_path, + cifs_sb->local_nls); + goto out; } -- cgit v1.2.3-70-g09d2 From 9bf4fa01f9aaf240bc6e40b3ed186039472c5298 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 16 Jan 2014 15:53:33 +0400 Subject: CIFS: Cleanup CIFSSMBOpen Remove indentation, fix comment style, rename camel case variables in preparation to make it work with cifs_open_parms structure as a parm. 
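For reviewers, the renames boil down to the following (taken from the hunk below; no functional change intended):

	fileName        -> path
	openDisposition -> disposition
	access_flags    -> desired_access
	pOplock         -> oplock
	pfile_info      -> buf
	nls_codepage    -> nls
	pSMB / pSMBr    -> req / rsp

The error path is also flattened: on failure the request buffer is released and the -EAGAIN retry taken immediately, so the decoding of the oplock level, fid and FILE_ALL_INFO data no longer sits in an else branch.
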
Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 8 +-- fs/cifs/cifssmb.c | 150 ++++++++++++++++++++++++++++------------------------ 2 files changed, 86 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e88c3b192ef..582ae61f45b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -363,10 +363,10 @@ extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, extern int CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid); extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, const int disposition, - const int access_flags, const int omode, - __u16 *netfid, int *pOplock, FILE_ALL_INFO *, - const struct nls_table *nls_codepage, int remap); + const char *path, const int disposition, + const int desired_access, const int create_options, + __u16 *netfid, int *oplock, FILE_ALL_INFO *buf, + const struct nls_table *nls, int remap); extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const int disposition, const int access_flags, const int omode, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d707edb6b85..8e1ebc2dc0d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1274,103 +1274,117 @@ OldOpenRetry: int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, const int openDisposition, - const int access_flags, const int create_options, __u16 *netfid, - int *pOplock, FILE_ALL_INFO *pfile_info, - const struct nls_table *nls_codepage, int remap) + const char *path, const int disposition, const int desired_access, + const int create_options, __u16 *netfid, int *oplock, + FILE_ALL_INFO *buf, const struct nls_table *nls, int remap) { int rc = -EACCES; - OPEN_REQ *pSMB = NULL; - OPEN_RSP *pSMBr = NULL; + OPEN_REQ *req = NULL; + OPEN_RSP *rsp = NULL; int bytes_returned; int name_len; __u16 count; openRetry: - rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req, + (void **)&rsp); if (rc) return rc; - pSMB->AndXCommand = 0xFF; /* none */ + /* no commands go after this */ + req->AndXCommand = 0xFF; - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - count = 1; /* account for one byte pad to word boundary */ - name_len = - cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1), - fileName, PATH_MAX, nls_codepage, remap); - name_len++; /* trailing null */ + if (req->hdr.Flags2 & SMBFLG2_UNICODE) { + /* account for one byte pad to word boundary */ + count = 1; + name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1), + path, PATH_MAX, nls, remap); + /* trailing null */ + name_len++; name_len *= 2; - pSMB->NameLength = cpu_to_le16(name_len); - } else { /* BB improve check for buffer overruns BB */ - count = 0; /* no pad */ - name_len = strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - pSMB->NameLength = cpu_to_le16(name_len); - strncpy(pSMB->fileName, fileName, name_len); + req->NameLength = cpu_to_le16(name_len); + } else { + /* BB improve check for buffer overruns BB */ + /* no pad */ + count = 0; + name_len = strnlen(path, PATH_MAX); + /* trailing null */ + name_len++; + req->NameLength = cpu_to_le16(name_len); + strncpy(req->fileName, path, name_len); } - if (*pOplock & REQ_OPLOCK) - pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK); - else if (*pOplock & REQ_BATCHOPLOCK) - pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); - 
pSMB->DesiredAccess = cpu_to_le32(access_flags); - pSMB->AllocationSize = 0; - /* set file as system file if special file such - as fifo and server expecting SFU style and - no Unix extensions */ + + if (*oplock & REQ_OPLOCK) + req->OpenFlags = cpu_to_le32(REQ_OPLOCK); + else if (*oplock & REQ_BATCHOPLOCK) + req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); + + req->DesiredAccess = cpu_to_le32(desired_access); + req->AllocationSize = 0; + + /* + * Set file as system file if special file such as fifo and server + * expecting SFU style and no Unix extensions. + */ if (create_options & CREATE_OPTION_SPECIAL) - pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM); + req->FileAttributes = cpu_to_le32(ATTR_SYSTEM); else - pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL); + req->FileAttributes = cpu_to_le32(ATTR_NORMAL); - /* XP does not handle ATTR_POSIX_SEMANTICS */ - /* but it helps speed up case sensitive checks for other - servers such as Samba */ + /* + * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case + * sensitive checks for other servers such as Samba. + */ if (tcon->ses->capabilities & CAP_UNIX) - pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); + req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); if (create_options & CREATE_OPTION_READONLY) - pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY); + req->FileAttributes |= cpu_to_le32(ATTR_READONLY); + + req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); + req->CreateDisposition = cpu_to_le32(disposition); + req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); - pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); - pSMB->CreateDisposition = cpu_to_le32(openDisposition); - pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); /* BB Expirement with various impersonation levels and verify */ - pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); - pSMB->SecurityFlags = - SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY; + req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); + req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY; count += name_len; - inc_rfc1001_len(pSMB, count); + inc_rfc1001_len(req, count); - pSMB->ByteCount = cpu_to_le16(count); - /* long_op set to 1 to allow for oplock break timeouts */ - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *)pSMBr, &bytes_returned, 0); + req->ByteCount = cpu_to_le16(count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req, + (struct smb_hdr *)rsp, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); if (rc) { cifs_dbg(FYI, "Error in Open = %d\n", rc); - } else { - *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ - *netfid = pSMBr->Fid; /* cifs fid stays in le */ - /* Let caller know file was created so we can set the mode. */ - /* Do we care about the CreateAction in any other cases? 
*/ - if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) - *pOplock |= CIFS_CREATE_ACTION; - if (pfile_info) { - memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime, - 36 /* CreationTime to Attributes */); - /* the file_info buf is endian converted by caller */ - pfile_info->AllocationSize = pSMBr->AllocationSize; - pfile_info->EndOfFile = pSMBr->EndOfFile; - pfile_info->NumberOfLinks = cpu_to_le32(1); - pfile_info->DeletePending = 0; - } + cifs_buf_release(req); + if (rc == -EAGAIN) + goto openRetry; + return rc; } - cifs_buf_release(pSMB); - if (rc == -EAGAIN) - goto openRetry; + /* 1 byte no need to le_to_cpu */ + *oplock = rsp->OplockLevel; + /* cifs fid stays in le */ + *netfid = rsp->Fid; + + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction) + *oplock |= CIFS_CREATE_ACTION; + + if (buf) { + /* copy from CreationTime to Attributes */ + memcpy((char *)buf, (char *)&rsp->CreationTime, 36); + /* the file_info buf is endian converted by caller */ + buf->AllocationSize = rsp->AllocationSize; + buf->EndOfFile = rsp->EndOfFile; + buf->NumberOfLinks = cpu_to_le32(1); + buf->DeletePending = 0; + } + + cifs_buf_release(req); return rc; } -- cgit v1.2.3-70-g09d2 From dd12067156b442801a7d636de354efe1d4dc467c Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 16 Jan 2014 15:53:34 +0400 Subject: CIFS: Cleanup cifs_mknod Rename camel case variable and fix comment style. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/dir.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index a514e0a65f6..0850325c3b4 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -565,12 +565,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; struct cifs_io_parms io_parms; char *full_path = NULL; struct inode *newinode = NULL; int oplock = 0; - u16 fileHandle; + u16 netfid; FILE_ALL_INFO *buf = NULL; unsigned int bytes_written; struct win_dev *pdev; @@ -583,7 +583,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, if (IS_ERR(tlink)) return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); + tcon = tlink_tcon(tlink); xid = get_xid(); @@ -593,7 +593,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } - if (pTcon->unix_ext) { + if (tcon->unix_ext) { struct cifs_unix_set_info_args args = { .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, @@ -608,7 +608,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, args.uid = INVALID_UID; /* no change */ args.gid = INVALID_GID; /* no change */ } - rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -640,42 +640,38 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_CREATE, GENERIC_WRITE, create_options, - &fileHandle, &oplock, buf, cifs_sb->local_nls, + &netfid, &oplock, buf, 
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) goto mknod_out; - /* BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely */ + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ pdev = (struct win_dev *)buf; - io_parms.netfid = fileHandle; + io_parms.netfid = netfid; io_parms.pid = current->tgid; - io_parms.tcon = pTcon; + io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, &io_parms, - &bytes_written, (char *)pdev, - NULL, 0); + pdev->major = cpu_to_le64(MAJOR(device_number)); + pdev->minor = cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, + NULL, 0); } else if (S_ISBLK(mode)) { memcpy(pdev->type, "IntxBLK", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, &io_parms, - &bytes_written, (char *)pdev, - NULL, 0); + pdev->major = cpu_to_le64(MAJOR(device_number)); + pdev->minor = cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, + NULL, 0); } /* else if (S_ISFIFO) */ - CIFSSMBClose(xid, pTcon, fileHandle); + CIFSSMBClose(xid, tcon, netfid); d_drop(direntry); /* FIXME: add code here to set EAs */ -- cgit v1.2.3-70-g09d2 From 0360d605a236355f9501d21175e405536e2acd48 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 16 Jan 2014 15:53:35 +0400 Subject: CIFS: Remove extra indentation in cifs_sfu_type Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/inode.c | 97 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6f7f57a3a46..5793b5a557e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -404,7 +404,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, } static int -cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, +cifs_sfu_type(struct cifs_fattr *fattr, const char *path, struct cifs_sb_info *cifs_sb, unsigned int xid) { int rc; @@ -416,6 +416,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, char buf[24]; unsigned int bytes_read; char *pbuf; + int buf_type = CIFS_NO_BUFFER; pbuf = buf; @@ -441,57 +442,59 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - int buf_type = CIFS_NO_BUFFER; - /* Read header */ - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = 24; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, - &buf_type); - if ((rc == 0) && (bytes_read >= 8)) { - if (memcmp("IntxBLK", pbuf, 8) == 0) { - cifs_dbg(FYI, "Block device\n"); - fattr->cf_mode |= S_IFBLK; - fattr->cf_dtype = DT_BLK; - if (bytes_read == 24) { - /* we have enough to decode dev num */ - __u64 mjr; /* major */ - __u64 mnr; /* minor */ - mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); - mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); - fattr->cf_rdev = MKDEV(mjr, mnr); - } - } else if (memcmp("IntxCHR", pbuf, 8) == 0) { - cifs_dbg(FYI, "Char device\n"); - fattr->cf_mode |= S_IFCHR; - 
fattr->cf_dtype = DT_CHR; - if (bytes_read == 24) { - /* we have enough to decode dev num */ - __u64 mjr; /* major */ - __u64 mnr; /* minor */ - mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); - mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); - fattr->cf_rdev = MKDEV(mjr, mnr); - } - } else if (memcmp("IntxLNK", pbuf, 7) == 0) { - cifs_dbg(FYI, "Symlink\n"); - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - } else { - fattr->cf_mode |= S_IFREG; /* file? */ - fattr->cf_dtype = DT_REG; - rc = -EOPNOTSUPP; + if (rc) { + cifs_put_tlink(tlink); + return rc; + } + + /* Read header */ + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = 24; + + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); + if ((rc == 0) && (bytes_read >= 8)) { + if (memcmp("IntxBLK", pbuf, 8) == 0) { + cifs_dbg(FYI, "Block device\n"); + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxCHR", pbuf, 8) == 0) { + cifs_dbg(FYI, "Char device\n"); + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); } + } else if (memcmp("IntxLNK", pbuf, 7) == 0) { + cifs_dbg(FYI, "Symlink\n"); + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; } else { - fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_mode |= S_IFREG; /* file? */ fattr->cf_dtype = DT_REG; - rc = -EOPNOTSUPP; /* or some unknown SFU type */ + rc = -EOPNOTSUPP; } - CIFSSMBClose(xid, tcon, netfid); + } else { + fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; /* or some unknown SFU type */ } + CIFSSMBClose(xid, tcon, netfid); cifs_put_tlink(tlink); return rc; } -- cgit v1.2.3-70-g09d2 From d81b8a40e2ece0a9ab57b1fe1798e291e75bf8fc Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 16 Jan 2014 15:53:36 +0400 Subject: CIFS: Cleanup cifs open codepath Rename CIFSSMBOpen to CIFS_open and make it take cifs_open_parms structure as a parm. 
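The typical caller conversion looks like this (a sketch assembled from the hunks below; the access mode, disposition and create options vary per call site):

	struct cifs_fid fid;
	struct cifs_open_parms oparms;

	oparms.tcon = tcon;
	oparms.cifs_sb = cifs_sb;
	oparms.desired_access = GENERIC_READ;
	oparms.create_options = CREATE_NOT_DIR;
	oparms.disposition = FILE_OPEN;
	oparms.path = path;
	oparms.fid = &fid;
	oparms.reconnect = false;

	rc = CIFS_open(xid, &oparms, &oplock, NULL);
	if (!rc) {
		/* ... issue requests against fid.netfid ... */
		CIFSSMBClose(xid, tcon, fid.netfid);
	}
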
Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 40 ++++++++++++++++++++--------- fs/cifs/cifsproto.h | 7 ++--- fs/cifs/cifssmb.c | 16 ++++++++---- fs/cifs/dir.c | 21 ++++++++++----- fs/cifs/file.c | 2 +- fs/cifs/inode.c | 73 ++++++++++++++++++++++++++++++++++------------------- fs/cifs/link.c | 44 ++++++++++++++++++++------------ fs/cifs/smb1ops.c | 71 +++++++++++++++++++++++++++++++-------------------- 8 files changed, 174 insertions(+), 100 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 51f5e0ee723..8f9b4f710d4 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -895,9 +895,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, int oplock = 0; unsigned int xid; int rc, create_options = 0; - __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; if (IS_ERR(tlink)) return ERR_CAST(tlink); @@ -908,12 +909,19 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, - create_options, &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = READ_CONTROL; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { - rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); - CIFSSMBClose(xid, tcon, fid); + rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid.netfid); } cifs_put_tlink(tlink); @@ -950,10 +958,11 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, int oplock = 0; unsigned int xid; int rc, access_flags, create_options = 0; - __u16 fid; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -969,18 +978,25 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, else access_flags = WRITE_DAC; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, - create_options, &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = access_flags; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) { cifs_dbg(VFS, "Unable to open file to set ACL\n"); goto out; } - rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); + rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag); cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc); - CIFSSMBClose(xid, tcon, fid); + CIFSSMBClose(xid, tcon, fid.netfid); out: free_xid(xid); cifs_put_tlink(tlink); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 582ae61f45b..79e6e9a93a8 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -362,11 +362,8 @@ extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage); extern int CIFSSMB_set_compression(const unsigned int xid, struct 
cifs_tcon *tcon, __u16 fid); -extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, - const char *path, const int disposition, - const int desired_access, const int create_options, - __u16 *netfid, int *oplock, FILE_ALL_INFO *buf, - const struct nls_table *nls, int remap); +extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, + int *oplock, FILE_ALL_INFO *buf); extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const int disposition, const int access_flags, const int omode, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 8e1ebc2dc0d..4d881c35eec 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1273,10 +1273,8 @@ OldOpenRetry: } int -CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, - const char *path, const int disposition, const int desired_access, - const int create_options, __u16 *netfid, int *oplock, - FILE_ALL_INFO *buf, const struct nls_table *nls, int remap) +CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock, + FILE_ALL_INFO *buf) { int rc = -EACCES; OPEN_REQ *req = NULL; @@ -1284,6 +1282,14 @@ CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; int name_len; __u16 count; + struct cifs_sb_info *cifs_sb = oparms->cifs_sb; + struct cifs_tcon *tcon = oparms->tcon; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + const struct nls_table *nls = cifs_sb->local_nls; + int create_options = oparms->create_options; + int desired_access = oparms->desired_access; + int disposition = oparms->disposition; + const char *path = oparms->path; openRetry: rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req, @@ -1367,7 +1373,7 @@ openRetry: /* 1 byte no need to le_to_cpu */ *oplock = rsp->OplockLevel; /* cifs fid stays in le */ - *netfid = rsp->Fid; + oparms->fid->netfid = rsp->Fid; /* Let caller know file was created so we can set the mode. */ /* Do we care about the CreateAction in any other cases? 
*/ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 0850325c3b4..d3a6796caa5 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -570,7 +570,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, char *full_path = NULL; struct inode *newinode = NULL; int oplock = 0; - u16 netfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; FILE_ALL_INFO *buf = NULL; unsigned int bytes_written; struct win_dev *pdev; @@ -640,10 +641,16 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_CREATE, - GENERIC_WRITE, create_options, - &netfid, &oplock, buf, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, buf); if (rc) goto mknod_out; @@ -653,7 +660,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, */ pdev = (struct win_dev *)buf; - io_parms.netfid = netfid; + io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; @@ -671,7 +678,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, NULL, 0); } /* else if (S_ISFIFO) */ - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); d_drop(direntry); /* FIXME: add code here to set EAs */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5a5a87240fe..853d6d1cc82 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -678,7 +678,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) /* * Can not refresh inode by passing in file_info buf to be returned by - * CIFSSMBOpen and then calling get_inode_info with returned buf since + * ops->open and then calling get_inode_info with returned buf since * file might have write behind data that needs to be flushed and server * version of file size can be stale. If we knew for sure that inode was * not dirty locally we could do this. 
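/*
 * For reference, the 24-byte SFU special-file record that cifs_sfu_type()
 * in the next hunk keeps decoding (layout reconstructed from the read and
 * write paths above; sketch only):
 *
 *	bytes  0.. 7	tag: "IntxBLK", "IntxCHR" or "IntxLNK"
 *	bytes  8..15	little-endian 64-bit major number (devices only)
 *	bytes 16..23	little-endian 64-bit minor number (devices only)
 */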
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5793b5a557e..9cb9679d735 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -409,9 +409,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, { int rc; int oplock = 0; - __u16 netfid; struct tcon_link *tlink; struct cifs_tcon *tcon; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct cifs_io_parms io_parms; char buf[24]; unsigned int bytes_read; @@ -437,18 +438,23 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) { cifs_put_tlink(tlink); return rc; } /* Read header */ - io_parms.netfid = netfid; + io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; @@ -494,7 +500,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, fattr->cf_dtype = DT_REG; rc = -EOPNOTSUPP; /* or some unknown SFU type */ } - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); cifs_put_tlink(tlink); return rc; } @@ -1035,7 +1041,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, { int oplock = 0; int rc; - __u16 netfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct inode *inode = dentry->d_inode; struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1058,10 +1065,16 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out; } - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) goto out; @@ -1082,7 +1095,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out_close; } info_buf->Attributes = cpu_to_le32(dosattr); - rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid, current->tgid); /* although we would like to mark the file hidden if that fails we will still try to rename it */ @@ -1093,7 +1106,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, } /* rename the file */ - rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, + rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { @@ -1103,7 +1117,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, /* try to set DELETE_ON_CLOSE */ if (!cifsInode->delete_pending) { - rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, + rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid, current->tgid); /* * some samba versions return -ENOENT when we try to set the 
@@ -1123,7 +1137,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, } out_close: - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); out: kfree(info_buf); cifs_put_tlink(tlink); @@ -1135,13 +1149,13 @@ out: * them anyway. */ undo_rename: - CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name, + CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); undo_setattr: if (dosattr != origattr) { info_buf->Attributes = cpu_to_le32(origattr); - if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid, current->tgid)) cifsInode->cifsAttrs = origattr; } @@ -1552,7 +1566,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - __u16 srcfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; int oplock, rc; tlink = cifs_sb_tlink(cifs_sb); @@ -1579,17 +1594,23 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; /* open the file to be renamed -- we need DELETE perms */ - rc = CIFSSMBOpen(xid, tcon, from_path, FILE_OPEN, DELETE, - CREATE_NOT_DIR, &srcfid, &oplock, NULL, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.desired_access = DELETE; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = from_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc == 0) { - rc = CIFSSMBRenameOpenFile(xid, tcon, srcfid, + rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, (const char *) to_dentry->d_name.name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, tcon, srcfid); + CIFSSMBClose(xid, tcon, fid.netfid); } do_rename_exit: cifs_put_tlink(tlink); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 38b9bf4f5a6..52f41f9f7de 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -320,16 +320,22 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, { int rc; int oplock = 0; - __u16 netfid = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct cifs_io_parms io_parms; int buf_type = CIFS_NO_BUFFER; FILE_ALL_INFO file_info; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, &file_info, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, &file_info); if (rc) return rc; @@ -337,7 +343,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, /* it's not a symlink */ goto out; - io_parms.netfid = netfid; + io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; @@ -345,7 +351,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type); out: - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); return rc; } @@ -356,29 +362,35 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, { int 
rc; int oplock = 0; - __u16 netfid = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct cifs_io_parms io_parms; int create_options = CREATE_NOT_DIR; if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, tcon, path, FILE_CREATE, GENERIC_WRITE, - create_options, &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) return rc; - io_parms.netfid = netfid; + io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0); - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index abd2cc9515c..9ac5bfc9cc5 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -560,17 +560,24 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { int tmprc; int oplock = 0; - __u16 netfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.create_options = 0; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; /* Need to check if this is a symbolic link or not */ - tmprc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - FILE_READ_ATTRIBUTES, 0, &netfid, &oplock, - NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + tmprc = CIFS_open(xid, &oparms, &oplock, NULL); if (tmprc == -EOPNOTSUPP) *symlink = true; else - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); } return rc; @@ -705,12 +712,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, oparms->cifs_sb->local_nls, oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - return CIFSSMBOpen(xid, oparms->tcon, oparms->path, - oparms->disposition, oparms->desired_access, - oparms->create_options, &oparms->fid->netfid, oplock, - buf, oparms->cifs_sb->local_nls, - oparms->cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + return CIFS_open(xid, oparms, oplock, buf); } static void @@ -761,8 +763,9 @@ smb_set_file_info(struct inode *inode, const char *full_path, { int oplock = 0; int rc; - __u16 netfid; __u32 netpid; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct cifsFileInfo *open_file; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -772,7 +775,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, true); if (open_file) { - netfid = open_file->fid.netfid; + fid.netfid = open_file->fid.netfid; netpid = open_file->pid; tcon = tlink_tcon(open_file->tlink); goto set_via_filehandle; @@ -796,12 +799,17 @@ smb_set_file_info(struct inode *inode, const char *full_path, goto out; } - cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 
- SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) { if (rc == -EIO) rc = -EINVAL; @@ -811,12 +819,12 @@ smb_set_file_info(struct inode *inode, const char *full_path, netpid = current->tgid; set_via_filehandle: - rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); + rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid); if (!rc) cinode->cifsAttrs = le32_to_cpu(buf->Attributes); if (open_file == NULL) - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); else cifsFileInfo_put(open_file); out: @@ -941,7 +949,8 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, { int rc; int oplock = 0; - __u16 netfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); @@ -957,21 +966,27 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto out; } - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid, - &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.create_options = OPEN_REPARSE_POINT; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) goto out; - rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path, + rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path, cifs_sb->local_nls); if (rc) goto out_close; convert_delimiter(*target_path, '/'); out_close: - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); out: if (!rc) cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); -- cgit v1.2.3-70-g09d2 From abad2fa5ba67725a3f9c376c8cfe76fbe94a3041 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Sun, 19 Jan 2014 22:45:36 -0500 Subject: nfs4: fix discover_server_trunking use after free If clp is new (cl_count = 1) and it matches another client in nfs4_discover_server_trunking, the nfs_put_client will free clp before ->cl_preserve_clid is set. 
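The race described here is the usual reference-count ordering bug: nothing may touch the client structure once the final reference could have been dropped. Below is a minimal user-space sketch of the corrected ordering (hypothetical names, not the NFS client code): update the flag while a reference is still held, drop the reference last, and return the surviving object.

#include <stdio.h>
#include <stdlib.h>

struct client {
	int refcount;
	int preserve_id;
};

static void put_client(struct client *c)
{
	/* dropping the last reference frees the object */
	if (--c->refcount == 0)
		free(c);
}

/*
 * Corrected ordering: finish using 'clp' before the put; afterwards it
 * must be treated as gone.  (In the real code the matched client 'old'
 * holds its own reference, so returning it is safe.)
 */
static struct client *init_client(struct client *clp, struct client *old)
{
	if (clp != old)
		clp->preserve_id = 1;	/* still safe: reference held */
	put_client(clp);		/* may free clp */
	return old;			/* never dereference clp past this point */
}

int main(void)
{
	struct client *clp = calloc(1, sizeof(*clp));
	struct client *old = calloc(1, sizeof(*old));

	if (!clp || !old)
		return 1;
	clp->refcount = 1;
	old->refcount = 1;

	printf("kept the %s client\n",
	       init_client(clp, old) == old ? "existing" : "new");
	put_client(old);
	return 0;
}

Doing the field update after the put, as the old ordering effectively did, touches freed memory whenever the matched client was the only holder of 'clp'.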
Cc: stable@vger.kernel.org # 3.7+ Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 06e770ace07..73d4ecda1e3 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -414,13 +414,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - nfs_put_client(clp); - if (clp != old) { - clp->cl_preserve_clid = true; - clp = old; - } - return clp; + if (clp != old) + clp->cl_preserve_clid = true; + nfs_put_client(clp); + return old; error: nfs_mark_client_ready(clp, error); -- cgit v1.2.3-70-g09d2 From 4fe59789adc93b2573b0417ac93136805521902e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 31 Oct 2013 16:44:14 +0800 Subject: ceph: handle cap export race in try_flush_caps() auth cap may change after releasing the i_ceph_lock Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 80dad0d491a..101209930d5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1735,13 +1735,12 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, - unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, unsigned *flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int unlock_session = session ? 0 : 1; int flushing = 0; + struct ceph_mds_session *session = NULL; retry: spin_lock(&ci->i_ceph_lock); @@ -1755,13 +1754,14 @@ retry: int want = __ceph_caps_wanted(ci); int delayed; - if (!session) { + if (!session || session != cap->session) { spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); session = cap->session; mutex_lock(&session->s_mutex); goto retry; } - BUG_ON(session != cap->session); if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; @@ -1780,7 +1780,7 @@ retry: out: spin_unlock(&ci->i_ceph_lock); out_unlocked: - if (session && unlock_session) + if (session) mutex_unlock(&session->s_mutex); return flushing; } @@ -1865,7 +1865,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; mutex_lock(&inode->i_mutex); - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); /* @@ -1900,7 +1900,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); if (wait) { - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); -- cgit v1.2.3-70-g09d2 From d1b87809fba3e07a261080837d1ae58d790b51a6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 13 Nov 2013 14:47:19 +0800 Subject: ceph: use ceph_seq_cmp() to compare migrate_seq Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 101209930d5..2c39d9fe18d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -628,7 +628,7 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - if (mseq > 
cap->mseq) + if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; -- cgit v1.2.3-70-g09d2 From 9563f88c1fa01341d125e396edc654a8dbcab2d2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 22 Nov 2013 13:50:45 +0800 Subject: ceph: fix cache revoke race handle following sequence of events: - non-auth MDS revokes Fc cap. queue invalidate work - auth MDS issues Fc cap through request reply. i_rdcache_gen gets increased. - invalidate work runs. it finds i_rdcache_revoking != i_rdcache_gen, so it does nothing. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 2 +- fs/ceph/inode.c | 8 +++++--- fs/ceph/super.h | 2 ++ 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2c39d9fe18d..d2154d63f67 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -816,7 +816,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (cap != ocap && __cap_is_valid(cap) && + if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a808bfb8d8d..3db97ba15a0 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1466,7 +1466,8 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { - /* nevermind! */ + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); goto out; @@ -1487,13 +1488,14 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", inode, orig_gen, ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; } spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); - +out: if (check) ceph_check_caps(ci, 0, NULL); -out: iput(inode); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7fa78a7c889..891cda8c72a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -528,6 +528,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) } extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); -- cgit v1.2.3-70-g09d2 From 979abfdd5c7ca4abe3f0157a6ea9bfef41114c89 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 22 Nov 2013 13:56:24 +0800 Subject: ceph: fix trim caps - don't trim auth cap if there are flusing caps - don't trim auth cap if any 'write' cap is wanted - allow trimming non-auth cap even if the inode is dirty Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4a13f6e7206..73c79431cbf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1214,7 +1214,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_mds_session *session = arg; struct ceph_inode_info *ci = ceph_inode(inode); - int used, oissued, mine; + int used, wanted, oissued, mine; if (session->s_trim_caps <= 0) return -1; @@ 
-1222,14 +1222,19 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) spin_lock(&ci->i_ceph_lock); mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), - ceph_cap_string(used)); - if (ci->i_dirty_caps) - goto out; /* dirty caps */ - if ((used & ~oissued) & mine) + ceph_cap_string(used), ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps | ci->i_flushing_caps) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + } + if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ session->s_trim_caps--; -- cgit v1.2.3-70-g09d2 From ca18bede048e95a749d13410ce1da4ad0ffa7938 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 22 Nov 2013 14:21:44 +0800 Subject: ceph: handle -ESTALE reply Send requests that operate on path to directory's auth MDS if mode == USE_AUTH_MDS. Always retry using the auth MDS if got -ESTALE reply from non-auth MDS. Also clean up the code that handles auth MDS change. Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 73c79431cbf..1fd655ac806 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -713,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc, struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); - } else if (req->r_dentry->d_inode) { + } else { /* dentry target */ inode = req->r_dentry->d_inode; - } else { - /* dir + name */ - inode = dir; - hash = ceph_dentry_hash(dir, req->r_dentry); - is_hash = true; + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } } } @@ -2161,26 +2162,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ if (result == -ESTALE) { dout("got ESTALE on request %llu", req->r_tid); - if (!req->r_inode) { - /* do nothing; not an authority problem */ - } else if (req->r_direct_mode != USE_AUTH_MDS) { + if (req->r_direct_mode != USE_AUTH_MDS) { dout("not using auth, setting for that now"); req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; } else { - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = NULL; - - if (req->r_session) - cap = ceph_get_cap_for_mds(ci, - req->r_session->s_mds); - - dout("already using auth"); - if ((!cap || cap != ci->i_auth_cap) || - (cap->mseq != req->r_sent_on_mseq)) { - dout("but cap changed, so resending"); + int mds = __choose_mds(mdsc, req); + if (mds >= 0 && mds != req->r_session->s_mds) { + dout("but auth changed, so resending"); __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; -- cgit v1.2.3-70-g09d2 From 9215aeea622fec7ca8123c6bd6f03a1753e2b0b3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 30 Nov 2013 12:47:41 +0800 Subject: ceph: check inode caps in ceph_d_revalidate Some inodes in readdir reply may have no caps. Getattr mds request for these inodes can return -ESTALE. The fix is consider dentry that links to inode with no caps as invalid. 
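That check can be illustrated with a small stand-alone sketch; the types and names below are hypothetical and only mirror the shape of the logic, not the ceph code itself. A dentry whose lease is still valid is reported as valid only while its inode holds at least one capability, so a dentry pointing at a cap-less inode falls back to a fresh lookup.

#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>

struct sketch_inode {
	pthread_mutex_t lock;
	unsigned int caps;		/* bitmask of currently held caps */
};

struct sketch_dentry {
	struct sketch_inode *inode;	/* NULL for a negative dentry */
	bool lease_valid;
};

/* take the per-inode lock and report whether any cap is held */
static bool inode_has_any_caps(struct sketch_inode *inode)
{
	bool ret;

	pthread_mutex_lock(&inode->lock);
	ret = inode->caps != 0;
	pthread_mutex_unlock(&inode->lock);
	return ret;
}

/* a positive dentry whose inode has lost all caps is treated as invalid */
static bool revalidate(struct sketch_dentry *dentry)
{
	if (!dentry->lease_valid)
		return false;
	if (dentry->inode)
		return inode_has_any_caps(dentry->inode);
	return true;
}

int main(void)
{
	struct sketch_inode inode = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct sketch_dentry dentry = { &inode, true };

	printf("no caps:   %s\n", revalidate(&dentry) ? "valid" : "invalid");
	inode.caps = 0x1;
	printf("with caps: %s\n", revalidate(&dentry) ? "valid" : "invalid");
	return 0;
}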
Invalid dentry causes a lookup request to send to the mds, the MDS will send caps back. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 12 ++++++++++++ fs/ceph/dir.c | 11 ++++++++--- fs/ceph/super.h | 1 + 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d2154d63f67..d65ff334901 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -891,6 +891,18 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; } +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; +} + /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index b629e9d59a3..619616d585b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1041,14 +1041,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) valid = 1; } else if (dentry_lease_is_valid(dentry) || dir_lease_is_valid(dir, dentry)) { - valid = 1; + if (dentry->d_inode) + valid = ceph_is_any_caps(dentry->d_inode); + else + valid = 1; } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); - if (valid) + if (valid) { ceph_dentry_lru_touch(dentry); - else + } else { + ceph_dir_clear_complete(dir); d_drop(dentry); + } iput(dir); return valid; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 891cda8c72a..a6ba32fb0d4 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -782,6 +782,7 @@ extern int ceph_add_cap(struct inode *inode, extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, u64 cap_id, u32 migrate_seq, u32 issue_seq); -- cgit v1.2.3-70-g09d2 From 186e4f7a4b1883f3f46aa15366c0bcebc28fdda7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 22 Nov 2013 14:48:37 +0800 Subject: ceph: handle session flush message Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 19 +++++++++++++++++++ fs/ceph/strings.c | 2 ++ include/linux/ceph/ceph_fs.h | 2 ++ 3 files changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1fd655ac806..7c00dd530bd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1137,6 +1137,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, return 0; } +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", + session->s_mds, session_state_name(session->s_state), seq); + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). 
* @@ -2396,6 +2411,10 @@ static void handle_session(struct ceph_mds_session *session, trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; + case CEPH_SESSION_FLUSHMSG: + send_flushmsg_ack(mdsc, session, seq); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89fa4a940a0..4440f447fd3 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RENEWCAPS: return "renewcaps"; case CEPH_SESSION_STALE: return "stale"; case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; } return "???"; } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 2ad7b860f06..26bb587deb7 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -282,6 +282,8 @@ enum { CEPH_SESSION_RENEWCAPS, CEPH_SESSION_STALE, CEPH_SESSION_RECALL_STATE, + CEPH_SESSION_FLUSHMSG, + CEPH_SESSION_FLUSHMSG_ACK, }; extern const char *ceph_session_op_name(int op); -- cgit v1.2.3-70-g09d2 From 4ee6a914edbbd2543884f0ad7d58ea471136be32 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 24 Nov 2013 14:43:46 +0800 Subject: ceph: remove exported caps when handling cap import message Version 3 cap import message includes the ID of the exported caps. It allow us to remove the exported caps if we still haven't received the corresponding cap export message. We remove the exported caps because they are stale, keeping them can compromise consistence. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 79 +++++++++++++++++++++++++++++--------------- include/linux/ceph/ceph_fs.h | 11 +++++- 2 files changed, 62 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d65ff334901..98f3ca4a5dd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -611,6 +611,7 @@ retry: if (ci->i_auth_cap == NULL || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) ci->i_auth_cap = cap; + ci->i_cap_exporting_issued = 0; } else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; spin_lock(&mdsc->cap_dirty_lock); @@ -2823,10 +2824,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, void *snaptrace, int snaptrace_len) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; int mds = session->s_mds; unsigned issued = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); @@ -2834,28 +2837,44 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p back to cap_dirty\n", inode); - 
list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); + + spin_lock(&ci->i_ceph_lock); + cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (cap && cap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + cap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (cap->seq != le32_to_cpu(ph->seq) || + cap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, cap->seq, + cap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); } - spin_unlock(&mdsc->cap_dirty_lock); - } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + ci->i_cap_exporting_issued = cap->issued; + __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } + /* make sure we re-request max_size, if necessary */ + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&ci->i_ceph_lock); + down_write(&mdsc->snap_rwsem); ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, false); @@ -2866,11 +2885,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); - /* make sure we re-request max_size, if necessary */ - spin_lock(&ci->i_ceph_lock); - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; - spin_unlock(&ci->i_ceph_lock); } /* @@ -2888,6 +2902,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; int op; u32 seq, mseq; @@ -2898,12 +2913,14 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *snaptrace; size_t snaptrace_len; void *flock; + void *end; u32 flock_len; int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); /* decode */ + end = msg->front.iov_base + msg->front.iov_len; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -2921,17 +2938,25 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace_len = le32_to_cpu(h->snap_trace_len); if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p, *end; - - p = snaptrace + snaptrace_len; - end = msg->front.iov_base + msg->front.iov_len; + void *p = snaptrace + snaptrace_len; ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; flock = p; } else { flock = NULL; flock_len = 0; } + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + void *p = flock + flock_len; + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + } + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, @@ -2968,7 +2993,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, + handle_cap_import(mdsc, inode, h, peer, session, snaptrace, snaptrace_len); } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 26bb587deb7..0a37b989b52 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -459,7 +459,8 @@ struct ceph_mds_reply_cap { __u8 flags; /* CEPH_CAP_FLAG_* */ } __attribute__ ((packed)); -#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_AUTH (1 << 0) /* 
cap is issued by auth mds */ +#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */ /* inode record, for bundling with mds reply */ struct ceph_mds_reply_inode { @@ -660,6 +661,14 @@ struct ceph_mds_caps { __le32 time_warp_seq; } __attribute__ ((packed)); +struct ceph_mds_cap_peer { + __le64 cap_id; + __le32 seq; + __le32 mseq; + __le32 mds; + __u8 flags; +} __attribute__ ((packed)); + /* cap release msg head */ struct ceph_mds_cap_release { __le32 num; /* number of cap_items that follow */ -- cgit v1.2.3-70-g09d2 From 5d72d13c425bb41f7752962f168fb402b86b7ac0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 24 Nov 2013 14:33:01 +0800 Subject: ceph: add open export target session helper Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 51 ++++++++++++++++++++++++++++++++++++--------------- fs/ceph/mds_client.h | 2 ++ 2 files changed, 38 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7c00dd530bd..f4f050a69a4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -847,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc, * * called under mdsc->mutex */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + dout("open_export_target_session to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; - int target; if (mds >= mdsc->mdsmap->m_max_mds) return; + mi = &mdsc->mdsmap->m_info[mds]; dout("open_export_target_sessions for mds%d (%d targets)\n", session->s_mds, mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { - target = mi->export_targets[i]; - ts = __ceph_lookup_mds_session(mdsc, target); - if (!ts) { - ts = register_session(mdsc, target); - if (IS_ERR(ts)) - return; - } - if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); - else - dout(" mds%d target mds%d %p is %s\n", session->s_mds, - i, ts, session_state_name(ts->s_state)); - ceph_put_mds_session(ts); + ts = __open_export_target_session(mdsc, mi->export_targets[i]); + if (!IS_ERR(ts)) + ceph_put_mds_session(ts); } } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4c053d099ae..68288917c73 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -383,6 +383,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg); +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); -- cgit v1.2.3-70-g09d2 From 
11df2dfb610d68e8050c2183c344b1002351a99d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 24 Nov 2013 14:44:38 +0800 Subject: ceph: add imported caps when handling cap export message Version 3 cap export message includes information about the imported caps. It allows us to add the imported caps if the corresponding cap import message still hasn't been received. This allow us to handle situation that the importer MDS crashes and the cap import message is missing. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 220 ++++++++++++++++++++++++++++++++++++-------------------- fs/ceph/inode.c | 4 +- fs/ceph/super.h | 4 +- 3 files changed, 146 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 98f3ca4a5dd..17543383545 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -555,21 +555,34 @@ retry: cap->ci = ci; __insert_cap_node(ci, cap); - /* clear out old exporting info? (i.e. on cap import) */ - if (ci->i_cap_exporting_mds == mds) { - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - } - /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); - } else if (new_cap) - ceph_put_cap(mdsc, new_cap); + } else { + if (new_cap) + ceph_put_cap(mdsc, new_cap); + + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } if (!ci->i_snap_realm) { /* @@ -612,15 +625,8 @@ retry: ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) ci->i_auth_cap = cap; ci->i_cap_exporting_issued = 0; - } else if (ci->i_auth_cap == cap) { - ci->i_auth_cap = NULL; - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); - } - spin_unlock(&mdsc->cap_dirty_lock); + } else { + WARN_ON(ci->i_auth_cap == cap); } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", @@ -889,7 +895,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; } int ceph_is_any_caps(struct inode *inode) @@ -1396,13 +1402,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ci->i_snap_realm->cached_context); dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - if (ci->i_auth_cap) - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); - else - list_add(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); @@ -2421,6 +2424,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, 
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we @@ -2447,6 +2466,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, issued |= implemented | __ceph_caps_dirty(ci); cap->cap_gen = session->s_cap_gen; + cap->seq = seq; __check_cap_issue(ci, cap, newcaps); @@ -2497,6 +2517,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); + + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* max size increase? */ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); @@ -2525,11 +2549,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, check_caps = 1; } - cap->seq = seq; - - /* file layout may have changed */ - ci->i_layout = grant->layout; - /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; @@ -2755,65 +2774,114 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session, - int *open_target_sessions) + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap; struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; + u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); - struct ceph_cap *cap = NULL, *t; - struct rb_node *p; - int remember = 1; + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) + goto out_unlock; - /* make sure we haven't seen a higher mseq */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - t = rb_entry(p, struct ceph_cap, ci_node); - if (ceph_seq_cmp(t->mseq, mseq) > 0) { - dout(" higher mseq on cap from mds%d\n", - t->session->s_mds); - remember = 0; - } - if (t->session->s_mds == mds) - cap = t; + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; } - if (cap) { - if (remember) { - /* make note */ - ci->i_cap_exporting_mds = mds; - ci->i_cap_exporting_mseq = mseq; - ci->i_cap_exporting_issued = cap->issued; - - /* - * make sure we have open sessions with all possible - * export targets, so that we get the 
matching IMPORT - */ - *open_target_sessions = 1; + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ - /* - * we can't flush dirty caps that we've seen the - * EXPORT but no IMPORT for - */ - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", - inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + issued = cap->issued; + WARN_ON(issued != cap->implemented); + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id != t_cap_id || + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->mseq = t_mseq; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) + ci->i_auth_cap = tcap; + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); } - spin_unlock(&mdsc->cap_dirty_lock); } __ceph_remove_cap(cap, false); + goto out_unlock; + } + + if (tsession) { + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + spin_unlock(&ci->i_ceph_lock); + /* add placeholder for the export tagert */ + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, NULL); + goto retry; } - /* else, we already released it */ spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); + } + ceph_add_cap_releases(mdsc, tsession); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; + } + goto retry; + +out_unlock: + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } } /* @@ -2915,7 +2983,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *flock; void *end; u32 flock_len; - int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); @@ -2954,6 +3021,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (p + sizeof(*peer) > end) goto bad; peer = p; + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; } } @@ -2989,8 +3059,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session, &open_target_sessions); - goto done; + handle_cap_export(inode, h, peer, session); + goto done_unlocked; case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, peer, session, @@ -3045,8 +3115,6 @@ done: done_unlocked: if (inode) iput(inode); - if (open_target_sessions) - ceph_mdsc_open_export_target_sessions(mdsc, session); return; bad: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 3db97ba15a0..6fc10a7d7c5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -336,12 +336,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); - ci->i_cap_exporting_mds = 0; - 
ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_issued = 0; INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; + ci->i_cap_exporting_issued = 0; for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a6ba32fb0d4..c299f7d19bf 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -287,14 +287,12 @@ struct ceph_inode_info { unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ - int i_cap_exporting_mds; /* to handle cap migration between */ - unsigned i_cap_exporting_mseq; /* mds's. */ - unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + unsigned i_cap_exporting_issued; int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ -- cgit v1.2.3-70-g09d2 From 471252cd8b34b0609973740b25dcd1ff01dc1889 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 21 Jan 2014 15:21:33 -0500 Subject: pnfs: fix BUG in filelayout_recover_commit_reqs cond_resched_lock(cinfo->lock) is called everywhere else while holding the cinfo->lock spinlock. Not holding this lock while calling transfer_commit_list in filelayout_recover_commit_reqs causes the BUG below. It's true that we can't hold this lock while calling pnfs_put_lseg, because that might try to lock the inode lock - which might be the same lock as cinfo->lock. To reproduce, mount a 2 DS pynfs server and run an O_DIRECT command that crosses a stripe boundary and is not page aligned, such as: dd if=/dev/zero of=/mnt/f bs=17000 count=1 oflag=direct BUG: sleeping function called from invalid context at linux/fs/nfs/nfs4filelayout.c:1161 in_atomic(): 0, irqs_disabled(): 0, pid: 27, name: kworker/0:1 2 locks held by kworker/0:1/27: #0: (events){.+.+.+}, at: [] process_one_work+0x175/0x3a5 #1: ((&dreq->work)){+.+...}, at: [] process_one_work+0x175/0x3a5 CPU: 0 PID: 27 Comm: kworker/0:1 Not tainted 3.13.0-rc3-branch-dros_testing+ #21 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013 Workqueue: events nfs_direct_write_schedule_work [nfs] 0000000000000000 ffff88007a39bbb8 ffffffff81491256 ffff88007b87a130 ffff88007a39bbd8 ffffffff8105f103 ffff880079614000 ffff880079617d40 ffff88007a39bc20 ffffffffa011603e ffff880078988b98 0000000000000000 Call Trace: [] dump_stack+0x4d/0x66 [] __might_sleep+0x100/0x105 [] transfer_commit_list+0x94/0xf1 [nfs_layout_nfsv41_files] [] filelayout_recover_commit_reqs+0x3b/0x68 [nfs_layout_nfsv41_files] [] nfs_direct_write_reschedule+0x9f/0x1d6 [nfs] [] ? mark_lock+0x1df/0x224 [] ? trace_hardirqs_off_caller+0x37/0xa4 [] ? trace_hardirqs_off+0xd/0xf [] nfs_direct_write_schedule_work+0x9d/0xb7 [nfs] [] ? process_one_work+0x175/0x3a5 [] process_one_work+0x1f6/0x3a5 [] ? process_one_work+0x175/0x3a5 [] worker_thread+0x149/0x1f5 [] ? rescuer_thread+0x28d/0x28d [] kthread+0xd2/0xda [] ? __kthread_parkme+0x61/0x61 [] ret_from_fork+0x7c/0xb0 [] ? 
__kthread_parkme+0x61/0x61 Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 0a93e798df3..03fd8be8c0c 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -1216,17 +1216,17 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, struct pnfs_commit_bucket *b; int i; - /* NOTE cinfo->lock is NOT held, relying on fact that this is - * only called on single thread per dreq. - * Can't take the lock because need to do pnfs_put_lseg - */ + spin_lock(cinfo->lock); for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { + spin_unlock(cinfo->lock); pnfs_put_lseg(b->wlseg); b->wlseg = NULL; + spin_lock(cinfo->lock); } } cinfo->ds->nwritten = 0; + spin_unlock(cinfo->lock); } static unsigned int -- cgit v1.2.3-70-g09d2 From e9fe69045bd648d75d8d8099b8658a4ee005a8e5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:13 -0800 Subject: inotify: provide function for name length rounding Rounding of name length when passing it to userspace was done in several places. Provide a function to do it and use it in all places. Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify_user.c | 41 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 60f954a891a..1bb6dc8eaf1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -124,6 +124,13 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) return ret; } +static int round_event_name_len(struct fsnotify_event *event) +{ + if (!event->name_len) + return 0; + return roundup(event->name_len + 1, sizeof(struct inotify_event)); +} + /* * Get an inotify_kernel_event if one exists and is small * enough to fit in "count". Return an error pointer if @@ -144,9 +151,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - if (event->name_len) - event_size += roundup(event->name_len + 1, event_size); - + event_size += round_event_name_len(event); if (event_size > count) return ERR_PTR(-EINVAL); @@ -171,7 +176,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event_private_data *fsn_priv; struct inotify_event_private_data *priv; size_t event_size = sizeof(struct inotify_event); - size_t name_len = 0; + size_t name_len; + size_t pad_name_len; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -189,14 +195,13 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, inotify_free_event_priv(fsn_priv); } + name_len = event->name_len; /* - * round up event->name_len so it is a multiple of event_size + * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. 
*/ - if (event->name_len) - name_len = roundup(event->name_len + 1, event_size); - inotify_event.len = name_len; - + pad_name_len = round_event_name_len(event); + inotify_event.len = pad_name_len; inotify_event.mask = inotify_mask_to_arg(event->mask); inotify_event.cookie = event->sync_cookie; @@ -209,20 +214,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, /* * fsnotify only stores the pathname, so here we have to send the pathname * and then pad that pathname out to a multiple of sizeof(inotify_event) - * with zeros. I get my zeros from the nul_inotify_event. + * with zeros. */ - if (name_len) { - unsigned int len_to_zero = name_len - event->name_len; + if (pad_name_len) { /* copy the path name */ - if (copy_to_user(buf, event->file_name, event->name_len)) + if (copy_to_user(buf, event->file_name, name_len)) return -EFAULT; - buf += event->name_len; + buf += name_len; /* fill userspace with 0's */ - if (clear_user(buf, len_to_zero)) + if (clear_user(buf, pad_name_len - name_len)) return -EFAULT; - buf += len_to_zero; - event_size += name_len; + event_size += pad_name_len; } return event_size; @@ -314,9 +317,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, list_for_each_entry(holder, &group->notification_list, event_list) { event = holder->event; send_len += sizeof(struct inotify_event); - if (event->name_len) - send_len += roundup(event->name_len + 1, - sizeof(struct inotify_event)); + send_len += round_event_name_len(event); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); -- cgit v1.2.3-70-g09d2 From 7053aee26a3548ebaba046ae2e52396ccf56ac6c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:14 -0800 Subject: fsnotify: do not share events between notification groups Currently fsnotify framework creates one event structure for each notification event and links this event into all interested notification groups. This is done so that we save memory when several notification groups are interested in the event. However the need for event structure shared between inotify & fanotify bloats the event structure so the result is often higher memory consumption. Another problem is that fsnotify framework keeps path references with outstanding events so that fanotify can return open file descriptors with its events. This has the undesirable effect that filesystem cannot be unmounted while there are outstanding events - a regression for inotify compared to a situation before it was converted to fsnotify framework. For fanotify this problem is hard to avoid and users of fanotify should kind of expect this behavior when they ask for file descriptors from notified files. This patch changes fsnotify and its users to create separate event structure for each group. This allows for much simpler code (~400 lines removed by this patch) and also smaller event structures. For example on 64-bit system original struct fsnotify_event consumes 120 bytes, plus additional space for file name, additional 24 bytes for second and each subsequent group linking the event, and additional 32 bytes for each inotify group for private data. After the conversion inotify event consumes 48 bytes plus space for file name which is considerably less memory unless file names are long and there are several groups interested in the events (both of which are uncommon). Fanotify event fits in 56 bytes after the conversion (fanotify doesn't care about file names so its events don't have to have it allocated). 
A win unless there are four or more fanotify groups interested in the event. The conversion also solves the problem with unmount when only inotify is used as we don't have to grab path references for inotify events. [hughd@google.com: fanotify: fix corruption preventing startup] Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 11 +- fs/notify/fanotify/fanotify.c | 211 +++++++++++----------- fs/notify/fanotify/fanotify.h | 23 +++ fs/notify/fanotify/fanotify_user.c | 41 +++-- fs/notify/fsnotify.c | 37 ++-- fs/notify/group.c | 1 + fs/notify/inotify/inotify.h | 21 ++- fs/notify/inotify/inotify_fsnotify.c | 125 +++++-------- fs/notify/inotify/inotify_user.c | 86 +++------ fs/notify/notification.c | 334 +++-------------------------------- include/linux/fsnotify_backend.h | 114 +++--------- kernel/audit_tree.c | 8 +- kernel/audit_watch.c | 14 +- 13 files changed, 318 insertions(+), 708 deletions(-) create mode 100644 fs/notify/fanotify/fanotify.h (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 1fedd5f7ccc..bfca53dbbf3 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -82,21 +82,20 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) * events. */ static int dnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct dnotify_mark *dn_mark; - struct inode *to_tell; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; - __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; + __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; BUG_ON(vfsmount_mark); - to_tell = event->to_tell; - dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); spin_lock(&inode_mark->lock); @@ -155,7 +154,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = { .should_send_event = dnotify_should_send_event, .free_group_priv = NULL, .freeing_mark = NULL, - .free_event_priv = NULL, + .free_event = NULL, }; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0c2f9122b26..c26268d7bd9 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -9,31 +9,27 @@ #include #include -static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) +#include "fanotify.h" + +static bool should_merge(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - pr_debug("%s: old=%p new=%p\n", __func__, old, new); + struct fanotify_event_info *old, *new; - if (old->to_tell == new->to_tell && - old->data_type == new->data_type && - old->tgid == new->tgid) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_PATH): #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - /* dont merge two permission events */ - if ((old->mask & FAN_ALL_PERM_EVENTS) && - (new->mask & FAN_ALL_PERM_EVENTS)) - return false; + /* dont merge two permission events */ + if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) && + (new_fsn->mask & FAN_ALL_PERM_EVENTS)) + return false; #endif - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - return true; - default: - BUG(); - }; - } + pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); + old = 
FANOTIFY_E(old_fsn); + new = FANOTIFY_E(new_fsn); + + if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry) + return true; return false; } @@ -41,59 +37,28 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) static struct fsnotify_event *fanotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *test_holder; - struct fsnotify_event *test_event = NULL; - struct fsnotify_event *new_event; + struct fsnotify_event *test_event; + bool do_merge = false; pr_debug("%s: list=%p event=%p\n", __func__, list, event); - - list_for_each_entry_reverse(test_holder, list, event_list) { - if (should_merge(test_holder->event, event)) { - test_event = test_holder->event; + list_for_each_entry_reverse(test_event, list, list) { + if (should_merge(test_event, event)) { + do_merge = true; break; } } - if (!test_event) + if (!do_merge) return NULL; - fsnotify_get_event(test_event); - - /* if they are exactly the same we are done */ - if (test_event->mask == event->mask) - return test_event; - - /* - * if the refcnt == 2 this is the only queue - * for this event and so we can update the mask - * in place. - */ - if (atomic_read(&test_event->refcnt) == 2) { - test_event->mask |= event->mask; - return test_event; - } - - new_event = fsnotify_clone_event(test_event); - - /* done with test_event */ - fsnotify_put_event(test_event); - - /* couldn't allocate memory, merge was not possible */ - if (unlikely(!new_event)) - return ERR_PTR(-ENOMEM); - - /* build new event and replace it on the list */ - new_event->mask = (test_event->mask | event->mask); - fsnotify_replace_event(test_holder, new_event); - - /* we hold a reference on new_event from clone_event */ - return new_event; + test_event->mask |= event->mask; + return test_event; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response_from_access(struct fsnotify_group *group, - struct fsnotify_event *event) + struct fanotify_event_info *event) { int ret; @@ -106,7 +71,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, return 0; /* userspace responded, convert to something usable */ - spin_lock(&event->lock); switch (event->response) { case FAN_ALLOW: ret = 0; @@ -116,7 +80,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, ret = -EPERM; } event->response = 0; - spin_unlock(&event->lock); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -125,48 +88,8 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static int fanotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *fanotify_mark, - struct fsnotify_event *event) -{ - int ret = 0; - struct fsnotify_event *notify_event = NULL; - - BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); - BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); - BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); - BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); - BUILD_BUG_ON(FAN_OPEN != FS_OPEN); - BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); - BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); - BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); - BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); - BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); - if 
(IS_ERR(notify_event)) - return PTR_ERR(notify_event); - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - /* if we merged we need to wait on the new event */ - if (notify_event) - event = notify_event; - ret = fanotify_get_response_from_access(group, event); - } -#endif - - if (notify_event) - fsnotify_put_event(notify_event); - - return ret; -} - static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *to_tell, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, __u32 event_mask, void *data, int data_type) @@ -174,8 +97,8 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, + pr_debug("%s: group=%p inode=%p inode_mark=%p vfsmnt_mark=%p " + "mask=%x data=%p data_type=%d\n", __func__, group, inode, inode_mark, vfsmnt_mark, event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ @@ -217,6 +140,70 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, return false; } +static int fanotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) +{ + int ret = 0; + struct fanotify_event_info *event; + struct fsnotify_event *fsn_event; + struct fsnotify_event *notify_fsn_event; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); + + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (unlikely(!event)) + return -ENOMEM; + + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (data_type == FSNOTIFY_EVENT_PATH) { + struct path *path = data; + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + event->response = 0; +#endif + + notify_fsn_event = fsnotify_add_notify_event(group, fsn_event, + fanotify_merge); + if (notify_fsn_event) { + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + if (IS_ERR(notify_fsn_event)) + return PTR_ERR(notify_fsn_event); + /* We need to ask about a different events after a merge... 
*/ + event = FANOTIFY_E(notify_fsn_event); + fsn_event = notify_fsn_event; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (fsn_event->mask & FAN_ALL_PERM_EVENTS) + ret = fanotify_get_response_from_access(group, event); +#endif + return ret; +} + static void fanotify_free_group_priv(struct fsnotify_group *group) { struct user_struct *user; @@ -226,10 +213,20 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) free_uid(user); } +static void fanotify_free_event(struct fsnotify_event *fsn_event) +{ + struct fanotify_event_info *event; + + event = FANOTIFY_E(fsn_event); + path_put(&event->path); + put_pid(event->tgid); + kmem_cache_free(fanotify_event_cachep, event); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, - .free_event_priv = NULL, + .free_event = fanotify_free_event, .freeing_mark = NULL, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h new file mode 100644 index 00000000000..0e90174a116 --- /dev/null +++ b/fs/notify/fanotify/fanotify.h @@ -0,0 +1,23 @@ +#include +#include +#include + +extern struct kmem_cache *fanotify_event_cachep; + +struct fanotify_event_info { + struct fsnotify_event fse; + /* + * We hold ref to this path so it may be dereferenced at any point + * during this object's lifetime + */ + struct path path; + struct pid *tgid; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + u32 response; /* userspace answer to question */ +#endif +}; + +static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_event_info, fse); +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e44cb6427df..57d7c083cb4 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -19,6 +19,7 @@ #include "../../mount.h" #include "../fdinfo.h" +#include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 @@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; static struct kmem_cache *fanotify_response_event_cache __read_mostly; +struct kmem_cache *fanotify_event_cachep __read_mostly; struct fanotify_response_event { struct list_head list; __s32 fd; - struct fsnotify_event *event; + struct fanotify_event_info *event; }; /* @@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, } static int create_fd(struct fsnotify_group *group, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_info *event, + struct file **file) { int client_fd; struct file *new_file; @@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group, if (client_fd < 0) return client_fd; - if (event->data_type != FSNOTIFY_EVENT_PATH) { - WARN_ON(1); - put_unused_fd(client_fd); - return -EINVAL; - } - /* * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. 
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group, } static int fill_event_metadata(struct fsnotify_group *group, - struct fanotify_event_metadata *metadata, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_metadata *metadata, + struct fsnotify_event *fsn_event, + struct file **file) { int ret = 0; + struct fanotify_event_info *event; pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, - group, metadata, event); + group, metadata, fsn_event); *file = NULL; + event = container_of(fsn_event, struct fanotify_event_info, fse); metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - if (unlikely(event->mask & FAN_Q_OVERFLOW)) + if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { metadata->fd = create_fd(group, event, file); @@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (!re) return -ENOMEM; - re->event = event; + re->event = FANOTIFY_E(event); re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); @@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (atomic_read(&group->fanotify_data.bypass_perm)) { mutex_unlock(&group->fanotify_data.access_mutex); kmem_cache_free(fanotify_response_event_cache, re); - event->response = FAN_ALLOW; + FANOTIFY_E(event)->response = FAN_ALLOW; return 0; } @@ -273,7 +271,7 @@ out_close_fd: out: #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (event->mask & FAN_ALL_PERM_EVENTS) { - event->response = FAN_DENY; + FANOTIFY_E(event)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); } #endif @@ -321,7 +319,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -409,7 +407,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -421,7 +419,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) + list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -906,6 +904,7 @@ static int __init fanotify_user_setup(void) fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); return 0; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4bb21d67d9b..7c754c91c3f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, int data_is, u32 cookie, - const unsigned char *file_name, - struct 
fsnotify_event **event) + const unsigned char *file_name) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -170,10 +169,10 @@ static int send_to_group(struct inode *to_tell, pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" - " data=%p data_is=%d cookie=%d event=%p\n", + " data=%p data_is=%d cookie=%d\n", __func__, group, to_tell, mask, inode_mark, inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, - data_is, cookie, *event); + data_is, cookie); if (!inode_test_mask && !vfsmount_test_mask) return 0; @@ -183,14 +182,9 @@ static int send_to_group(struct inode *to_tell, data_is) == false) return 0; - if (!*event) { - *event = fsnotify_create_event(to_tell, mask, data, - data_is, file_name, - cookie, GFP_KERNEL); - if (!*event) - return -ENOMEM; - } - return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); + return group->ops->handle_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, data_is, + file_name); } /* @@ -205,7 +199,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; - struct fsnotify_event *event = NULL; struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ @@ -258,18 +251,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - ret = send_to_group(to_tell, inode_mark, NULL, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, inode_mark, NULL, mask, + data, data_is, cookie, file_name); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, + data, data_is, cookie, file_name); inode_group = NULL; } else { ret = send_to_group(to_tell, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, file_name, - &event); + mask, data, data_is, cookie, + file_name); } if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) @@ -285,12 +278,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, idx); - /* - * fsnotify_create_event() took a reference so the event can't be cleaned - * up while we are still trying to add it to lists, drop that one. 
- */ - if (event) - fsnotify_put_event(event); return ret; } diff --git a/fs/notify/group.c b/fs/notify/group.c index bd2625bd88b..ee674fe2cec 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) INIT_LIST_HEAD(&group->marks_list); group->ops = ops; + fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); return group; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index b6642e4de4b..485eef3f440 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -2,11 +2,12 @@ #include #include /* struct kmem_cache */ -extern struct kmem_cache *event_priv_cachep; - -struct inotify_event_private_data { - struct fsnotify_event_private_data fsnotify_event_priv_data; +struct inotify_event_info { + struct fsnotify_event fse; int wd; + u32 sync_cookie; + int name_len; + char name[]; }; struct inotify_inode_mark { @@ -14,8 +15,18 @@ struct inotify_inode_mark { int wd; }; +static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct inotify_event_info, fse); +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); -extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); +extern int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 4216308b81b..6fabbd163d1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -34,100 +34,80 @@ #include "inotify.h" /* - * Check if 2 events contain the same information. We do not compare private data - * but at this moment that isn't a problem for any know fsnotify listeners. + * Check if 2 events contain the same information. 
*/ -static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +static bool event_compare(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - if ((old->mask == new->mask) && - (old->to_tell == new->to_tell) && - (old->data_type == new->data_type) && - (old->name_len == new->name_len)) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_INODE): - /* remember, after old was put on the wait_q we aren't - * allowed to look at the inode any more, only thing - * left to check was if the file_name is the same */ - if (!old->name_len || - !strcmp(old->file_name, new->file_name)) - return true; - break; - case (FSNOTIFY_EVENT_PATH): - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - if (old->mask & FS_Q_OVERFLOW) - return true; - else if (old->mask & FS_IN_IGNORED) - return false; - return true; - }; - } + struct inotify_event_info *old, *new; + + if (old_fsn->mask & FS_IN_IGNORED) + return false; + old = INOTIFY_E(old_fsn); + new = INOTIFY_E(new_fsn); + if ((old_fsn->mask == new_fsn->mask) && + (old_fsn->inode == new_fsn->inode) && + (old->name_len == new->name_len) && + (!old->name_len || !strcmp(old->name, new->name))) + return true; return false; } static struct fsnotify_event *inotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - /* and the list better be locked by something too */ - spin_lock(&event->lock); - - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); - last_event = last_holder->event; - if (event_compare(last_event, event)) - fsnotify_get_event(last_event); - else - last_event = NULL; - - spin_unlock(&event->lock); - + last_event = list_entry(list->prev, struct fsnotify_event, list); + if (!event_compare(last_event, event)) + return NULL; return last_event; } -static int inotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) +int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct inotify_inode_mark *i_mark; - struct inode *to_tell; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; + struct inotify_event_info *event; struct fsnotify_event *added_event; - int wd, ret = 0; + struct fsnotify_event *fsn_event; + int ret = 0; + int len = 0; + int alloc_len = sizeof(struct inotify_event_info); BUG_ON(vfsmount_mark); - pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, - event, event->to_tell, event->mask); + if (file_name) { + len = strlen(file_name); + alloc_len += len + 1; + } - to_tell = event->to_tell; + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - wd = i_mark->wd; - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) + event = kmalloc(alloc_len, GFP_KERNEL); + if (unlikely(!event)) return -ENOMEM; - fsn_event_priv = &event_priv->fsnotify_event_priv_data; + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->wd = i_mark->wd; + event->name_len = len; + if (len) + strcpy(event->name, file_name); - 
fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = wd; - - added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); + added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge); if (added_event) { - inotify_free_event_priv(fsn_event_priv); - if (!IS_ERR(added_event)) - fsnotify_put_event(added_event); - else + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + if (IS_ERR(added_event)) ret = PTR_ERR(added_event); } @@ -202,22 +182,15 @@ static void inotify_free_group_priv(struct fsnotify_group *group) free_uid(group->inotify_data.user); } -void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +static void inotify_free_event(struct fsnotify_event *fsn_event) { - struct inotify_event_private_data *event_priv; - - - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - - fsnotify_put_group(fsn_event_priv->group); - kmem_cache_free(event_priv_cachep, event_priv); + kfree(INOTIFY_E(fsn_event)); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, - .free_event_priv = inotify_free_event_priv, + .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1bb6dc8eaf1..497395c8274 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly; static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; -struct kmem_cache *event_priv_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -124,8 +123,11 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) return ret; } -static int round_event_name_len(struct fsnotify_event *event) +static int round_event_name_len(struct fsnotify_event *fsn_event) { + struct inotify_event_info *event; + + event = INOTIFY_E(fsn_event); if (!event->name_len) return 0; return roundup(event->name_len + 1, sizeof(struct inotify_event)); @@ -169,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, + struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; - struct fsnotify_event_private_data *fsn_priv; - struct inotify_event_private_data *priv; + struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); size_t name_len; size_t pad_name_len; - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - /* we get the inotify watch descriptor from the event private data */ - spin_lock(&event->lock); - fsn_priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - - if (!fsn_priv) - inotify_event.wd = -1; - else { - priv = container_of(fsn_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - inotify_event.wd = priv->wd; - inotify_free_event_priv(fsn_priv); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + event = INOTIFY_E(fsn_event); name_len = event->name_len; /* * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. 
*/ - pad_name_len = round_event_name_len(event); + pad_name_len = round_event_name_len(fsn_event); inotify_event.len = pad_name_len; - inotify_event.mask = inotify_mask_to_arg(event->mask); + inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ @@ -218,7 +207,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, */ if (pad_name_len) { /* copy the path name */ - if (copy_to_user(buf, event->file_name, name_len)) + if (copy_to_user(buf, event->name, name_len)) return -EFAULT; buf += name_len; @@ -257,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -300,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; - struct fsnotify_event *event; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -314,10 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) { - event = holder->event; + list_for_each_entry(fsn_event, &group->notification_list, + list) { send_len += sizeof(struct inotify_event); - send_len += round_event_name_len(event); + send_len += round_event_name_len(fsn_event); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -504,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; - struct fsnotify_event *ignored_event, *notify_event; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - int ret; - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_NOFS); - if (!ignored_event) - goto skip_send_ignore; - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); - if (unlikely(!event_priv)) - goto skip_send_ignore; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = i_mark->wd; - - notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); - if (notify_event) { - if (IS_ERR(notify_event)) - ret = PTR_ERR(notify_event); - else - fsnotify_put_event(notify_event); - inotify_free_event_priv(fsn_event_priv); - } - -skip_send_ignore: - /* matches the reference taken when the event was created */ - if (ignored_event) - fsnotify_put_event(ignored_event); + /* Queue ignore event for the watch */ + inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, + NULL, FSNOTIFY_EVENT_NONE, NULL); + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); @@ -837,7 +794,6 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); inotify_max_queued_events = 16384; 
inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 7b51b05f160..952237b8e2d 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -48,15 +48,6 @@ #include #include "fsnotify.h" -static struct kmem_cache *fsnotify_event_cachep; -static struct kmem_cache *fsnotify_event_holder_cachep; -/* - * This is a magic event we send when the q is too full. Since it doesn't - * hold real event information we just keep one system wide and use it any time - * it is needed. It's refcnt is set 1 at kernel init time and will never - * get set to 0 so it will never get 'freed' - */ -static struct fsnotify_event *q_overflow_event; static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); /** @@ -76,60 +67,14 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) return list_empty(&group->notification_list) ? true : false; } -void fsnotify_get_event(struct fsnotify_event *event) +void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event) { - atomic_inc(&event->refcnt); -} - -void fsnotify_put_event(struct fsnotify_event *event) -{ - if (!event) + /* Overflow events are per-group and we don't want to free them */ + if (!event || event->mask == FS_Q_OVERFLOW) return; - if (atomic_dec_and_test(&event->refcnt)) { - pr_debug("%s: event=%p\n", __func__, event); - - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_put(&event->path); - - BUG_ON(!list_empty(&event->private_data_list)); - - kfree(event->file_name); - put_pid(event->tgid); - kmem_cache_free(fsnotify_event_cachep, event); - } -} - -struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) -{ - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); -} - -void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) -{ - if (holder) - kmem_cache_free(fsnotify_event_holder_cachep, holder); -} - -/* - * Find the private data that the group previously attached to this event when - * the group added the event to the notification queue (fsnotify_add_notify_event) - */ -struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_event_private_data *lpriv; - struct fsnotify_event_private_data *priv = NULL; - - assert_spin_locked(&event->lock); - - list_for_each_entry(lpriv, &event->private_data_list, event_list) { - if (lpriv->group == group) { - priv = lpriv; - list_del(&priv->event_list); - break; - } - } - return priv; + group->ops->free_event(event); } /* @@ -137,91 +82,35 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot * event off the queue to deal with. If the event is successfully added to the * group's notification queue, a reference is taken on event. */ -struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, +struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, struct fsnotify_event *(*merge)(struct list_head *, struct fsnotify_event *)) { struct fsnotify_event *return_event = NULL; - struct fsnotify_event_holder *holder = NULL; struct list_head *list = &group->notification_list; - pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); - - /* - * There is one fsnotify_event_holder embedded inside each fsnotify_event. - * Check if we expect to be able to use that holder. If not alloc a new - * holder. 
- * For the overflow event it's possible that something will use the in - * event holder before we get the lock so we may need to jump back and - * alloc a new holder, this can't happen for most events... - */ - if (!list_empty(&event->holder.event_list)) { -alloc_holder: - holder = fsnotify_alloc_event_holder(); - if (!holder) - return ERR_PTR(-ENOMEM); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, event); mutex_lock(&group->notification_mutex); if (group->q_len >= group->max_events) { - event = q_overflow_event; - - /* - * we need to return the overflow event - * which means we need a ref - */ - fsnotify_get_event(event); + /* Queue overflow event only if it isn't already queued */ + if (list_empty(&group->overflow_event.list)) + event = &group->overflow_event; return_event = event; - - /* sorry, no private data on the overflow event */ - priv = NULL; } if (!list_empty(list) && merge) { - struct fsnotify_event *tmp; - - tmp = merge(list, event); - if (tmp) { - mutex_unlock(&group->notification_mutex); - - if (return_event) - fsnotify_put_event(return_event); - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - return tmp; - } - } - - spin_lock(&event->lock); - - if (list_empty(&event->holder.event_list)) { - if (unlikely(holder)) - fsnotify_destroy_event_holder(holder); - holder = &event->holder; - } else if (unlikely(!holder)) { - /* between the time we checked above and got the lock the in - * event holder was used, go back and get a new one */ - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - + return_event = merge(list, event); if (return_event) { - fsnotify_put_event(return_event); - return_event = NULL; + mutex_unlock(&group->notification_mutex); + return return_event; } - - goto alloc_holder; } group->q_len++; - holder->event = event; - - fsnotify_get_event(event); - list_add_tail(&holder->event_list, list); - if (priv) - list_add_tail(&priv->event_list, &event->private_data_list); - spin_unlock(&event->lock); + list_add_tail(&event->list, list); mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); @@ -230,32 +119,20 @@ alloc_holder: } /* - * Remove and return the first event from the notification list. There is a - * reference held on this event since it was on the list. It is the responsibility - * of the caller to drop this reference. + * Remove and return the first event from the notification list. 
It is the + * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_holder *holder; BUG_ON(!mutex_is_locked(&group->notification_mutex)); pr_debug("%s: group=%p\n", __func__, group); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - - event = holder->event; - - spin_lock(&event->lock); - holder->event = NULL; - list_del_init(&holder->event_list); - spin_unlock(&event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - + event = list_first_entry(&group->notification_list, + struct fsnotify_event, list); + list_del(&event->list); group->q_len--; return event; @@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group */ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) { - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - event = holder->event; - - return event; + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); } /* @@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_private_data *priv; mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_notify_event(group); - /* if they don't implement free_event_priv they better not have attached any */ - if (group->ops->free_event_priv) { - spin_lock(&event->lock); - priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - if (priv) - group->ops->free_event_priv(priv); - } - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); } -static void initialize_event(struct fsnotify_event *event) -{ - INIT_LIST_HEAD(&event->holder.event_list); - atomic_set(&event->refcnt, 1); - - spin_lock_init(&event->lock); - - INIT_LIST_HEAD(&event->private_data_list); -} - -/* - * Caller damn well better be holding whatever mutex is protecting the - * old_holder->event_list and the new_event must be a clean event which - * cannot be found anywhere else in the kernel. - */ -int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event) -{ - struct fsnotify_event *old_event = old_holder->event; - struct fsnotify_event_holder *new_holder = &new_event->holder; - - enum event_spinlock_class { - SPINLOCK_OLD, - SPINLOCK_NEW, - }; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); - - /* - * if the new_event's embedded holder is in use someone - * screwed up and didn't give us a clean new event. 
- */ - BUG_ON(!list_empty(&new_holder->event_list)); - - spin_lock_nested(&old_event->lock, SPINLOCK_OLD); - spin_lock_nested(&new_event->lock, SPINLOCK_NEW); - - new_holder->event = new_event; - list_replace_init(&old_holder->event_list, &new_holder->event_list); - - spin_unlock(&new_event->lock); - spin_unlock(&old_event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (old_holder != &old_event->holder) - fsnotify_destroy_event_holder(old_holder); - - fsnotify_get_event(new_event); /* on the list take reference */ - fsnotify_put_event(old_event); /* off the list, drop reference */ - - return 0; -} - -struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) -{ - struct fsnotify_event *event; - - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); - - memcpy(event, old_event, sizeof(*event)); - initialize_event(event); - - if (event->name_len) { - event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - } - event->tgid = get_pid(old_event->tgid); - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_get(&event->path); - - return event; -} - /* * fsnotify_create_event - Allocate a new event which will be sent to each * group's handle_event function if the group was interested in this * particular event. * - * @to_tell the inode which is supposed to receive the event (sometimes a + * @inode the inode which is supposed to receive the event (sometimes a * parent of the inode to which the event happened. * @mask what actually happened. * @data pointer to the object which was actually affected * @data_type flag indication if the data is a file, path, inode, nothing... 
* @name the filename, if available */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const unsigned char *name, - u32 cookie, gfp_t gfp) +void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, + u32 mask) { - struct fsnotify_event *event; - - event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); - if (!event) - return NULL; - - pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", - __func__, event, to_tell, mask, data, data_type); - - initialize_event(event); - - if (name) { - event->file_name = kstrdup(name, gfp); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - event->name_len = strlen(event->file_name); - } - - event->tgid = get_pid(task_tgid(current)); - event->sync_cookie = cookie; - event->to_tell = to_tell; - event->data_type = data_type; - - switch (data_type) { - case FSNOTIFY_EVENT_PATH: { - struct path *path = data; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - break; - } - case FSNOTIFY_EVENT_INODE: - event->inode = data; - break; - case FSNOTIFY_EVENT_NONE: - event->inode = NULL; - event->path.dentry = NULL; - event->path.mnt = NULL; - break; - default: - BUG(); - } - + INIT_LIST_HEAD(&event->list); + event->inode = inode; event->mask = mask; - - return event; -} - -static __init int fsnotify_notification_init(void) -{ - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); - - q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_KERNEL); - if (!q_overflow_event) - panic("unable to allocate fsnotify q_overflow_event\n"); - - return 0; } -subsys_initcall(fsnotify_notification_init); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 4b2ee8d12f5..7f3d7dcfcd0 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -15,7 +15,6 @@ #include /* struct path */ #include #include - #include /* @@ -79,6 +78,7 @@ struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; +struct fsnotify_fname; /* * Each group much define these ops. The fsnotify infrastructure will call @@ -99,12 +99,26 @@ struct fsnotify_ops { struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, int data_type); int (*handle_event)(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event); + u32 mask, void *data, int data_type, + const unsigned char *file_name); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); - void (*free_event_priv)(struct fsnotify_event_private_data *priv); + void (*free_event)(struct fsnotify_event *event); +}; + +/* + * all of the information about the original object we want to now send to + * a group. If you want to carry more info from the accessing task to the + * listener this structure is where you need to be adding fields. + */ +struct fsnotify_event { + struct list_head list; + /* inode may ONLY be dereferenced during handle_event(). 
*/ + struct inode *inode; /* either the inode the event happened to or its parent */ + u32 mask; /* the type of access, bitwise OR for FS_* event types */ }; /* @@ -148,7 +162,11 @@ struct fsnotify_group { * a group */ struct list_head marks_list; /* all inode marks for this group */ - struct fasync_struct *fsn_fa; /* async notification */ + struct fasync_struct *fsn_fa; /* async notification */ + + struct fsnotify_event overflow_event; /* Event we queue when the + * notification list is too + * full */ /* groups can define private fields here or use the void *private */ union { @@ -177,76 +195,10 @@ struct fsnotify_group { }; }; -/* - * A single event can be queued in multiple group->notification_lists. - * - * each group->notification_list will point to an event_holder which in turns points - * to the actual event that needs to be sent to userspace. - * - * Seemed cheaper to create a refcnt'd event and a small holder for every group - * than create a different event for every group - * - */ -struct fsnotify_event_holder { - struct fsnotify_event *event; - struct list_head event_list; -}; - -/* - * Inotify needs to tack data onto an event. This struct lets us later find the - * correct private data of the correct group. - */ -struct fsnotify_event_private_data { - struct fsnotify_group *group; - struct list_head event_list; -}; - -/* - * all of the information about the original object we want to now send to - * a group. If you want to carry more info from the accessing task to the - * listener this structure is where you need to be adding fields. - */ -struct fsnotify_event { - /* - * If we create an event we are also likely going to need a holder - * to link to a group. So embed one holder in the event. Means only - * one allocation for the common case where we only have one group - */ - struct fsnotify_event_holder holder; - spinlock_t lock; /* protection for the associated event_holder and private_list */ - /* to_tell may ONLY be dereferenced during handle_event(). */ - struct inode *to_tell; /* either the inode the event happened to or its parent */ - /* - * depending on the event type we should have either a path or inode - * We hold a reference on path, but NOT on inode. Since we have the ref on - * the path, it may be dereferenced at any point during this object's - * lifetime. That reference is dropped when this object's refcnt hits - * 0. If this event contains an inode instead of a path, the inode may - * ONLY be used during handle_event(). 
- */ - union { - struct path path; - struct inode *inode; - }; /* when calling fsnotify tell it if the data is a path or inode */ #define FSNOTIFY_EVENT_NONE 0 #define FSNOTIFY_EVENT_PATH 1 #define FSNOTIFY_EVENT_INODE 2 - int data_type; /* which of the above union we have */ - atomic_t refcnt; /* how many groups still are using/need to send this event */ - __u32 mask; /* the type of access, bitwise OR for FS_* event types */ - - u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ - const unsigned char *file_name; - size_t name_len; - struct pid *tgid; - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - __u32 response; /* userspace answer to question */ -#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ - - struct list_head private_data_list; /* groups can store private data here */ -}; /* * Inode specific fields in an fsnotify_mark @@ -370,17 +322,12 @@ extern void fsnotify_put_group(struct fsnotify_group *group); extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); -/* take a reference to an event */ -extern void fsnotify_get_event(struct fsnotify_event *event); -extern void fsnotify_put_event(struct fsnotify_event *event); -/* find private data previously attached to an event and unlink it */ -extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, - struct fsnotify_event *event); - +/* Free event from memory */ +extern void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event); /* attach the event to the group notification queue */ extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, struct fsnotify_event *(*merge)(struct list_head *, struct fsnotify_event *)); /* true if the group notification queue is empty */ @@ -430,15 +377,8 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct list_head *list); /* put here because inotify does some weird stuff when destroying watches */ -extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_is, - const unsigned char *name, - u32 cookie, gfp_t gfp); - -/* fanotify likes to change events after they are on lists... 
*/ -extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event); -extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event); +extern void fsnotify_init_event(struct fsnotify_event *event, + struct inode *to_tell, u32 mask); #else diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 43c307dc945..bcc0b182122 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -912,9 +912,11 @@ static void evict_chunk(struct audit_chunk *chunk) } static int audit_tree_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmonut_mark, - struct fsnotify_event *event) + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { BUG(); return -EOPNOTSUPP; @@ -945,7 +947,7 @@ static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, .should_send_event = audit_tree_send_event, .free_group_priv = NULL, - .free_event_priv = NULL, + .free_event = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 22831c4d369..a760c32cb63 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -475,25 +475,25 @@ static bool audit_watch_should_send_event(struct fsnotify_group *group, struct i /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *dname) { struct inode *inode; - __u32 mask = event->mask; - const char *dname = event->file_name; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - switch (event->data_type) { + switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = event->path.dentry->d_inode; + inode = ((struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = event->inode; + inode = (struct inode *)data; break; default: BUG(); @@ -516,7 +516,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = { .handle_event = audit_watch_handle_event, .free_group_priv = NULL, .freeing_mark = NULL, - .free_event_priv = NULL, + .free_event = NULL, }; static int __init audit_watch_init(void) -- cgit v1.2.3-70-g09d2 From 83c4c4b0a3aadc1ce7b5b2870ce1fc1f65498da0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:15 -0800 Subject: fsnotify: remove .should_send_event callback After removing event structure creation from the generic layer there is no reason for separate .should_send_event and .handle_event callbacks. So just remove the first one. 
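[A minimal sketch of the pattern this commit describes, with an invented function name (example_handle_event) and an invented early-return check; it is not taken from the diff below. After the change, the test that used to live in .should_send_event() simply becomes an early return at the top of the single .handle_event() callback:]

    static int example_handle_event(struct fsnotify_group *group,
                                    struct inode *inode,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
                                    u32 mask, void *data, int data_type,
                                    const unsigned char *file_name)
    {
            /* what a backend used to answer in .should_send_event() */
            if (data_type != FSNOTIFY_EVENT_PATH)
                    return 0;

            /* ... allocate, fill and queue the group-private event ... */
            return 0;
    }
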
Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 22 ++++------------------ fs/notify/fanotify/fanotify.c | 18 ++++++++++-------- fs/notify/fsnotify.c | 5 ----- fs/notify/inotify/inotify_fsnotify.c | 24 +++++++----------------- include/linux/fsnotify_backend.h | 4 ---- kernel/audit_tree.c | 12 +----------- kernel/audit_watch.c | 9 --------- 7 files changed, 22 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index bfca53dbbf3..928688e3ee2 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -94,6 +94,10 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fown_struct *fown; __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return 0; + BUG_ON(vfsmount_mark); dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); @@ -121,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group, return 0; } -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - return true; -} - static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, @@ -151,7 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, .free_group_priv = NULL, .freeing_mark = NULL, .free_event = NULL, diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c26268d7bd9..1f8f05220f8 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -88,18 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, +static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, - __u32 event_mask, void *data, int data_type) + u32 event_mask, + void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p inode=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, inode, - inode_mark, vfsmnt_mark, event_mask, data, data_type); + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, + event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) @@ -163,6 +162,10 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, + data_type)) + return 0; + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, mask); @@ -225,7 +228,6 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) 
const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, - .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, .freeing_mark = NULL, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7c754c91c3f..1d4e1ea2f37 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -177,11 +177,6 @@ static int send_to_group(struct inode *to_tell, if (!inode_test_mask && !vfsmount_test_mask) return 0; - if (group->ops->should_send_event(group, to_tell, inode_mark, - vfsmount_mark, mask, data, - data_is) == false) - return 0; - return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, file_name); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 6fabbd163d1..aad1a35e9af 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -81,6 +81,13 @@ int inotify_handle_event(struct fsnotify_group *group, BUG_ON(vfsmount_mark); + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return 0; + } if (file_name) { len = strlen(file_name); alloc_len += len + 1; @@ -122,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify inotify_ignored_and_remove_idr(fsn_mark, group); } -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; - - if (d_unlinked(path->dentry)) - return false; - } - - return true; -} - /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the @@ -189,7 +180,6 @@ static void inotify_free_event(struct fsnotify_event *fsn_event) const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7f3d7dcfcd0..7d8d5e60859 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -94,10 +94,6 @@ struct fsnotify_fname; * userspace messages that marks have been removed. 
*/ struct fsnotify_ops { - bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type); int (*handle_event)(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index bcc0b182122..ae8103b057f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -918,8 +918,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask, void *data, int data_type, const unsigned char *file_name) { - BUG(); - return -EOPNOTSUPP; + return 0; } static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) @@ -935,17 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify BUG_ON(atomic_read(&entry->refcnt) < 1); } -static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return false; -} - static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .should_send_event = audit_tree_send_event, .free_group_priv = NULL, .free_event = NULL, .freeing_mark = audit_tree_freeing_mark, diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index a760c32cb63..367ac9a79ac 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -465,14 +465,6 @@ void audit_remove_watch_rule(struct audit_krule *krule) } } -static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return true; -} - /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, @@ -512,7 +504,6 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .should_send_event = audit_watch_should_send_event, .handle_event = audit_watch_handle_event, .free_group_priv = NULL, .freeing_mark = NULL, -- cgit v1.2.3-70-g09d2 From 56b27cf6030dd36c56a5542ab8bfa406d337f083 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:16 -0800 Subject: fsnotify: remove pointless NULL initializers We usually rely on the fact that struct members not specified in the initializer are set to NULL. So do that with fsnotify function pointers as well. 
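[Why the explicit NULLs are redundant, shown as a sketch with invented names (example_ops, example_handle) rather than code from the patch: in C, members omitted from a designated initializer of an object with static storage duration are zero-initialized, so function pointers that are not named end up NULL without being spelled out.]

    struct example_ops {
            int  (*handle_event)(int mask);
            void (*freeing_mark)(void *mark);
            void (*free_event)(void *event);
    };

    static int example_handle(int mask)
    {
            return 0;
    }

    static const struct example_ops example_ops = {
            .handle_event = example_handle,
            /* .freeing_mark and .free_event are implicitly NULL */
    };
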
Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 3 --- fs/notify/fanotify/fanotify.c | 1 - kernel/audit_tree.c | 2 -- kernel/audit_watch.c | 3 --- 4 files changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 928688e3ee2..0b9ff4395e6 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -138,9 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event = NULL, }; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 1f8f05220f8..58772623f02 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -230,5 +230,4 @@ const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, - .freeing_mark = NULL, }; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ae8103b057f..67ccf0e7cca 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -936,8 +936,6 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .free_group_priv = NULL, - .free_event = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 367ac9a79ac..2596fac5dcb 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -505,9 +505,6 @@ static int audit_watch_handle_event(struct fsnotify_group *group, static const struct fsnotify_ops audit_watch_fsnotify_ops = { .handle_event = audit_watch_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event = NULL, }; static int __init audit_watch_init(void) -- cgit v1.2.3-70-g09d2 From ff8fb335221e2c446b0d4cbea26be371fd2feb64 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:20 -0800 Subject: ocfs2: remove versioning information The versioning information is confusing for end-users. The numbers are stuck at 1.5.0 when the tools version have moved to 1.8.2. Remove the versioning system in the OCFS2 modules and let the kernel version be the guide to debug issues. 
Signed-off-by: Goldwyn Rodrigues Acked-by: Sunil Mushran Cc: Mark Fasheh Acked-by: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/Makefile | 1 - fs/ocfs2/cluster/Makefile | 2 +- fs/ocfs2/cluster/nodemanager.c | 4 +--- fs/ocfs2/cluster/ver.c | 42 ----------------------------------------- fs/ocfs2/cluster/ver.h | 31 ------------------------------ fs/ocfs2/dlm/Makefile | 2 +- fs/ocfs2/dlm/dlmdomain.c | 5 +---- fs/ocfs2/dlm/dlmver.c | 42 ----------------------------------------- fs/ocfs2/dlm/dlmver.h | 31 ------------------------------ fs/ocfs2/dlmfs/Makefile | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 4 +--- fs/ocfs2/dlmfs/dlmfsver.c | 42 ----------------------------------------- fs/ocfs2/dlmfs/dlmfsver.h | 31 ------------------------------ fs/ocfs2/super.c | 4 +--- fs/ocfs2/ver.c | 43 ------------------------------------------ fs/ocfs2/ver.h | 31 ------------------------------ 16 files changed, 7 insertions(+), 310 deletions(-) delete mode 100644 fs/ocfs2/cluster/ver.c delete mode 100644 fs/ocfs2/cluster/ver.h delete mode 100644 fs/ocfs2/dlm/dlmver.c delete mode 100644 fs/ocfs2/dlm/dlmver.h delete mode 100644 fs/ocfs2/dlmfs/dlmfsver.c delete mode 100644 fs/ocfs2/dlmfs/dlmfsver.h delete mode 100644 fs/ocfs2/ver.c delete mode 100644 fs/ocfs2/ver.h (limited to 'fs') diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f17e58b3298..ce210d4951a 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -38,7 +38,6 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o \ quota_local.o \ quota_global.o \ xattr.o \ diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7d860..1aefc0350ec 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ - quorum.o tcp.o netdebug.o ver.o + quorum.o tcp.o netdebug.o diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb240647ca5..441c84e169e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -29,7 +29,6 @@ #include "heartbeat.h" #include "masklog.h" #include "sys.h" -#include "ver.h" /* for now we operate under the assertion that there can be only one * cluster active at a time. Changing this will require trickling @@ -945,8 +944,6 @@ static int __init init_o2nm(void) { int ret = -1; - cluster_print_version(); - ret = o2hb_init(); if (ret) goto out; @@ -984,6 +981,7 @@ out: MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management"); module_init(init_o2nm) module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6abad..00000000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include - -#include "ver.h" - -#define CLUSTER_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION - -void cluster_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3382c..00000000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef O2CLUSTER_VER_H -#define O2CLUSTER_VER_H - -void cluster_print_version(void); - -#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index c8a044efbb1..bd1aab1f49a 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ - dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8b3382abf84..33660a4a52f 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -43,8 +43,6 @@ #include "dlmdomain.h" #include "dlmdebug.h" -#include "dlmver.h" - #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" @@ -2328,8 +2326,6 @@ static int __init dlm_init(void) { int status; - dlm_print_version(); - status = dlm_init_mle_cache(); if (status) { mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); @@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); module_init(dlm_init); module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c deleted file mode 100644 index dfc0da4d158..00000000000 --- a/fs/ocfs2/dlm/dlmver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include - -#include "dlmver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION - -void dlm_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h deleted file mode 100644 index f674aee77a1..00000000000 --- a/fs/ocfs2/dlm/dlmver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLM_VER_H -#define DLM_VER_H - -void dlm_print_version(void); - -#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index f14be89a670..eed3db8c5b4 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o +ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index efa2b3d339e..09b7d9dac71 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -49,7 +49,6 @@ #include "stackglue.h" #include "userdlm.h" -#include "dlmfsver.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "cluster/masklog.h" @@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void) int status; int cleanup_inode = 0, cleanup_worker = 0; - dlmfs_print_version(); - status = bdi_init(&dlmfs_backing_dev_info); if (status) return status; @@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); module_init(init_dlmfs_fs) module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c deleted file mode 100644 index a733b3321f8..00000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include - -#include "dlmfsver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION - -void dlmfs_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h deleted file mode 100644 index f35eadbed25..00000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLMFS_VER_H -#define DLMFS_VER_H - -void dlmfs_print_version(void); - -#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c41492957aa..fcd595e5d2c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -68,7 +68,6 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" -#include "ver.h" #include "xattr.h" #include "quota.h" #include "refcounttree.h" @@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster file system"); struct mount_options { @@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void) { int status, i; - ocfs2_print_version(); - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) init_waitqueue_head(&ocfs2__ioend_wq[i]); diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c deleted file mode 100644 index e2488f4128a..00000000000 --- a/fs/ocfs2/ver.c +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include -#include - -#include "ver.h" - -#define OCFS2_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION - -void ocfs2_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h deleted file mode 100644 index d7395cb91d2..00000000000 --- a/fs/ocfs2/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef OCFS2_VER_H -#define OCFS2_VER_H - -void ocfs2_print_version(void); - -#endif /* OCFS2_VER_H */ -- cgit v1.2.3-70-g09d2 From c74a3bdd9b529d924d1abf986079b783dd105ace Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:21 -0800 Subject: ocfs2: add clustername to cluster connection This is an effort of removing ocfs2_controld.pcmk and getting ocfs2 DLM handling up to the times with respect to DLM (>=4.0.1) and corosync (2.3.x). AFAIK, cman also is being phased out for a unified corosync cluster stack. fs/dlm performs all the functions with respect to fencing and node management and provides the API's to do so for ocfs2. For all future references, DLM stands for fs/dlm code. The advantages are: + No need to run an additional userspace daemon (ocfs2_controld) + No controld device handling and controld protocol + Shifting responsibilities of node management to DLM layer For backward compatibility, we are keeping the controld handling code. Once enough time has passed we can remove a significant portion of the code. This was tested by using the kernel with changes on older unmodified tools. The kernel used ocfs2_controld as expected, and displayed the appropriate warning message. This feature requires modification in the userspace ocfs2-tools. The changes can be found at: https://github.com/goldwynr/ocfs2-tools branch: nocontrold Currently, not many checks are present in the userspace code, but that would change soon. This patch (of 6): Add clustername to cluster connection. 
Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 2 ++ fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/stackglue.c | 11 ++++++++--- fs/ocfs2/stackglue.h | 9 ++++++++- fs/ocfs2/super.c | 8 +++++--- 5 files changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3407b2c62b2..85852d62aa5 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) /* for now, uuid == domain */ status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->osb_cluster_name, + strlen(osb->osb_cluster_name), osb->uuid_str, strlen(osb->uuid_str), &lproto, ocfs2_do_node_down, osb, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3a903470c79..553f53cc73a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -387,6 +387,7 @@ struct ocfs2_super u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index cb7ec0b63dd..6537979b8ac 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - memcpy(new_conn->cc_name, group, grouplen); + strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; + strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); + new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; @@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group, if (cluster_stack_name[0]) stack_name = cluster_stack_name; - return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, - recovery_handler, recovery_data, conn); + return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, + lproto, recovery_handler, recovery_data, + conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 1ec56fdb8d0..6d90f4192c1 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -45,6 +45,9 @@ struct file_lock; */ #define GROUP_NAME_MAX 64 +/* This shadows OCFS2_CLUSTER_NAME_LEN */ +#define CLUSTER_NAME_MAX 16 + /* * ocfs2_protocol_version changes when ocfs2 does something different in @@ -97,8 +100,10 @@ struct ocfs2_locking_protocol { * locking compatibility. 
*/ struct ocfs2_cluster_connection { - char cc_name[GROUP_NAME_MAX]; + char cc_name[GROUP_NAME_MAX + 1]; int cc_namelen; + char cc_cluster_name[CLUSTER_NAME_MAX + 1]; + int cc_cluster_name_len; struct ocfs2_protocol_version cc_version; struct ocfs2_locking_protocol *cc_proto; void (*cc_recovery_handler)(int node_num, void *recovery_data); @@ -239,6 +244,8 @@ struct ocfs2_stack_plugin { /* Used by the filesystem */ int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index fcd595e5d2c..5445d72eb8e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2223,10 +2223,9 @@ static int ocfs2_initialize_super(struct super_block *sb, if (ocfs2_clusterinfo_valid(osb)) { osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - memcpy(osb->osb_cluster_stack, + strlcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN); - osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + OCFS2_STACK_LABEL_LEN + 1); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2235,6 +2234,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } + strlcpy(osb->osb_cluster_name, + OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, + OCFS2_CLUSTER_NAME_LEN + 1); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ -- cgit v1.2.3-70-g09d2 From 66e188fc3173eaeb32d3479758685e34889aff14 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:22 -0800 Subject: ocfs2: add DLM recovery callbacks These are the callbacks called by the fs/dlm code in case the membership changes. If there is a failure while/during calling any of these, the DLM creates a new membership and relays to the rest of the nodes. - recover_prep() is called when DLM understands a node is down. - recover_slot() is called once all nodes have acknowledged recover_prep and recovery can begin. - recover_done() is called once the recovery is complete. It returns the new membership. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1e231..4111855a4de 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -110,6 +110,8 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + atomic_t oc_this_node; + int oc_our_slot; }; struct ocfs2_control_private { @@ -799,6 +801,42 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct ocfs2_cluster_connection *conn = arg; + printk(KERN_INFO "ocfs2: Node %d/%d down. 
Initiating recovery.\n", + slot->nodeid, slot->slot); + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + int i; + + for (i = 0; i < num_slots; i++) + if (slots[i].slot == our_slot) { + atomic_set(&lc->oc_this_node, slots[i].nodeid); + break; + } + + lc->oc_our_slot = our_slot; +} + +const struct dlm_lockspace_ops ocfs2_ls_ops = { + .recover_prep = user_recover_prep, + .recover_slot = user_recover_slot, + .recover_done = user_recover_done, +}; + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; -- cgit v1.2.3-70-g09d2 From 24aa338611e9b02d045a1a99050135b0d49f41b5 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:23 -0800 Subject: ocfs2: shift allocation ocfs2_live_connection to user_connect() We perform this because the DLM recovery callbacks will require the ocfs2_live_connection structure to record the node information when dlm_new_lockspace() is updated (in the last patch of the series). Before calling dlm_new_lockspace(), we need the structure ready for the .recover_done() callback, which would set oc_this_node. This is the reason we allocate ocfs2_live_connection beforehand in user_connect(). [AKPM] rc initialization is not required because it assigned in case of errors. It will be cleared by compiler anyways. Signed-off-by: Goldwyn Rodrigues Reveiwed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 4111855a4de..0afb86d9b27 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -200,15 +200,10 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) * mount path. Since the VFS prevents multiple calls to * fill_super(), we can't get dupes here. 
*/ -static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, - struct ocfs2_live_connection **c_ret) +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection *c) { int rc = 0; - struct ocfs2_live_connection *c; - - c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!c) - return -ENOMEM; mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; @@ -222,12 +217,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, } mutex_unlock(&ocfs2_control_lock); - - if (!rc) - *c_ret = c; - else - kfree(c); - return rc; } @@ -840,12 +829,18 @@ const struct dlm_lockspace_ops ocfs2_ls_ops = { static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *uninitialized_var(control); - int rc = 0; + struct ocfs2_live_connection *lc; + int rc; BUG_ON(conn == NULL); - rc = ocfs2_live_connection_new(conn, &control); + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!lc) { + rc = -ENOMEM; + goto out; + } + + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; @@ -861,20 +856,24 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; - ocfs2_live_connection_drop(control); + ocfs2_live_connection_drop(lc); + lc = NULL; goto out; } rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, NULL, NULL, NULL, &fsdlm); if (rc) { - ocfs2_live_connection_drop(control); + ocfs2_live_connection_drop(lc); + lc = NULL; goto out; } - conn->cc_private = control; + conn->cc_private = lc; conn->cc_lockspace = fsdlm; out: + if (rc && lc) + kfree(lc); return rc; } -- cgit v1.2.3-70-g09d2 From 3e8341516409d026636be4d7534b84e6e90bef37 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:24 -0800 Subject: ocfs2: pass ocfs2_cluster_connection to ocfs2_this_node This is done to differentiate between using and not using controld and use the connection information accordingly. We need to be backward compatible. So, we use a new enum ocfs2_connection_type to identify when controld is used and when it is not. 
Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 2 +- fs/ocfs2/stack_o2cb.c | 3 ++- fs/ocfs2/stack_user.c | 16 ++++++++++++++-- fs/ocfs2/stackglue.c | 5 +++-- fs/ocfs2/stackglue.h | 6 ++++-- 5 files changed, 24 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 85852d62aa5..19986959d14 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3007,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } - status = ocfs2_cluster_this_node(&osb->node_num); + status = ocfs2_cluster_this_node(conn, &osb->node_num); if (status < 0) { mlog_errno(status); mlog(ML_ERROR, diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bf1f8930456..1724d43d3da 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) return 0; } -static int o2cb_cluster_this_node(unsigned int *node) +static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { int node_num; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0afb86d9b27..22b95acc2a8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -103,6 +103,11 @@ #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +enum ocfs2_connection_type { + WITH_CONTROLD, + NO_CONTROLD +}; + /* * ocfs2_live_connection is refcounted because the filesystem and * miscdevice sides can detach in different order. Let's just be safe. @@ -110,6 +115,7 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; atomic_t oc_this_node; int oc_our_slot; }; @@ -840,6 +846,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) goto out; } + lc->oc_type = WITH_CONTROLD; rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; @@ -886,11 +893,16 @@ static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) return 0; } -static int user_cluster_this_node(unsigned int *this_node) +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *this_node) { int rc; + struct ocfs2_live_connection *lc = conn->cc_private; - rc = ocfs2_control_get_this_node(); + if (lc->oc_type == WITH_CONTROLD) + rc = ocfs2_control_get_this_node(); + else + rc = -EINVAL; if (rc < 0) return rc; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 6537979b8ac..1324e6600e5 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -465,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); -int ocfs2_cluster_this_node(unsigned int *node) +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { - return active_stack->sp_ops->this_node(node); + return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 6d90f4192c1..66334a30cea 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -157,7 +157,8 @@ struct ocfs2_stack_operations { * ->this_node() returns the cluster's unique identifier for the * local node. */ - int (*this_node)(unsigned int *node); + int (*this_node)(struct ocfs2_cluster_connection *conn, + unsigned int *node); /* * Call the underlying dlm lock function. 
The ->dlm_lock() @@ -267,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group, int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending); void ocfs2_cluster_hangup(const char *group, int grouplen); -int ocfs2_cluster_this_node(unsigned int *node); +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node); struct ocfs2_lock_res; int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, -- cgit v1.2.3-70-g09d2 From 415036303319c0a46caf9059711710bfa34e3bf3 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:25 -0800 Subject: ocfs2: framework for version LVB Use the native DLM locks for version control negotiation. Most of the framework is taken from gfs2/lock_dlm.c Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 22b95acc2a8..d3867100aca 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -102,6 +102,7 @@ #define OCFS2_TEXT_UUID_LEN 32 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +#define VERSION_LOCK "version_lock" enum ocfs2_connection_type { WITH_CONTROLD, @@ -118,6 +119,9 @@ struct ocfs2_live_connection { enum ocfs2_connection_type oc_type; atomic_t oc_this_node; int oc_our_slot; + struct dlm_lksb oc_version_lksb; + char oc_lvb[DLM_LVB_LEN]; + struct completion oc_sync_wait; }; struct ocfs2_control_private { @@ -796,6 +800,103 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + ver->pv_major = pv->pv_major; + ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. 
+ */ + pv->pv_major = ver->pv_major; + pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, + struct dlm_lksb *lksb, char *name) +{ + int error; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); + if (error) { + printk(KERN_ERR "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + printk(KERN_ERR "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, + int mode, uint32_t flags, + struct dlm_lksb *lksb, char *name) +{ + int error, status; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, + name, strlen(name), + 0, sync_wait_cb, conn, NULL); + if (error) { + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, + int flags) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_lock(conn, mode, flags, + &lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + static void user_recover_prep(void *arg) { } -- cgit v1.2.3-70-g09d2 From c994c2ebdbbc391a42f177c8eb7882ebf3f142d8 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 21 Jan 2014 15:48:32 -0800 Subject: ocfs2: use the new DLM operation callbacks while requesting new lockspace Attempt to use the new DLM operations. If it is not supported, use the traditional ocfs2_controld. To exchange ocfs2 versioning, we use the LVB of the version dlm lock. It first attempts to take the lock in EX mode (non-blocking). If successful (which means it is the first mount), it writes the version number and downconverts to PR lock. If it is unsuccessful, it reads the version from the lock. If this becomes the standard (with o2cb as well), it could simplify userspace tools to check if the filesystem is mounted on other nodes. Dan: Since ocfs2_protocol_version are two u8 values, the additional checks with LONG* don't make sense. 
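The negotiation described above is a "first mounter publishes, later mounters read" pattern. A self-contained userspace sketch of the same idea, using flock(2) on a scratch file in place of the DLM lock and its LVB (purely illustrative; the real logic is get_protocol_version() in the hunk below):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
	char lvb[16] = "";
	int fd = open("/tmp/version_lock", O_RDWR | O_CREAT, 0600);

	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	if (flock(fd, LOCK_EX | LOCK_NB) == 0) {
		/* "First mount": publish our protocol version ... */
		if (write(fd, "1.0", 3) != 3)
			perror("write");
		/* ... then downconvert so later mounters can read it. */
		flock(fd, LOCK_SH);
		puts("first mounter: wrote version 1.0");
	} else {
		/* Someone mounted first: take the shared lock and read
		 * whatever version they negotiated. */
		flock(fd, LOCK_SH);
		if (read(fd, lvb, sizeof(lvb) - 1) < 0)
			perror("read");
		printf("later mounter: read version '%s'\n", lvb);
	}

	close(fd);
	return EXIT_SUCCESS;
}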
Signed-off-by: Goldwyn Rodrigues Signed-off-by: Dan Carpenter Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 126 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 102 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index d3867100aca..673ab40eb81 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "stackglue.h" @@ -122,6 +123,7 @@ struct ocfs2_live_connection { struct dlm_lksb oc_version_lksb; char oc_lvb[DLM_LVB_LEN]; struct completion oc_sync_wait; + wait_queue_head_t oc_wait; }; struct ocfs2_control_private { @@ -218,7 +220,7 @@ static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; - if (atomic_read(&ocfs2_control_opened)) + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) list_add(&c->oc_list, &ocfs2_live_connection_list); else { printk(KERN_ERR @@ -897,6 +899,55 @@ static int version_unlock(struct ocfs2_cluster_connection *conn) return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); } +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + * version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + * taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ + int ret; + struct ocfs2_live_connection *lc = conn->cc_private; + struct ocfs2_protocol_version pv; + + running_proto.pv_major = + ocfs2_user_plugin.sp_max_proto.pv_major; + running_proto.pv_minor = + ocfs2_user_plugin.sp_max_proto.pv_minor; + + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; + ret = version_lock(conn, DLM_LOCK_EX, + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); + if (!ret) { + conn->cc_version.pv_major = running_proto.pv_major; + conn->cc_version.pv_minor = running_proto.pv_minor; + version_to_lvb(&running_proto, lc->oc_lvb); + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + } else if (ret == -EAGAIN) { + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); + if (ret) + goto out; + lvb_to_version(lc->oc_lvb, &pv); + + if ((pv.pv_major != running_proto.pv_major) || + (pv.pv_minor > running_proto.pv_minor)) { + ret = -EINVAL; + goto out; + } + + conn->cc_version.pv_major = pv.pv_major; + conn->cc_version.pv_minor = pv.pv_minor; + } +out: + return ret; +} + static void user_recover_prep(void *arg) { } @@ -925,6 +976,7 @@ static void user_recover_done(void *arg, struct dlm_slot *slots, } lc->oc_our_slot = our_slot; + wake_up(&lc->oc_wait); } const struct dlm_lockspace_ops ocfs2_ls_ops = { @@ -933,11 +985,21 @@ const struct dlm_lockspace_ops ocfs2_ls_ops = { .recover_done = user_recover_done, }; +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + version_unlock(conn); + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; struct ocfs2_live_connection *lc; - int rc; + int rc, ops_rv; BUG_ON(conn == NULL); @@ -947,11 
+1009,44 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) goto out; } - lc->oc_type = WITH_CONTROLD; + init_waitqueue_head(&lc->oc_wait); + init_completion(&lc->oc_sync_wait); + atomic_set(&lc->oc_this_node, 0); + conn->cc_private = lc; + lc->oc_type = NO_CONTROLD; + + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, + DLM_LSFL_FS, DLM_LVB_LEN, + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); + if (rc) + goto out; + + if (ops_rv == -EOPNOTSUPP) { + lc->oc_type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { + rc = ops_rv; + goto out; + } + conn->cc_lockspace = fsdlm; + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; + if (lc->oc_type == NO_CONTROLD) { + rc = get_protocol_version(conn); + if (rc) { + printk(KERN_ERR "ocfs2: Could not determine" + " locking version\n"); + user_cluster_disconnect(conn); + goto out; + } + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); + } + /* * running_proto must have been set before we allowed any mounts * to proceed. @@ -959,40 +1054,20 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) if (fs_protocol_compare(&running_proto, &conn->cc_version)) { printk(KERN_ERR "Unable to mount with fs locking protocol version " - "%u.%u because the userspace control daemon has " - "negotiated %u.%u\n", + "%u.%u because negotiated protocol is %u.%u\n", conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; ocfs2_live_connection_drop(lc); lc = NULL; - goto out; - } - - rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, - NULL, NULL, NULL, &fsdlm); - if (rc) { - ocfs2_live_connection_drop(lc); - lc = NULL; - goto out; } - conn->cc_private = lc; - conn->cc_lockspace = fsdlm; out: if (rc && lc) kfree(lc); return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) -{ - dlm_release_lockspace(conn->cc_lockspace, 2); - conn->cc_lockspace = NULL; - ocfs2_live_connection_drop(conn->cc_private); - conn->cc_private = NULL; - return 0; -} static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, unsigned int *this_node) @@ -1002,8 +1077,11 @@ static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, if (lc->oc_type == WITH_CONTROLD) rc = ocfs2_control_get_this_node(); + else if (lc->oc_type == NO_CONTROLD) + rc = atomic_read(&lc->oc_this_node); else rc = -EINVAL; + if (rc < 0) return rc; -- cgit v1.2.3-70-g09d2 From 0a2fcd8988ac682f443fd5b0a7c48154a7b42ef2 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Tue, 21 Jan 2014 15:48:33 -0800 Subject: ocfs2: remove redundant ocfs2_alloc_dinode_update_counts() and ocfs2_block_group_set_bits() ocfs2_alloc_dinode_update_counts() and ocfs2_block_group_set_bits() are already provided in suballoc.c. So, the same functions in move_extents.c are not needed any more. Declare the functions in suballoc.h and remove redundant functions in move_extents.c. 
Signed-off-by: Younger Liu Cc: Younger Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/move_extents.c | 77 ------------------------------------------------- fs/ocfs2/suballoc.c | 12 ++------ fs/ocfs2/suballoc.h | 12 ++++++++ 3 files changed, 14 insertions(+), 87 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 631a9821347..64c304d668f 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, - u32 num_bits, - u16 chain) -{ - int ret; - u32 tmp_used; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_chain_list *cl = - (struct ocfs2_chain_list *) &di->id2.i_chain; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - ocfs2_journal_dirty(handle, di_bh); - -out: - return ret; -} - -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits) -{ - int status; - void *bitmap = bg->bg_bitmap; - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - - /* All callers get the descriptor via - * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ - BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); - BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, - num_bits); - - if (ocfs2_is_cluster_bitmap(alloc_inode)) - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - - status = ocfs2_journal_access_gd(handle, - INODE_CACHE(alloc_inode), - group_bh, - journal_type); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. 
num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; - } - while (num_bits--) - ocfs2_set_bit(bit_off++, bitmap); - - ocfs2_journal_dirty(handle, group_bh); - -bail: - return status; -} - static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, u32 len, int ext_flags) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 2c91452c404..47ae2663a6f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, @@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, return status; } -static inline int ocfs2_block_group_set_bits(handle_t *handle, +int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, @@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, ocfs2_journal_dirty(handle, group_bh); bail: - if (status) - mlog_errno(status); return status; } @@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode, return ret; } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, +int ocfs2_alloc_dinode_update_counts(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, u32 num_bits, diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a36d0aa5091..218d8036b3e 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac); +int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, -- cgit v1.2.3-70-g09d2 From 19e8ac2721148a475833d7b5f893f9b81fbb0045 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 21 Jan 2014 15:48:34 -0800 Subject: ocfs2: return EOPNOTSUPP if the device does not support discard For FITRIM ioctl(2), we should return EOPNOTSUPP to inform the user that the storage device does not support discard if it is, otherwise return success would confuse the user even though there is no free blocks were trimmed at all. 
Signed-off-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index fa32ce9b455..4de6a2af592 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -966,12 +967,16 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; -- cgit v1.2.3-70-g09d2 From aa89762c54800208d5afdcd8e6bf124818f17fe0 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 21 Jan 2014 15:48:35 -0800 Subject: ocfs2: return EINVAL if the given range to discard is less than block size For FITRIM ioctl(2), we should not keep silence if the given range length ls less than a block size as there is no data blocks would be discareded. Hence it should return EINVAL instead. This issue can be verified via xfstests/generic/288 which is used for FITRIM argument handling tests. Signed-off-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index dc7411fe185..8750ae1b863 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7260,14 +7260,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; minlen = range->minlen >> osb->s_clustersize_bits; - trimmed = 0; - - if (!len) { - range->len = 0; - return 0; - } - if (minlen >= osb->bitmap_cpg) + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; main_bm_inode = ocfs2_get_system_file_inode(osb, @@ -7293,6 +7287,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out_unlock; } + len = range->len >> osb->s_clustersize_bits; if (start + len > le32_to_cpu(main_bm->i_clusters)) len = le32_to_cpu(main_bm->i_clusters) - start; @@ -7307,6 +7302,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); last_bit = osb->bitmap_cpg; + trimmed = 0; for (group = first_group; group <= last_group;) { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; -- cgit v1.2.3-70-g09d2 From 1ba2212bb3e9527a4b3d18692f5ffe204f29bb47 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 21 Jan 2014 15:48:36 -0800 Subject: ocfs2: adjust minlen with discard_granularity in the FITRIM ioctl Adjust minlen with discard_granularity for FITRIM ioctl(2) if the given minimum size in bytes is less than it because, discard granularity is used to tell us that the minimum size of extent that can be discarded by the storage device. This is inspired by ext4 commit 5c2ed62fd447 ("ext4: Adjust minlen with discard_granularity in the FITRIM ioctl") from Lukas Czerner. 
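A worked example of the clamp: with a hypothetical device advertising a 1 MiB discard_granularity and a caller passing minlen = 4 KiB, the effective minimum becomes max(1 MiB, 4 KiB) = 1 MiB, so extents smaller than what the device can discard are never submitted. The same arithmetic as a stand-alone snippet:

#include <stdint.h>
#include <stdio.h>

static uint64_t max_u64(uint64_t a, uint64_t b)
{
	return a > b ? a : b;
}

int main(void)
{
	uint64_t discard_granularity = 1024 * 1024;	/* hypothetical: 1 MiB */
	uint64_t requested_minlen = 4096;		/* caller asked for 4 KiB */

	/* Same clamp as the patch: minlen = max(granularity, minlen). */
	uint64_t effective = max_u64(discard_granularity, requested_minlen);

	printf("effective minlen: %llu bytes\n", (unsigned long long)effective);
	return 0;
}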
Signed-off-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 4de6a2af592..8ca3c29accb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -980,6 +980,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; + range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; -- cgit v1.2.3-70-g09d2 From 16eac4be46fdd19e4e0bcd06e77e947266d2cf35 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 21 Jan 2014 15:48:37 -0800 Subject: ocfs2: fix sparse non static symbol warning Fixes the following sparse warning: fs/ocfs2/stack_user.c:930:32: warning: symbol 'ocfs2_ls_ops' was not declared. Should it be static? Signed-off-by: Wei Yongjun Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 673ab40eb81..13a8537d8e8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -979,7 +979,7 @@ static void user_recover_done(void *arg, struct dlm_slot *slots, wake_up(&lc->oc_wait); } -const struct dlm_lockspace_ops ocfs2_ls_ops = { +static const struct dlm_lockspace_ops ocfs2_ls_ops = { .recover_prep = user_recover_prep, .recover_slot = user_recover_slot, .recover_done = user_recover_done, -- cgit v1.2.3-70-g09d2 From a2a3b39824e152ecf0e7357ccc7b9d6fd4b9fe7e Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Tue, 21 Jan 2014 15:48:38 -0800 Subject: ocfs2: punch hole should return EINVAL if the length argument in ioctl is negative An unreserve space ioctl OCFS2_IOC_UNRESVSP/64 should reject a negative length. Orabug:14789508 Signed-off-by: Tariq Saseed Signed-off-by: Srinivas Eeda Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6fff128cad1..f42eecef647 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1869,7 +1869,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } size = sr->l_start + sr->l_len; - if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || + cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { if (sr->l_len <= 0) { ret = -EINVAL; goto out_inode_unlock; -- cgit v1.2.3-70-g09d2 From 75f82eaa502c75d881c1db7979f3c2bf2da6865f Mon Sep 17 00:00:00 2001 From: Yiwen Jiang Date: Tue, 21 Jan 2014 15:48:39 -0800 Subject: ocfs2: fix NULL pointer dereference when dismount and ocfs2rec simultaneously 2 nodes cluster, say Node A and Node B, mount the same ocfs2 volume, and create a file 1. 
Node A opens file 1 and takes the open lock; file 1 is then removed and added to the orphan_dir. Node A's storage link goes down (o2hb_write_timeout ->o2quo_disk_timeout ->emergency_restart), and at that moment Node B dismounts and does ocfs2rec simultaneously: 1) ocfs2_dismount_volume ->ocfs2_recovery_exit ->wait_event(osb->recovery_event) ->flush_workqueue(ocfs2_wq) 2) ocfs2rec ->queue_work(&journal->j_recovery_work) ->ocfs2_recover_orphans ->ocfs2_commit_truncate ->queue_delayed_work(&osb->osb_truncate_log_wq) In ocfs2_recovery_exit, it flushes the workqueue and then releases the system inodes. When doing ocfs2rec, it will call ocfs2_flush_truncate_log, which will try to get sys_root_inode, and a NULL pointer dereference occurs. Signed-off-by: Yiwen Jiang Signed-off-by: joyce Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5445d72eb8e..49d84f80f36 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1945,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); - ocfs2_truncate_log_shutdown(osb); - /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); + /* + * During dismount, when it recovers another node it will call + * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. + */ + ocfs2_truncate_log_shutdown(osb); + ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); -- cgit v1.2.3-70-g09d2 From 0afaa12047a45ebe651f29a3b4818e523f862c28 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Jan 2014 15:48:42 -0800 Subject: posix_acl: uninlining Uninline vast tracts of nested inline functions in include/linux/posix_acl.h. This reduces the text+data+bss size of x86_64 allyesconfig vmlinux by 8026 bytes. The patch also regularises the positioning of the EXPORT_SYMBOLs in posix_acl.c. Cc: Alexander Viro Cc: J.
Bruce Fields Cc: Trond Myklebust Tested-by: Benny Halevy Cc: Benny Halevy Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/posix_acl.c | 84 ++++++++++++++++++++++++++++++++++++++++++++--- include/linux/posix_acl.h | 78 ++++--------------------------------------- 2 files changed, 85 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bd2135b7f8..021e7c069b8 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -22,11 +22,80 @@ #include -EXPORT_SYMBOL(posix_acl_init); -EXPORT_SYMBOL(posix_acl_alloc); -EXPORT_SYMBOL(posix_acl_valid); -EXPORT_SYMBOL(posix_acl_equiv_mode); -EXPORT_SYMBOL(posix_acl_from_mode); +struct posix_acl **acl_by_type(struct inode *inode, int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return &inode->i_acl; + case ACL_TYPE_DEFAULT: + return &inode->i_default_acl; + default: + BUG(); + } +} +EXPORT_SYMBOL(acl_by_type); + +struct posix_acl *get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *acl = ACCESS_ONCE(*p); + if (acl) { + spin_lock(&inode->i_lock); + acl = *p; + if (acl != ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } + return acl; +} +EXPORT_SYMBOL(get_cached_acl); + +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) +{ + return rcu_dereference(*acl_by_type(inode, type)); +} +EXPORT_SYMBOL(get_cached_acl_rcu); + +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + rcu_assign_pointer(*p, posix_acl_dup(acl)); + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(set_cached_acl); + +void forget_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + *p = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(forget_cached_acl); + +void forget_all_cached_acls(struct inode *inode) +{ + struct posix_acl *old_access, *old_default; + spin_lock(&inode->i_lock); + old_access = inode->i_acl; + old_default = inode->i_default_acl; + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old_access != ACL_NOT_CACHED) + posix_acl_release(old_access); + if (old_default != ACL_NOT_CACHED) + posix_acl_release(old_default); +} +EXPORT_SYMBOL(forget_all_cached_acls); /* * Init a fresh posix_acl @@ -37,6 +106,7 @@ posix_acl_init(struct posix_acl *acl, int count) atomic_set(&acl->a_refcount, 1); acl->a_count = count; } +EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. @@ -51,6 +121,7 @@ posix_acl_alloc(int count, gfp_t flags) posix_acl_init(acl, count); return acl; } +EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. @@ -146,6 +217,7 @@ posix_acl_valid(const struct posix_acl *acl) return 0; return -EINVAL; } +EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional @@ -186,6 +258,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } +EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. 
@@ -207,6 +280,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } +EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 7931efe7117..fb616942e4c 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -94,78 +94,12 @@ extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); -#ifdef CONFIG_FS_POSIX_ACL -static inline struct posix_acl **acl_by_type(struct inode *inode, int type) -{ - switch (type) { - case ACL_TYPE_ACCESS: - return &inode->i_acl; - case ACL_TYPE_DEFAULT: - return &inode->i_default_acl; - default: - BUG(); - } -} - -static inline struct posix_acl *get_cached_acl(struct inode *inode, int type) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *acl = ACCESS_ONCE(*p); - if (acl) { - spin_lock(&inode->i_lock); - acl = *p; - if (acl != ACL_NOT_CACHED) - acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); - } - return acl; -} - -static inline struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) -{ - return rcu_dereference(*acl_by_type(inode, type)); -} - -static inline void set_cached_acl(struct inode *inode, - int type, - struct posix_acl *acl) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *old; - spin_lock(&inode->i_lock); - old = *p; - rcu_assign_pointer(*p, posix_acl_dup(acl)); - spin_unlock(&inode->i_lock); - if (old != ACL_NOT_CACHED) - posix_acl_release(old); -} - -static inline void forget_cached_acl(struct inode *inode, int type) -{ - struct posix_acl **p = acl_by_type(inode, type); - struct posix_acl *old; - spin_lock(&inode->i_lock); - old = *p; - *p = ACL_NOT_CACHED; - spin_unlock(&inode->i_lock); - if (old != ACL_NOT_CACHED) - posix_acl_release(old); -} - -static inline void forget_all_cached_acls(struct inode *inode) -{ - struct posix_acl *old_access, *old_default; - spin_lock(&inode->i_lock); - old_access = inode->i_acl; - old_default = inode->i_default_acl; - inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; - spin_unlock(&inode->i_lock); - if (old_access != ACL_NOT_CACHED) - posix_acl_release(old_access); - if (old_default != ACL_NOT_CACHED) - posix_acl_release(old_default); -} -#endif +struct posix_acl **acl_by_type(struct inode *inode, int type); +struct posix_acl *get_cached_acl(struct inode *inode, int type); +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); +void forget_cached_acl(struct inode *inode, int type); +void forget_all_cached_acls(struct inode *inode); static inline void cache_no_acl(struct inode *inode) { -- cgit v1.2.3-70-g09d2 From 38316c8ab7f17bc1be8f2898278d5f5131e18bf2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Jan 2014 15:48:43 -0800 Subject: fs/compat_ioctl.c: fix an underflow issue (harmless) We cap "nmsgs" at I2C_RDRW_IOCTL_MAX_MSGS (42) but the current code allows negative values. It's harmless but it makes my static checker upset so I've made nsmgs unsigned. 
Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index dc52e13d58e..3881610b643 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; compat_caddr_t datap; - int nmsgs, i; + u32 nmsgs; + int i; if (get_user(nmsgs, &udata->nmsgs)) return -EFAULT; -- cgit v1.2.3-70-g09d2 From 4e4f9e66a75921aa260c2f5bf626bdea54e51ba2 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 21 Jan 2014 15:48:44 -0800 Subject: fs/read_write.c:compat_readv(): remove bogus area verify The compat_do_readv_writev() function was doing a verify_area on the incoming iov, but the nr_segs value is not checked. If someone passes in a -1 for nr_segs, for instance, the function should return an EINVAL. However, it returns a EFAULT because the verify_area fails because it is checking an array of size MAX_UINT. The check is bogus, anyway, because the next check, compat_rw_copy_check_uvector(), will do all the necessary checking, anyway. The non-compat do_readv_writev() function doesn't do this check, so I think it's safe to just remove the code. Signed-off-by: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/read_write.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 58e440df1bc..1193ffd0356 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, io_fn_t fn; iov_fn_t fnv; - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); if (ret <= 0) -- cgit v1.2.3-70-g09d2 From b5bd856a0c2a6331ee3300fb589aeea56eba110b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 21 Jan 2014 15:48:45 -0800 Subject: fs/super.c: fix WARN on alloc_super() fail path On fail path alloc_super() calls destroy_super(), which issues a warning if the sb's s_mounts list is not empty, in particular if it has not been initialized. That said s_mounts must be initialized in alloc_super() before any possible failure, but currently it is initialized close to the end of the function leading to a useless warning dumped to log if either percpu_counter_init() or list_lru_init() fails. Let's fix this. 
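The rule being applied generalizes beyond alloc_super(): anything the shared destructor inspects must be valid before the first operation that can fail. A rough user-space sketch of the same pattern, with invented names that are not taken from fs/super.c:

#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *next, *prev; };

static void init_list(struct list_head *l) { l->next = l->prev = l; }
static int list_is_empty(const struct list_head *l) { return l->next == l; }

struct object {
	struct list_head mounts;        /* inspected by destroy_object() */
	void *payload;                  /* allocated later, may fail */
};

static void destroy_object(struct object *o)
{
	/* Mirrors destroy_super(): complain if 'mounts' looks non-empty. */
	if (!list_is_empty(&o->mounts))
		fprintf(stderr, "destroying object with live mounts?\n");
	free(o->payload);
	free(o);
}

static struct object *alloc_object(void)
{
	struct object *o = calloc(1, sizeof(*o));

	if (!o)
		return NULL;
	init_list(&o->mounts);          /* before any failure path */

	o->payload = malloc(4096);
	if (!o->payload) {
		destroy_object(o);      /* safe: the list head is already valid */
		return NULL;
	}
	return o;
}

int main(void)
{
	struct object *o = alloc_object();

	if (o)
		destroy_object(o);
	return 0;
}

If init_list() were moved after the malloc(), the zeroed list head would look non-empty to destroy_object() on the error path, which is exactly the kind of spurious warning the patch removes.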
Signed-off-by: Vladimir Davydov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index e5f6c2cfac3..cecd780e0f4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (!s) return NULL; + INIT_LIST_HEAD(&s->s_mounts); + if (security_sb_alloc(s)) goto fail; @@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (list_lru_init(&s->s_inode_lru)) goto fail; - INIT_LIST_HEAD(&s->s_mounts); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* -- cgit v1.2.3-70-g09d2 From af52b040eba5e6982d8665af8cd4dd69a466d5c3 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 21 Jan 2014 15:48:46 -0800 Subject: fs/ramfs: don't use module_init for non-modular core code The ramfs is always built in. It will never be modular, so using module_init as an alias for __initcall is rather misleading. Fix this up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of fs_initcall (which makes sense for fs code) will thus change this registration from level 6-device to level 5-fs (i.e. slightly earlier). However no observable impact of that small difference has been observed during testing, or is expected. Also note that this change uncovers a missing semicolon bug in the registration of the initcall. Signed-off-by: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 39d14659a8d..6a3e2c42018 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -275,4 +275,4 @@ int __init init_ramfs_fs(void) return err; } -module_init(init_ramfs_fs) +fs_initcall(init_ramfs_fs); -- cgit v1.2.3-70-g09d2 From 34e431b0ae398fc54ea69ff85ec700722c9da773 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 21 Jan 2014 15:49:05 -0800 Subject: /proc/meminfo: provide estimated available memory Many load balancing and workload placing programs check /proc/meminfo to estimate how much free memory is available. They generally do this by adding up "free" and "cached", which was fine ten years ago, but is pretty much guaranteed to be wrong today. It is wrong because Cached includes memory that is not freeable as page cache, for example shared memory segments, tmpfs, and ramfs, and it does not include reclaimable slab memory, which can take up a large fraction of system memory on mostly idle systems with lots of files. Currently, the amount of memory that is available for a new workload, without pushing the system into swap, can be estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the "low" watermarks from /proc/zoneinfo. However, this may change in the future, and user space really should not be expected to know kernel internals to come up with an estimate for the amount of free memory. It is more convenient to provide such an estimate in /proc/meminfo. If things change in the future, we only have to change it in one place. 
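On the consumer side, a load balancer that today sums "free" and "cached" could switch to the new field with something like the sketch below; this is illustrative only, and the fallback message is an assumption about how a caller might handle older kernels that lack the field.

#include <stdio.h>

/* Returns MemAvailable in kB, or -1 if the kernel does not export it. */
static long long read_mem_available(void)
{
	char line[256];
	long long kb = -1;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "MemAvailable: %lld kB", &kb) == 1)
			break;
	}
	fclose(f);
	return kb;
}

int main(void)
{
	long long kb = read_mem_available();

	if (kb >= 0)
		printf("MemAvailable: %lld kB\n", kb);
	else
		printf("no MemAvailable; fall back to the old MemFree+Cached heuristic\n");
	return 0;
}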
Signed-off-by: Rik van Riel Reported-by: Erik Mouw Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 9 +++++++++ fs/proc/meminfo.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 22d89aa3721..8533f5f9bb2 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -767,6 +767,7 @@ The "Locked" indicates whether the mapping is locked in memory or not. MemTotal: 16344972 kB MemFree: 13634064 kB +MemAvailable: 14836172 kB Buffers: 3656 kB Cached: 1195708 kB SwapCached: 0 kB @@ -799,6 +800,14 @@ AnonHugePages: 49152 kB MemTotal: Total usable ram (i.e. physical ram minus a few reserved bits and the kernel binary code) MemFree: The sum of LowFree+HighFree +MemAvailable: An estimate of how much memory is available for starting new + applications, without swapping. Calculated from MemFree, + SReclaimable, the size of the file LRU lists, and the low + watermarks in each zone. + The estimate takes into account that the system needs some + page cache to function well, and that not all reclaimable + slab will be reclaimable, due to items being in use. The + impact of those factors will vary from system to system. Buffers: Relatively temporary storage for raw disk blocks shouldn't get tremendously large (20MB or so) Cached: in-memory cache for files read from the disk (the diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a77d2b29919..24270eceddb 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long committed; struct vmalloc_info vmi; long cached; + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; int lru; /* @@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + * + * Free memory cannot be taken below the low watermark, before the + * system starts swapping. + */ + available = i.freeram - wmark_low; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable swap consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + /* * Tagged format, for easy grepping and expansion. 
*/ seq_printf(m, "MemTotal: %8lu kB\n" "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" @@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , K(i.totalram), K(i.freeram), + K(available), K(i.bufferram), K(cached), K(total_swapcache_pages()), -- cgit v1.2.3-70-g09d2 From 048ed4b6266144fdee55089c9eef55b0c1d42ba1 Mon Sep 17 00:00:00 2001 From: wangweidong Date: Tue, 21 Jan 2014 15:44:11 +0800 Subject: sctp: remove macros sctp_{lock|release}_sock Redefined {lock|release}_sock to sctp_{lock|release}_sock for user space friendly code which we haven't use in years, so removing them. Signed-off-by: Wang Weidong Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 4 ++-- include/net/sctp/sctp.h | 2 -- net/sctp/socket.c | 62 ++++++++++++++++++++++++------------------------- 3 files changed, 33 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d90909ec6aa..ce53dffd236 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -713,11 +713,11 @@ static void process_sctp_notification(struct connection *con, return; /* Peel off a new sock */ - sctp_lock_sock(con->sock->sk); + lock_sock(con->sock->sk); ret = sctp_do_peeloff(con->sock->sk, sn->sn_assoc_change.sac_assoc_id, &new_con->sock); - sctp_release_sock(con->sock->sk); + release_sock(con->sock->sk); if (ret < 0) { log_print("Can't peel off a socket for " "connection %d to node %d: err=%d", diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 572cd5a5292..ec18d306bad 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -171,8 +171,6 @@ extern struct kmem_cache *sctp_bucket_cachep __read_mostly; */ /* sock lock wrappers. */ -#define sctp_lock_sock(sk) lock_sock(sk) -#define sctp_release_sock(sk) release_sock(sk) #define sctp_bh_lock_sock(sk) bh_lock_sock(sk) #define sctp_bh_unlock_sock(sk) bh_unlock_sock(sk) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 98532cbd842..893aa56c91c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -272,7 +272,7 @@ static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) { int retval = 0; - sctp_lock_sock(sk); + lock_sock(sk); pr_debug("%s: sk:%p, addr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); @@ -284,7 +284,7 @@ static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) else retval = -EINVAL; - sctp_release_sock(sk); + release_sock(sk); return retval; } @@ -1461,7 +1461,7 @@ static void sctp_close(struct sock *sk, long timeout) pr_debug("%s: sk:%p, timeout:%ld\n", __func__, sk, timeout); - sctp_lock_sock(sk); + lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; sk->sk_state = SCTP_SS_CLOSING; @@ -1505,7 +1505,7 @@ static void sctp_close(struct sock *sk, long timeout) sctp_wait_for_close(sk, timeout); /* This will run the backlog queue. */ - sctp_release_sock(sk); + release_sock(sk); /* Supposedly, no process has access to the socket, but * the net layers still may. @@ -1665,7 +1665,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, pr_debug("%s: about to look up association\n", __func__); - sctp_lock_sock(sk); + lock_sock(sk); /* If a msg_name has been specified, assume this is to be used. 
*/ if (msg_name) { @@ -1949,7 +1949,7 @@ out_free: sctp_association_free(asoc); } out_unlock: - sctp_release_sock(sk); + release_sock(sk); out_nounlock: return sctp_error(sk, msg_flags, err); @@ -2035,7 +2035,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, "addr_len:%p)\n", __func__, sk, msg, len, noblock, flags, addr_len); - sctp_lock_sock(sk); + lock_sock(sk); if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) { err = -ENOTCONN; @@ -2119,7 +2119,7 @@ out_free: sctp_ulpevent_free(event); } out: - sctp_release_sock(sk); + release_sock(sk); return err; } @@ -3590,7 +3590,7 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, goto out_nounlock; } - sctp_lock_sock(sk); + lock_sock(sk); switch (optname) { case SCTP_SOCKOPT_BINDX_ADD: @@ -3708,7 +3708,7 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, break; } - sctp_release_sock(sk); + release_sock(sk); out_nounlock: return retval; @@ -3736,7 +3736,7 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, int err = 0; struct sctp_af *af; - sctp_lock_sock(sk); + lock_sock(sk); pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); @@ -3752,7 +3752,7 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, err = __sctp_connect(sk, addr, af->sockaddr_len, NULL); } - sctp_release_sock(sk); + release_sock(sk); return err; } @@ -3778,7 +3778,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err) long timeo; int error = 0; - sctp_lock_sock(sk); + lock_sock(sk); sp = sctp_sk(sk); ep = sp->ep; @@ -3816,7 +3816,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err) sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP); out: - sctp_release_sock(sk); + release_sock(sk); *err = error; return newsk; } @@ -3826,7 +3826,7 @@ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) { int rc = -ENOTCONN; - sctp_lock_sock(sk); + lock_sock(sk); /* * SEQPACKET-style sockets in LISTENING state are valid, for @@ -3856,7 +3856,7 @@ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) break; } out: - sctp_release_sock(sk); + release_sock(sk); return rc; } @@ -5754,7 +5754,7 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, if (get_user(len, optlen)) return -EFAULT; - sctp_lock_sock(sk); + lock_sock(sk); switch (optname) { case SCTP_STATUS: @@ -5878,7 +5878,7 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, break; } - sctp_release_sock(sk); + release_sock(sk); return retval; } @@ -6144,7 +6144,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) if (unlikely(backlog < 0)) return err; - sctp_lock_sock(sk); + lock_sock(sk); /* Peeled-off sockets are not allowed to listen(). */ if (sctp_style(sk, UDP_HIGH_BANDWIDTH)) @@ -6177,7 +6177,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) err = 0; out: - sctp_release_sock(sk); + release_sock(sk); return err; } @@ -6474,9 +6474,9 @@ static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p) * does not fit in the user's buffer, but this seems to be the * only way to honor MSG_DONTWAIT realistically. */ - sctp_release_sock(sk); + release_sock(sk); *timeo_p = schedule_timeout(*timeo_p); - sctp_lock_sock(sk); + lock_sock(sk); ready: finish_wait(sk_sleep(sk), &wait); @@ -6659,10 +6659,10 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, /* Let another process have a go. Since we are going * to sleep anyway. 
*/ - sctp_release_sock(sk); + release_sock(sk); current_timeo = schedule_timeout(current_timeo); BUG_ON(sk != asoc->base.sk); - sctp_lock_sock(sk); + lock_sock(sk); *timeo_p = current_timeo; } @@ -6767,9 +6767,9 @@ static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p) /* Let another process have a go. Since we are going * to sleep anyway. */ - sctp_release_sock(sk); + release_sock(sk); current_timeo = schedule_timeout(current_timeo); - sctp_lock_sock(sk); + lock_sock(sk); *timeo_p = current_timeo; } @@ -6812,9 +6812,9 @@ static int sctp_wait_for_accept(struct sock *sk, long timeo) TASK_INTERRUPTIBLE); if (list_empty(&ep->asocs)) { - sctp_release_sock(sk); + release_sock(sk); timeo = schedule_timeout(timeo); - sctp_lock_sock(sk); + lock_sock(sk); } err = -EINVAL; @@ -6847,9 +6847,9 @@ static void sctp_wait_for_close(struct sock *sk, long timeout) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (list_empty(&sctp_sk(sk)->ep->asocs)) break; - sctp_release_sock(sk); + release_sock(sk); timeout = schedule_timeout(timeout); - sctp_lock_sock(sk); + lock_sock(sk); } while (!signal_pending(current) && timeout); finish_wait(sk_sleep(sk), &wait); @@ -7046,7 +7046,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, newsk->sk_shutdown |= RCV_SHUTDOWN; newsk->sk_state = SCTP_SS_ESTABLISHED; - sctp_release_sock(newsk); + release_sock(newsk); } -- cgit v1.2.3-70-g09d2 From 5837c80e870bc3b12ac6a98cdc9ce7a9522a8fb6 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Tue, 21 Jan 2014 20:32:05 -0800 Subject: bio-integrity: Fix bio_integrity_verify segment start bug This patch addresses a bug in bio_integrity_verify() code that has been causing DIF READ verify operations to be silently skipped. The issue is that bio->bi_idx will have been incremented within bio_advance() code in the normal blk_update_request() -> req_bio_endio() completion path, and bio_integrity_verify() is using bio_for_each_segment() which starts the bio segment walk at the current bio->bi_idx. So instead use bio_for_each_segment_all() to always start the bio segment walk from zero, regardless of the current bio->bi_idx value after bio_advance() has been called. Cc: Martin K. 
Petersen Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Nicholas Bellinger Cc: stable@kernel.dk # >= v3.10 Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 80d972d739e..0bad24ddc2e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -447,20 +447,21 @@ static int bio_integrity_verify(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; - struct bio_vec bv; - struct bvec_iter iter; + struct bio_vec *bv; sector_t sector = bio->bi_integrity->bip_iter.bi_sector; unsigned int sectors, total, ret; void *prot_buf = bio->bi_integrity->bip_buf; + int i; ret = total = 0; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, iter) { - void *kaddr = kmap_atomic(bv.bv_page); - bix.data_buf = kaddr + bv.bv_offset; - bix.data_size = bv.bv_len; + bio_for_each_segment_all(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page); + + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; bix.sector = sector; @@ -471,7 +472,7 @@ static int bio_integrity_verify(struct bio *bio) return ret; } - sectors = bv.bv_len / bi->sector_size; + sectors = bv->bv_len / bi->sector_size; sector += sectors; prot_buf += sectors * bi->tuple_size; total += sectors * bi->tuple_size; -- cgit v1.2.3-70-g09d2 From a18ff063406dd6aec41fda598eabe2691007a30d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 21 Jan 2014 13:32:12 +0900 Subject: f2fs: call mark_inode_dirty to flush dirty pages If a dentry page is updated, we should call mark_inode_dirty to add the inode into the dirty list, so that its dentry pages are flushed to the disk. Otherwise, the inode can be evicted without flush. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +++- fs/f2fs/dir.c | 6 ++---- fs/f2fs/namei.c | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f5fac16bc6e..0ae55872350 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -249,6 +249,7 @@ int reserve_new_block(struct dnode_of_data *dn) __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; + mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; } @@ -564,7 +565,6 @@ repeat: i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - mark_inode_dirty_sync(inode); } return page; @@ -1060,6 +1060,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); SetPageUptodate(page); + mark_inode_dirty(inode); + if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); set_dirty_dir_page(inode, page); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index cd055b6fcff..b2b77ccea82 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -388,6 +388,8 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + if (F2FS_I(dir)->i_current_depth != current_depth) { F2FS_I(dir)->i_current_depth = current_depth; set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); @@ -395,8 +397,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) update_inode_page(dir); - else - mark_inode_dirty(dir); if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); @@ -553,8 +553,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode && S_ISDIR(inode->i_mode)) { drop_nlink(dir); update_inode_page(dir); - } else { - mark_inode_dirty(dir); } if (inode) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a68838dae33..3d32f2969c5 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -430,6 +430,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + mark_inode_dirty(new_inode); if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); @@ -459,11 +460,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + update_inode_page(old_inode); } else { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); + mark_inode_dirty(old_dir); update_inode_page(old_dir); } -- cgit v1.2.3-70-g09d2 From e8dae6045882e32a067326f47b7ccd3aaf8814e2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 21 Jan 2014 18:31:38 +0900 Subject: f2fs: move a branch for code redability This patch moves a function in f2fs_delete_entry for code readability. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b2b77ccea82..2b7c255bcbd 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -550,12 +550,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; - if (inode && S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - update_inode_page(dir); - } - if (inode) { + if (S_ISDIR(inode->i_mode)) { + drop_nlink(dir); + update_inode_page(dir); + } inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { -- cgit v1.2.3-70-g09d2 From 9df27d982d58b9372bc476fb6b9bab861d617029 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 20 Jan 2014 18:37:04 +0800 Subject: f2fs: add help function META_MAPPING Introduce help function META_MAPPING() to get the cache meta blocks' address space. Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++---- fs/f2fs/debug.c | 4 ++-- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/node.c | 2 +- fs/f2fs/segment.c | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4de0345f73f..f9d4f7de75a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab; */ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: page = grab_cache_page(mapping, index); @@ -50,7 +50,7 @@ repeat: */ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page; repeat: page = grab_cache_page(mapping, index); @@ -128,7 +128,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = LONG_MAX; struct pagevec pvec; long nwritten = 0; @@ -771,7 +771,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) wait_on_all_pages_writeback(sbi); filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); - filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 63cb7e215e0..8bdc365be9e 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -53,7 +53,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); si->node_pages = sbi->node_inode->i_mapping->nrpages; - si->meta_pages = sbi->meta_inode->i_mapping->nrpages; + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->sits = SIT_I(sbi)->dirty_sentries; si->fnids = NM_I(sbi)->fcnt; @@ -168,7 +168,7 @@ get_cache: si->cache_mem += NM_I(sbi)->nat_cnt; npages = sbi->node_inode->i_mapping->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - npages = sbi->meta_inode->i_mapping->nrpages; + npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; 
si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5ab3981938d..117e30f6b88 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -533,6 +533,11 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) { return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); } +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) { sbi->s_dirty = 1; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 226a05a27e3..527bd12d8ae 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -87,7 +87,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) */ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); struct page *page; pgoff_t index; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e82423fbcb9..7caac5f2ca9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1585,7 +1585,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page; block_t blk_addr, prev_blk_addr = 0; int sit_blk_cnt = SIT_BLK_CNT(sbi); -- cgit v1.2.3-70-g09d2 From 63f5384c9a7df95a0e0eb6745f3038c703bdf4c3 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Mon, 20 Jan 2014 18:37:30 +0800 Subject: f2fs: remove the orphan block page array Since orphan_blocks may be as large as 504, it is neither safe nor rigorous to store such a large array on the kernel stack, as Dan Carpenter pointed out. In fact, grab_meta_page() has already locked the page in the page cache, and we can use find_get_page() to fetch the page safely downstream, so we can remove the page array directly.
Reported-by: Dan Carpenter Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f9d4f7de75a..ed82de6bfb4 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -311,11 +311,10 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); struct page *page = NULL; - struct page *pages[orphan_blocks]; struct orphan_inode_entry *orphan = NULL; for (index = 0; index < orphan_blocks; index++) - pages[index] = grab_meta_page(sbi, start_blk + index); + grab_meta_page(sbi, start_blk + index); index = 1; spin_lock(&sbi->orphan_inode_lock); @@ -324,10 +323,12 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { - page = pages[index - 1]; + page = find_get_page(META_MAPPING(sbi), start_blk++); + f2fs_bug_on(!page); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); + f2fs_put_page(page, 0); } orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); -- cgit v1.2.3-70-g09d2 From 4ef51a8fcc7c54ca3ad948a8b4310b3bd5490c72 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 21 Jan 2014 18:51:16 +0900 Subject: f2fs: introduce NODE_MAPPING for code consistency This patch adds NODE_MAPPING which is similar as META_MAPPING introduced by Gu Zheng. Cc: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/debug.c | 4 ++-- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/node.c | 50 ++++++++++++++++++++++---------------------------- 4 files changed, 30 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ed82de6bfb4..293d0486a40 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -771,7 +771,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* wait for previous submitted node/meta pages writeback */ wait_on_all_pages_writeback(sbi); - filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); /* update user_block_counts */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8bdc365be9e..3de9d20d0c1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -52,7 +52,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = sbi->node_inode->i_mapping->nrpages; + si->node_pages = NODE_MAPPING(sbi)->nrpages; si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->sits = SIT_I(sbi)->dirty_sentries; @@ -166,7 +166,7 @@ get_cache: /* free nids */ si->cache_mem = NM_I(sbi)->fcnt; si->cache_mem += NM_I(sbi)->nat_cnt; - npages = sbi->node_inode->i_mapping->nrpages; + npages = NODE_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 117e30f6b88..af51a0bd2de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -538,6 +538,11 @@ static inline struct address_space *META_MAPPING(struct 
f2fs_sb_info *sbi) return sbi->meta_inode->i_mapping; } +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) { sbi->s_dirty = 1; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 527bd12d8ae..bbfc655493d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -681,7 +681,6 @@ fail: int truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct address_space *node_mapping = sbi->node_inode->i_mapping; int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; @@ -756,7 +755,7 @@ skip_partial: if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != node_mapping)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto restart; } @@ -842,7 +841,6 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct node_info old_ni, new_ni; struct page *page; int err; @@ -850,7 +848,7 @@ struct page *new_node_page(struct dnode_of_data *dn, if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(mapping, dn->nid); + page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); if (!page) return ERR_PTR(-ENOMEM); @@ -920,18 +918,17 @@ static int read_node_page(struct page *page, int rw) */ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *apage; int err; - apage = find_get_page(mapping, nid); + apage = find_get_page(NODE_MAPPING(sbi), nid); if (apage && PageUptodate(apage)) { f2fs_put_page(apage, 0); return; } f2fs_put_page(apage, 0); - apage = grab_cache_page(mapping, nid); + apage = grab_cache_page(NODE_MAPPING(sbi), nid); if (!apage) return; @@ -944,11 +941,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *page; int err; repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -963,7 +959,7 @@ repeat: f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (unlikely(page->mapping != mapping)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } @@ -980,7 +976,6 @@ got_it: struct page *get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct blk_plug plug; struct page *page; int err, i, end; @@ -991,7 +986,7 @@ struct page *get_node_page_ra(struct page *parent, int start) if (!nid) return ERR_PTR(-ENOENT); repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -1016,7 +1011,7 @@ repeat: blk_finish_plug(&plug); lock_page(page); - if (unlikely(page->mapping != mapping)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } @@ -1047,7 +1042,6 @@ void sync_inode_page(struct dnode_of_data *dn) int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, struct writeback_control *wbc) { - struct address_space *mapping = 
sbi->node_inode->i_mapping; pgoff_t index, end; struct pagevec pvec; int step = ino ? 2 : 0; @@ -1061,7 +1055,7 @@ next_step: while (index <= end) { int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) @@ -1094,7 +1088,7 @@ next_step: else if (!trylock_page(page)) continue; - if (unlikely(page->mapping != mapping)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { continue_unlock: unlock_page(page); continue; @@ -1121,7 +1115,7 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); } - mapping->a_ops->writepage(page, wbc); + NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); wrote++; if (--wbc->nr_to_write == 0) @@ -1148,18 +1142,19 @@ continue_unlock: int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->node_inode->i_mapping; pgoff_t index = 0, end = LONG_MAX; struct pagevec pvec; - int nr_pages; int ret2 = 0, ret = 0; pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { - unsigned i; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1178,9 +1173,9 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) cond_resched(); } - if (unlikely(test_and_clear_bit(AS_ENOSPC, &mapping->flags))) + if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) ret2 = -ENOSPC; - if (unlikely(test_and_clear_bit(AS_EIO, &mapping->flags))) + if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) ret2 = -EIO; if (!ret) ret = ret2; @@ -1538,13 +1533,12 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; - ipage = grab_cache_page(mapping, ino); + ipage = grab_cache_page(NODE_MAPPING(sbi), ino); if (!ipage) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 28a625cbc2a14f17b83e47ef907b2658576a32aa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 22 Jan 2014 19:36:57 +0100 Subject: fuse: fix pipe_buf_operations Having this struct in module memory could Oops when if the module is unloaded while the buffer still persists in a pipe. Since sock_pipe_buf_ops is essentially the same as fuse_dev_pipe_buf_steal merge them into nosteal_pipe_buf_ops (this is the same as default_pipe_buf_ops except stealing the page from the buffer is not allowed). 
Reported-by: Al Viro Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/dev.c | 22 +++++----------------- fs/splice.c | 18 ++++++++++++++++++ include/linux/pipe_fs_i.h | 2 ++ net/core/skbuff.c | 32 +------------------------------- 4 files changed, 26 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ef74ad5fd36..fa8cb4b7b8f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); } -static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - -static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = fuse_dev_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, buf->page = bufs[page_nr].page; buf->offset = bufs[page_nr].offset; buf->len = bufs[page_nr].len; - buf->ops = &fuse_dev_pipe_buf_ops; + /* + * Need to be careful about this. Having buf->ops in module + * code can Oops if the buffer persists after module unload. + */ + buf->ops = &nosteal_pipe_buf_ops; pipe->nrbufs++; page_nr++; diff --git a/fs/splice.c b/fs/splice.c index 46a08f772d7..12028fa41de 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +/* Pipe buffer operations for a socket and similar. 
*/ +const struct pipe_buf_operations nosteal_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_nosteal, + .get = generic_pipe_buf_get, +}; +EXPORT_SYMBOL(nosteal_pipe_buf_ops); + static ssize_t kernel_readv(struct file *file, const struct iovec *vec, unsigned long vlen, loff_t offset) { diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b8809fef61f..ab575269211 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -157,6 +157,8 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); +extern const struct pipe_buf_operations nosteal_pipe_buf_ops; + /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06e72d3cdf6..0b5149c5bc4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -74,36 +74,6 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; -static void sock_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - put_page(buf->page); -} - -static void sock_pipe_buf_get(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - get_page(buf->page); -} - -static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - - -/* Pipe buffer operations for a socket. */ -static const struct pipe_buf_operations sock_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = sock_pipe_buf_release, - .steal = sock_pipe_buf_steal, - .get = sock_pipe_buf_get, -}; - /** * skb_panic - private function for out-of-line support * @skb: buffer @@ -1830,7 +1800,7 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, .flags = flags, - .ops = &sock_pipe_buf_ops, + .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; -- cgit v1.2.3-70-g09d2 From 063ec1e595f8a82b5a8fd0acb3e88c8b49a1e6c1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 22 Jan 2014 19:36:58 +0100 Subject: fuse: fix SetPageUptodate() condition in STORE As noticed by Coverity the "num != 0" condition never triggers. Instead it should check for a complete page. 
Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fa8cb4b7b8f..0a648bb455a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1587,7 +1587,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && (num != 0 || file_size == end)) + if (!err && offset == 0 && + (this_num == PAGE_CACHE_SIZE || file_size == end)) SetPageUptodate(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.3-70-g09d2 From 451418fc928b5ec1ee96a9afac807b6312811a2a Mon Sep 17 00:00:00 2001 From: Andrew Gallagher Date: Tue, 5 Nov 2013 03:55:43 -0800 Subject: fuse: don't invalidate attrs when not using atime Various read operations (e.g. readlink, readdir) invalidate the cached attrs for atime changes. This patch adds a new function 'fuse_invalidate_atime', which checks for a read-only super block and avoids the attr invalidation in that case. Signed-off-by: Andrew Gallagher Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 14 ++++++++++++-- fs/fuse/file.c | 4 ++-- fs/fuse/fuse_i.h | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c3eb2c46c8f..1d1292c581c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode) get_fuse_inode(inode)->i_time = 0; } +/** + * Mark the attributes as stale due to an atime change. Avoid the invalidate if + * atime is not used. + */ +void fuse_invalidate_atime(struct inode *inode) +{ + if (!IS_RDONLY(inode)) + fuse_invalidate_attr(inode); +} + /* * Just mark the entry as stale, so that a next attempt to look it up * will result in a new lookup call to userspace @@ -1371,7 +1381,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) } __free_page(page); - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); return err; } @@ -1404,7 +1414,7 @@ static char *read_link(struct dentry *dentry) link[req->out.args[0].size] = '\0'; out: fuse_put_request(fc, req); - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); return link; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7e70506297b..d53af8f1523 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -687,7 +687,7 @@ static int fuse_readpage(struct file *file, struct page *page) SetPageUptodate(page); } - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); out: unlock_page(page); return err; @@ -716,7 +716,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) fuse_read_update_size(inode, pos, req->misc.read.attr_ver); } - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); } for (i = 0; i < req->num_pages; i++) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7d273091266..dc44b9e3a0c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -788,6 +788,8 @@ void fuse_invalidate_attr(struct inode *inode); void fuse_invalidate_entry_cache(struct dentry *entry); +void fuse_invalidate_atime(struct inode *inode); + /** * Acquire reference to fuse_conn */ -- cgit v1.2.3-70-g09d2 From 7678ac50615d9c7a491d9861e020e4f5f71b594c Mon Sep 17 00:00:00 2001 From: Andrew Gallagher Date: Tue, 5 Nov 2013 16:05:52 +0100 Subject: fuse: support clients that don't implement 'open' open/release 
operations require userspace transitions to keep track of the open count and to perform any FS-specific setup. However, for some purely read-only FSs which don't need to perform any setup at open/release time, we can avoid the performance overhead of calling into userspace for open/release calls. This patch adds the necessary support to the fuse kernel modules to prevent open/release operations from hitting in userspace. When the client returns ENOSYS, we avoid sending the subsequent release to userspace, and also remember this so that future opens also don't trigger a userspace operation. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 37 +++++++++++++++++++++++++++---------- fs/fuse/fuse_i.h | 3 +++ 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d53af8f1523..74f6ca50050 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (sync) { + if (ff->fc->no_open) { + /* + * Drop the release request when client does not + * implement 'open' + */ + req->background = 0; + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else if (sync) { req->background = 0; fuse_request_send(ff->fc, req); path_put(&req->misc.release.path); @@ -144,27 +152,36 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir) { - struct fuse_open_out outarg; struct fuse_file *ff; - int err; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; ff = fuse_file_alloc(fc); if (!ff) return -ENOMEM; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); - if (err) { - fuse_file_free(ff); - return err; + ff->fh = 0; + ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ + if (!fc->no_open || isdir) { + struct fuse_open_out outarg; + int err; + + err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + if (!err) { + ff->fh = outarg.fh; + ff->open_flags = outarg.open_flags; + + } else if (err != -ENOSYS || isdir) { + fuse_file_free(ff); + return err; + } else { + fc->no_open = 1; + } } if (isdir) - outarg.open_flags &= ~FOPEN_DIRECT_IO; + ff->open_flags &= ~FOPEN_DIRECT_IO; - ff->fh = outarg.fh; ff->nodeid = nodeid; - ff->open_flags = outarg.open_flags; file->private_data = fuse_file_get(ff); return 0; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index dc44b9e3a0c..2da5db2c8bd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -485,6 +485,9 @@ struct fuse_conn { * and hence races in setting them will not cause malfunction */ + /** Is open/release not implemented by fs? */ + unsigned no_open:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; -- cgit v1.2.3-70-g09d2 From bf39c00a9a7f3cdb5ce7d6695d9f044daf8f0b53 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Jan 2014 20:41:57 +0900 Subject: f2fs: drop obsolete node page when it is truncated If a node page is trucated, we'd better drop the page in the node_inode's page cache for better memory footprint. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bbfc655493d..b0649b76eb4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -518,6 +518,10 @@ invalidate: F2FS_SET_SB_DIRT(sbi); f2fs_put_page(dn->node_page, 1); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + dn->node_page->index, dn->node_page->index); + dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); } -- cgit v1.2.3-70-g09d2 From ed7e5423014ad89720fcf315c0b73f2c5d0c7bd2 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 22 Jan 2014 20:34:54 +0200 Subject: pnfs: Proper delay for NFS4ERR_RECALLCONFLICT in layout_get_done An NFS4ERR_RECALLCONFLICT is returned by the server from a GET_LAYOUT only when the server sent a RECALL due to that GET_LAYOUT, or the RECALL and GET_LAYOUT crossed on the wire. Either way, this means we want to wait at most until in-flight IO is finished and the RECALL can be satisfied. So a proper wait here is more like 1/10 of a second, not 15 seconds like we have now. In case of a server bug we delay exponentially longer on each retry. Current code totally craps out performance of very large files on most pnfs-objects layouts, because of how the map changes when the file has grown into the next raid group. [Stable: This will patch back to 3.9. If there are earlier still maintained trees, please tell me I'll send a patch] CC: Stable Tree Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a9eaaaa436c..a1965329a12 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7409,9 +7409,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; - unsigned long timeo, giveup; + unsigned long timeo, now, giveup; - dprintk("--> %s\n", __func__); + dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); if (!nfs41_sequence_done(task, &lgp->res.seq_res)) goto out; @@ -7419,12 +7419,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + /* + * NFS4ERR_LAYOUTTRYLATER is a conflict with another client + * (or clients) writing to the same RAID stripe + */ case -NFS4ERR_LAYOUTTRYLATER: + /* + * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall + * existing layout before getting a new one). + */ case -NFS4ERR_RECALLCONFLICT: timeo = rpc_get_timeout(task->tk_client); giveup = lgp->args.timestamp + timeo; - if (time_after(giveup, jiffies)) - task->tk_status = -NFS4ERR_DELAY; + now = jiffies; + if (time_after(giveup, now)) { + unsigned long delay; + + /* Delay for: * - Not less then NFS4_POLL_RETRY_MIN.
+ * - One last time a jiffie before we give up + * - exponential backoff (time_now minus start_attempt) + */ + delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, + min((giveup - now - 1), + now - lgp->args.timestamp)); + + dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", + __func__, delay); + rpc_delay(task, delay); + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out; /* Do not call nfs4_async_handle_error() */ + } break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: -- cgit v1.2.3-70-g09d2 From aad560b7f63b495f48a7232fd086c5913a676e6f Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 21 Nov 2013 17:58:08 +0200 Subject: ore: Fix wrong math in allocation of per device BIO At IO preparation we calculate the max pages at each device and allocate a BIO per device of that size. The calculation was wrong for some unaligned corner-case offset/length combinations and would make prepare return with -ENOMEM. This would be bad for pnfs-objects that would in that case IO through MDS. And fatal for exofs, where it would fail writes with EIO. Fix it by doing the proper math, which works in all cases. (I ran a test with all possible offset/length combinations this time round). Also when reading we do not need to allocate for the parity units since we jump over them. Also lower the max_io_length to take into account the parity pages so as not to allocate BIOs bigger than PAGE_SIZE. CC: Stable Kernel Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 37 +++++++++++++++++++++++++------------ include/scsi/osd_ore.h | 1 + 2 files changed, 26 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index b7442288860..85cde3e7629 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -103,7 +103,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * - layout->group_width; + (layout->group_width - layout->parity); if (layout->parity) { unsigned stripe_length = (layout->group_width - layout->parity) * @@ -286,7 +286,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, if (length) { ore_calc_stripe_info(layout, offset, length, &ios->si); ios->length = ios->si.length; - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + + ios->length + PAGE_SIZE - 1) / PAGE_SIZE; if (layout->parity) _ore_post_alloc_raid_stuff(ios); } @@ -536,6 +537,7 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u64 H = LmodS - G * T; u32 N = div_u64(H, U); + u32 Nlast; /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; @@ -568,6 +570,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->length = T - H; if (si->length > length) si->length = length; + + Nlast = div_u64(H + si->length + U - 1, U); + si->maxdevUnits = Nlast - N; + si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); @@ -583,13 +589,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, int ret; if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned nr_pages = ios->nr_pages * ios->layout->group_width / - (ios->layout->group_width - - ios->layout->parity); - unsigned bio_size = (nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned bio_size; + + if (!ios->reading) { + bio_size = ios->si.maxdevUnits; + } else {
bio_size = (ios->si.maxdevUnits + 1) * + (ios->layout->group_width - ios->layout->parity) / + ios->layout->group_width; + } + bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -609,8 +618,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { - ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", - per_dev->bio->bi_vcnt); + /* If bi_vcnt == bi_max then this is a SW BUG */ + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " + "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", + per_dev->bio->bi_vcnt, + per_dev->bio->bi_max_vecs, + BIO_MAX_PAGES_KMALLOC, cur_len); ret = -ENOMEM; goto out; } @@ -1098,7 +1111,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, size_attr->attr = g_attr_logical_length; size_attr->attr.val_ptr = &size_attr->newsize; - ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index a5f9b960dfc..6ca3265a4dc 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -102,6 +102,7 @@ struct ore_striping_info { unsigned unit_off; unsigned cur_pg; unsigned cur_comp; + unsigned maxdevUnits; }; struct ore_io_state; -- cgit v1.2.3-70-g09d2 From 2deb76db092a91088494c00862bafc4fe7980adb Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 13 Jan 2014 19:10:40 +0200 Subject: ore: Don't crash on NULL bio in _clear_bio In the case of target returning OSD_ERR_PRI_CLEAR_PAGES when we only sent for attributes don't crash on NULL bio. This is an osd-target bug but don't crash regardless Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 85cde3e7629..dae884694bd 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -431,8 +431,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) if (likely(!ret)) continue; - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ + if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && + per_dev->bio) { + /* start read offset passed endof file. + * Note: if we do not have bio it means read-attributes + * In this case we should return error to caller. + */ _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", -- cgit v1.2.3-70-g09d2 From d83c7eb65d9bf0a57e7d5ed87a5bd8e5ea6b1fb6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 13 Jan 2014 23:45:43 +0200 Subject: exofs: Allow O_DIRECT open With this minimal do nothing patch an application can open O_DIRECT and then actually do buffered sync IO instead. 
But the aio API is supported, which is a good thing. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a52a5d23c30..7e7ba9ada40 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -961,6 +961,14 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, WARN_ON(1); } + + /* TODO: Should be easy enough to do proprly */ +static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + return 0; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, @@ -974,7 +982,7 @@ const struct address_space_operations exofs_aops = { /* Not implemented Yet */ .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ - .direct_IO = NULL, /* TODO: Should be trivial to do */ + .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ .get_xip_mem = NULL, -- cgit v1.2.3-70-g09d2 From c8592fcc66b2fe9cbd505f1faff810f8844d4a97 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 14 Jan 2014 16:05:52 +0200 Subject: exofs: Allow corrupted directory entry to be empty file If there was an error in fetching an object or extracting inode info from attributes, that means corrupted storage. Let it be an empty ZERO dated directory entry so it can be deleted. Otherwise the whole directory will be inaccessible. This does not lose data, because if there is an orphan object somewhere it will be recovered by fsck. But usually this only means a corrupted dir entry. The object was never generated and only its link exists. This way we can delete the bad entry. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 7e7ba9ada40..390224a162d 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1102,8 +1102,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, /* If object is lost on target we might as well enable it's * delete. */ - if ((ret == -ENOENT) || (ret == -EINVAL)) - ret = 0; + ret = 0; goto out; } -- cgit v1.2.3-70-g09d2 From 19350e7627a6f3b0f662cbd2eb1128c9961a41fe Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 14 Jan 2014 16:26:13 +0200 Subject: exofs: Print less in r4w In debug mode exofs is too verbose, hiding the real problems, so remove some trivial messages. Also fix some other prints.
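As background for the DBGMSG to DBGMSG2 demotions in the diff below, a hypothetical sketch of the usual two-level debug-macro arrangement; the EXAMPLE_* names and config symbols are invented for illustration and are not the real exofs definitions. The point is that the second, noisier level compiles away unless explicitly enabled at build time.

#include <linux/printk.h>

/* Hypothetical illustration only, not the actual exofs macros. */
#ifdef CONFIG_EXAMPLE_DEBUG
#define EXAMPLE_DBGMSG(fmt, a...) \
	printk(KERN_NOTICE "example @%s:%d: " fmt, __func__, __LINE__, ##a)
#else
#define EXAMPLE_DBGMSG(fmt, a...) do {} while (0)
#endif

/* Second, chattier level: only enabled when digging into a specific path. */
#ifdef CONFIG_EXAMPLE_DEBUG_VERBOSE
#define EXAMPLE_DBGMSG2(fmt, a...) EXAMPLE_DBGMSG(fmt, ##a)
#else
#define EXAMPLE_DBGMSG2(fmt, a...) do {} while (0)
#endif
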
Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 390224a162d..ee4317faccb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -577,7 +577,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) if (offset >= i_size) { *uptodate = true; - EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index); + EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); return ZERO_PAGE(0); } @@ -596,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) *uptodate = true; else *uptodate = PageUptodate(page); - EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); return page; } else { - EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", pcol->that_locked_page->index); *uptodate = true; return pcol->that_locked_page; @@ -611,11 +611,11 @@ static void __r4w_put_page(void *priv, struct page *page) struct page_collect *pcol = priv; if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { - EXOFS_DBGMSG("index=0x%lx\n", page->index); + EXOFS_DBGMSG2("index=0x%lx\n", page->index); page_cache_release(page); return; } - EXOFS_DBGMSG("that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", ZERO_PAGE(0) == page ? -1 : page->index); } @@ -1018,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) if (likely(!ret)) truncate_setsize(inode, newsize); - EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", + EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", inode->i_ino, newsize, ret); return ret; } @@ -1108,7 +1108,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); goto out; } WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); @@ -1116,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[1]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); goto out; } if (attrs[1].len) { @@ -1131,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[2]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); goto out; } if (attrs[2].len) { -- cgit v1.2.3-70-g09d2 From 068c34c0ce8add2e5f01ee6c85710e6fefb832ad Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 9 Jan 2014 16:24:35 -0500 Subject: nfsd: fix encode_entryplus_baggage stack usage We stick an extra svc_fh in nfsd3_readdirres to save the need to kmalloc, though maybe it would be fine to kmalloc instead. Acked-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs3xdr.c | 12 ++++++------ fs/nfsd/xdr3.h | 3 +++ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 1ee6baec5fa..de6e39e12cb 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -842,21 +842,21 @@ out: static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) { - struct svc_fh fh; + struct svc_fh *fh = &cd->scratch; __be32 err; - fh_init(&fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, &fh, name, namlen); + fh_init(fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, fh, name, namlen); if (err) { *p++ = 0; *p++ = 0; goto out; } - p = encode_post_op_attr(cd->rqstp, p, &fh); + p = encode_post_op_attr(cd->rqstp, p, fh); *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, &fh); + p = encode_fh(p, fh); out: - fh_put(&fh); + fh_put(fh); return p; } diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index b6d5542a4ac..335e04aaf7d 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -174,6 +174,9 @@ struct nfsd3_linkres { struct nfsd3_readdirres { __be32 status; struct svc_fh fh; + /* Just to save kmalloc on every readdirplus entry (svc_fh is a + * little large for the stack): */ + struct svc_fh scratch; int count; __be32 verf[2]; -- cgit v1.2.3-70-g09d2 From e3bba3c3c90cd434c1ccb9e5dc704a96baf9541c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 23 Jan 2014 15:52:53 -0800 Subject: fs/proc/page.c: add PageAnon check to surely detect thp stable_page_flags() checks !PageHuge && PageTransCompound && PageLRU to know that a specified page is thp or not. But sometimes it's not enough and we fail to detect thp when the thp is on pagevec. This happens only for a few seconds after LRU list operations, but it makes it difficult to control our applications depending on this flag. So this patch adds another check PageAnon to detect thps on pagevec. It might not give the future extensibility for thp pagecache, but it's OK at least for now. Signed-off-by: Naoya Horiguchi Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index b8730d9ebae..cab84b6272e 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -118,10 +118,12 @@ u64 stable_page_flags(struct page *page) /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU to make - * sure a given page is a thp, not a non-huge compound page. + * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) + else if (PageTransCompound(page) && + (PageLRU(compound_trans_head(page)) || + PageAnon(compound_trans_head(page)))) u |= 1 << KPF_THP; /* -- cgit v1.2.3-70-g09d2 From 2252b62a56601c9e31396da230b4ce792f167fb4 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Thu, 23 Jan 2014 15:53:47 -0800 Subject: logfs: check for the return value after calling find_or_create_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In get_mapping_page(), after calling find_or_create_page(), the return value should be checked. 
This patch has been provided: http://www.spinics.net/lists/linux-fsdevel/msg66948.html but not been applied now. Signed-off-by: Younger Liu Cc: Younger Liu Cc: Vyacheslav Dubeyko Reviewed-by: Prasad Joshi Cc: Jörn Engel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/logfs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index d448a777166..7f9b096d8d5 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -62,7 +62,8 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, page = read_cache_page(mapping, index, filler, sb); else { page = find_or_create_page(mapping, index, GFP_NOFS); - unlock_page(page); + if (page) + unlock_page(page); } return page; } -- cgit v1.2.3-70-g09d2 From 77719536dc00f8fd8f5abe6dadbde5331c37f996 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 23 Jan 2014 15:53:59 -0800 Subject: conditionally define U32_MAX The symbol U32_MAX is defined in several spots. Change these definitions to be conditional. This is in preparation for the next patch, which centralizes the definition in . Signed-off-by: Alex Elder Cc: Sage Weil Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/reiserfs.h | 2 ++ include/linux/ceph/decode.h | 2 ++ net/ipv4/tcp_illinois.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index f8adaee537c..66a2e832fa8 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1958,7 +1958,9 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} #define MAX_US_INT 0xffff // reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset +#ifndef U32_MAX #define U32_MAX (~(__u32)0) +#endif /* !U32_MAX */ static inline loff_t max_reiserfs_offset(struct inode *inode) { diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 0442c3d800f..27fe66a279b 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -10,6 +10,7 @@ /* This seemed to be the easiest place to define these */ +#ifndef U32_MAX #define U8_MAX ((u8)(~0U)) #define U16_MAX ((u16)(~0U)) #define U32_MAX ((u32)(~0U)) @@ -24,6 +25,7 @@ #define S16_MIN ((s16)(-S16_MAX - 1)) #define S32_MIN ((s32)(-S32_MAX - 1)) #define S64_MIN ((s64)(-S64_MAX - 1LL)) +#endif /* !U32_MAX */ /* * in all cases, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 8a520996f3d..f43947235b3 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -23,7 +23,9 @@ #define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */ #define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */ #define ALPHA_BASE ALPHA_SCALE /* 1.0 */ +#ifndef U32_MAX #define U32_MAX ((u32)~0U) +#endif /* !U32_MAX */ #define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */ #define BETA_SHIFT 6 -- cgit v1.2.3-70-g09d2 From 04f9b74e4d96d349de12fdd4e6626af4a9f75e09 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 23 Jan 2014 15:54:01 -0800 Subject: remove extra definitions of U32_MAX Now that the definition is centralized in , the definitions of U32_MAX (and related) elsewhere in the kernel can be removed. Signed-off-by: Alex Elder Acked-by: Sage Weil Acked-by: David S. 
Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/reiserfs.h | 4 ---- include/linux/ceph/decode.h | 19 ------------------- net/ipv4/tcp_illinois.c | 3 --- 3 files changed, 26 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 66a2e832fa8..dfb617b2bad 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1958,10 +1958,6 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} #define MAX_US_INT 0xffff // reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset -#ifndef U32_MAX -#define U32_MAX (~(__u32)0) -#endif /* !U32_MAX */ - static inline loff_t max_reiserfs_offset(struct inode *inode) { if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 27fe66a279b..a6ef9cc267e 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -8,25 +8,6 @@ #include -/* This seemed to be the easiest place to define these */ - -#ifndef U32_MAX -#define U8_MAX ((u8)(~0U)) -#define U16_MAX ((u16)(~0U)) -#define U32_MAX ((u32)(~0U)) -#define U64_MAX ((u64)(~0ULL)) - -#define S8_MAX ((s8)(U8_MAX >> 1)) -#define S16_MAX ((s16)(U16_MAX >> 1)) -#define S32_MAX ((s32)(U32_MAX >> 1)) -#define S64_MAX ((s64)(U64_MAX >> 1LL)) - -#define S8_MIN ((s8)(-S8_MAX - 1)) -#define S16_MIN ((s16)(-S16_MAX - 1)) -#define S32_MIN ((s32)(-S32_MAX - 1)) -#define S64_MIN ((s64)(-S64_MAX - 1LL)) -#endif /* !U32_MAX */ - /* * in all cases, * void **p pointer to position pointer diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index f43947235b3..e498a62b8f9 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -23,9 +23,6 @@ #define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */ #define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */ #define ALPHA_BASE ALPHA_SCALE /* 1.0 */ -#ifndef U32_MAX -#define U32_MAX ((u32)~0U) -#endif /* !U32_MAX */ #define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */ #define BETA_SHIFT 6 -- cgit v1.2.3-70-g09d2 From 7a5f4f1cb0e7581ee7deb938d65f97145fa045f8 Mon Sep 17 00:00:00 2001 From: Todor Minchev Date: Thu, 23 Jan 2014 15:54:53 -0800 Subject: fs: binfmt_elf: remove unused defines INTERPRETER_NONE and INTERPRETER_ELF These two defines are unused since the removal of the a.out interpreter support in the ELF loader in kernel 2.6.25 Signed-off-by: Todor Minchev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 571a4232690..67be2951b98 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -543,9 +543,6 @@ out: * libraries. There is no binary dependent code anywhere else. */ -#define INTERPRETER_NONE 0 -#define INTERPRETER_ELF 2 - #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif -- cgit v1.2.3-70-g09d2 From 0fa9aa20c33d76e98f44ff1de6e128e39a7738ca Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Thu, 23 Jan 2014 15:54:54 -0800 Subject: fs/ramfs/file-nommu.c: make ramfs_nommu_get_unmapped_area() and ramfs_nommu_mmap() static Since commit 853ac43ab194 ("shmem: unify regular and tiny shmem"), ramfs_nommu_get_unmapped_area() and ramfs_nommu_mmap() are not directly referenced outside of file-nommu.c. Thus make them static. 
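The forward declarations added near the top of file-nommu.c in the diff below exist only to satisfy C's declare-before-use rule, since the newly static functions are referenced from ops tables defined above their bodies. A self-contained sketch of that pattern, with invented example_* names rather than the ramfs code itself:

#include <stdio.h>

/* Forward declaration: the const ops table below names the function
 * before its definition appears later in the file. */
static int example_op(int arg);

struct example_ops {
	int (*op)(int arg);
};

static const struct example_ops ops = {
	.op = example_op,
};

static int example_op(int arg)
{
	return arg * 2;
}

int main(void)
{
	printf("ops.op(21) = %d\n", ops.op(21));	/* prints 42 */
	return 0;
}
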
Signed-off-by: Axel Lin Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 10 ++++++++-- include/linux/ramfs.h | 7 ------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 8d5b438cc18..80862b1aeea 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -27,6 +27,12 @@ #include "internal.h" static int ramfs_nommu_setattr(struct dentry *, struct iattr *); +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags); +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, @@ -197,7 +203,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) * - the pages to be mapped must exist * - the pages be physically contiguous in sequence */ -unsigned long ramfs_nommu_get_unmapped_area(struct file *file, +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -256,7 +262,7 @@ out: /* * set up a mapping for shared memory segments */ -int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { if (!(vma->vm_flags & VM_SHARED)) return -ENOSYS; diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 753207c8ce2..ecc730977a5 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -14,13 +14,6 @@ ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) } #else extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize); -extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags); - -extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); #endif extern const struct file_operations ramfs_file_operations; -- cgit v1.2.3-70-g09d2 From 87e06aa3a7e5f9fbc2f5215c4ba9c4a42b404192 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Thu, 23 Jan 2014 15:54:55 -0800 Subject: fs/ramfs: move ramfs_aops to inode.c ramfs_aops is identical in file-mmu.c and file-nommu.c. Thus move it to fs/ramfs/inode.c and make it static. 
Signed-off-by: Axel Lin Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-mmu.c | 7 ------- fs/ramfs/file-nommu.c | 7 ------- fs/ramfs/inode.c | 7 +++++++ fs/ramfs/internal.h | 1 - 4 files changed, 7 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 4884ac5ae9b..1e56a4e8cf7 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -30,13 +30,6 @@ #include "internal.h" -const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; - const struct file_operations ramfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 80862b1aeea..0b3d8e4cb2f 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -34,13 +34,6 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long flags); static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); -const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; - const struct file_operations ramfs_file_operations = { .mmap = ramfs_nommu_mmap, .get_unmapped_area = ramfs_nommu_get_unmapped_area, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 6a3e2c42018..d365b1c4eb3 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -43,6 +43,13 @@ static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; +static const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + static struct backing_dev_info ramfs_backing_dev_info = { .name = "ramfs", .ra_pages = 0, /* No readahead */ diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h index 6b330639b51..a9d8ae88fa1 100644 --- a/fs/ramfs/internal.h +++ b/fs/ramfs/internal.h @@ -10,5 +10,4 @@ */ -extern const struct address_space_operations ramfs_aops; extern const struct inode_operations ramfs_file_inode_operations; -- cgit v1.2.3-70-g09d2 From 6eaba35b437438988078efc92f1ef445a00cd7bc Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 23 Jan 2014 15:54:57 -0800 Subject: autofs4: allow autofs to work outside the initial PID namespace Enable autofs4 to work in a "container". oz_pgrp is converted from pid_t to struct pid and this is stored at mount time based on the "pgrp=" option or if the option is missing then the current pgrp. The "pgrp=" option is interpreted in the PID namespace of the current process. This option is flawed in that it doesn't carry the namespace information, so it should be deprecated. AFAICS the autofs daemon always sends the current pgrp, which is the default anyway. The oz_pgrp is also set from the AUTOFS_DEV_IOCTL_SETPIPEFD_CMD ioctl. This ioctl sets oz_pgrp to the current pgrp. It is not allowed to change the pid namespace. oz_pgrp is used mainly to determine whether the process traversing the autofs mount tree is the autofs daemon itself or not. This function now compares the pid pointers instead of the pid_t values. One other use of oz_pgrp is in autofs4_show_options. There is shows the virtual pid number (i.e. 
the one that is valid inside the PID namespace of the calling process) For debugging printk convert oz_pgrp to the value in the initial pid namespace. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Miklos Szeredi Acked-by: Serge Hallyn Cc: Eric Biederman Acked-by: Ian Kent Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 4 ++-- fs/autofs4/dev-ioctl.c | 16 ++++++++++++++-- fs/autofs4/inode.c | 36 +++++++++++++++++++++++++++--------- 3 files changed, 43 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 4218e26df91..acf32054edd 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -104,7 +104,7 @@ struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; - pid_t oz_pgrp; + struct pid *oz_pgrp; int catatonic; int version; int sub_version; @@ -140,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 1818ce7f5a0..3182c0e68b4 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, { int pipefd; int err = 0; + struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; @@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { - struct file *pipe = fget(pipefd); + struct file *pipe; + + new_pid = get_task_pid(current, PIDTYPE_PGID); + + if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { + AUTOFS_WARN("Not allowed to change PID namespace"); + err = -EINVAL; + goto out; + } + + pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; @@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, fput(pipe); goto out; } - sbi->oz_pgrp = task_pgrp_nr(current); + swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; } out: + put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 3b9cc9b973c..a3de082db62 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -56,8 +56,11 @@ void autofs4_kill_sb(struct super_block *sb) * just call kill_anon_super when we are called from * deactivate_super. 
*/ - if (sbi) /* Free wait queues, close pipe */ + if (sbi) { + /* Free wait queues, close pipe */ autofs4_catatonic_mode(sbi); + put_pid(sbi->oz_pgrp); + } DPRINTK("shutting down"); kill_litter_super(sb); @@ -80,7 +83,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); - seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); @@ -124,7 +127,8 @@ static const match_table_t tokens = { }; static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, - pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) + int *pgrp, bool *pgrp_set, unsigned int *type, + int *minproto, int *maxproto) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -132,7 +136,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, *uid = current_uid(); *gid = current_gid(); - *pgrp = task_pgrp_nr(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -171,6 +174,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, if (match_int(args, &option)) return 1; *pgrp = option; + *pgrp_set = true; break; case Opt_minproto: if (match_int(args, &option)) @@ -206,6 +210,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + int pgrp; + bool pgrp_set = false; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -218,7 +224,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = task_pgrp_nr(current); + sbi->oz_pgrp = NULL; sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; @@ -255,12 +261,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Can this call block? */ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, - &sbi->max_proto)) { + &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } + if (pgrp_set) { + sbi->oz_pgrp = find_get_pid(pgrp); + if (!sbi->oz_pgrp) { + pr_warn("autofs: could not find process group %d\n", + pgrp); + goto fail_dput; + } + } else { + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + } + if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); @@ -284,9 +301,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); - + if (!pipe) { printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; @@ -316,6 +333,7 @@ fail_dput: fail_ino: kfree(ino); fail_free: + put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; fail_unlock: -- cgit v1.2.3-70-g09d2 From fbff08706d12fcdb160604c4ba790df6707c32cb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 23 Jan 2014 15:54:58 -0800 Subject: autofs4: translate pids to the right namespace for the daemon The PID and the TGID of the process triggering the mount are sent to the daemon. 
Currently the global pid values are sent (ones valid in the initial pid namespace) but this is wrong if the autofs daemon itself is not running in the initial pid namespace. So send the pid values that are valid in the namespace of the autofs daemon. The namespace to use is taken from the oz_pgrp pid pointer, which was set at mount time to the mounting process' pid namespace. If the pid translation fails (the triggering process is in an unrelated pid namespace) then the automount fails with ENOENT. Signed-off-by: Miklos Szeredi Acked-by: Serge Hallyn Cc: Eric Biederman Acked-by: Ian Kent Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 689e40d983a..116fd38ee47 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -347,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, struct qstr qstr; char *name; int status, ret, type; + pid_t pid; + pid_t tgid; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) return -ENOENT; + /* + * Try translating pids to the namespace of the daemon. + * + * Zero means failure: we are in an unrelated pid namespace. + */ + pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + if (pid == 0 || tgid == 0) + return -ENOENT; + if (!dentry->d_inode) { /* * A wait for a negative dentry is invalid for certain @@ -417,8 +429,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->ino = autofs4_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); - wq->pid = current->pid; - wq->tgid = current->tgid; + wq->pid = pid; + wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; -- cgit v1.2.3-70-g09d2 From da29b7543957c6e967066f1ee18fab2feb0eeeb3 Mon Sep 17 00:00:00 2001 From: Rui Xiang Date: Thu, 23 Jan 2014 15:54:59 -0800 Subject: autofs: fix the return value of autofs4_fill_super While kzallocing sbi/ino fails, it should return -ENOMEM. And it should return the err value from autofs_prepare_pipe. Signed-off-by: Rui Xiang Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/inode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index a3de082db62..d7bd395ab58 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -212,10 +212,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) struct autofs_info *ino; int pgrp; bool pgrp_set = false; + int ret = -EINVAL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - goto fail_unlock; + return -ENOMEM; DPRINTK("starting up, sbi = %p",sbi); s->s_fs_info = sbi; @@ -249,8 +250,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) * Get the root inode and dentry, but defer checking for errors. 
*/ ino = autofs4_new_ino(sbi); - if (!ino) + if (!ino) { + ret = -ENOMEM; goto fail_free; + } root_inode = autofs4_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); if (!root) @@ -308,7 +311,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; } - if (autofs_prepare_pipe(pipe) < 0) + ret = autofs_prepare_pipe(pipe); + if (ret < 0) goto fail_fput; sbi->pipe = pipe; sbi->pipefd = pipefd; @@ -336,8 +340,7 @@ fail_free: put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; -fail_unlock: - return -EINVAL; + return ret; } struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) -- cgit v1.2.3-70-g09d2 From c24930a9bbb6219f21f670c38b9473181d5f5e10 Mon Sep 17 00:00:00 2001 From: Rui Xiang Date: Thu, 23 Jan 2014 15:55:00 -0800 Subject: autofs: use IS_ROOT to replace root dentry checks Use the helper macro !IS_ROOT to replace parent != dentry->d_parent. Just clean up. Signed-off-by: Rui Xiang Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 92ef341ba0c..2caf36ac3e9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -558,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir, dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); dir->i_mtime = CURRENT_TIME; @@ -593,7 +593,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_dec(&p_ino->count); } dput(ino->dentry); @@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); dir->i_mtime = CURRENT_TIME; -- cgit v1.2.3-70-g09d2 From 8dc51fe5ab9edcaebb5438d6462befdc6922b4a1 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 23 Jan 2014 15:55:01 -0800 Subject: autofs: fix symlinks aren't checked for expiry The autofs4 module doesn't consider symlinks for expire as it did in the older autofs v3 module (so it's actually a long standing regression). The user space daemon has focused on the use of bind mounts instead of symlinks for a long time now and that's why this has not been noticed. But with the future addition of amd map parsing to automount(8), not to mention amd itself (of am-utils), symlink expiry will be needed. The direct and offset mount types can't be symlinks and the tree mounts of version 4 were always real mounts so only indirect mounts need expire symlinks. Since the current users of the autofs4 module haven't reported this as a problem to date this patch probably isn't a candidate for backport to stable. 
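The expiry decision described above boils down to the standard jiffies timeout idiom; a minimal sketch follows, with an invented helper name (the real decision is made by autofs4_can_expire(), which the diff below calls for symlinks):

#include <linux/jiffies.h>
#include <linux/types.h>

/* Illustrative sketch: an entry is expirable once the time since it was
 * last used exceeds the timeout.  time_after() copes with jiffies wrap. */
static bool example_can_expire(unsigned long last_used, unsigned long timeout)
{
	return time_after(jiffies, last_used + timeout);
}
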
Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 14 ++++++++++++++ fs/autofs4/symlink.c | 4 ++++ 2 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 3d9d3f5d5dd..394e90b02c5 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -402,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, goto next; } + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + DPRINTK("checking symlink %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + /* + * A symlink can't be "busy" in the usual sense so + * just check last used for expire timeout. + */ + if (autofs4_can_expire(dentry, timeout, do_now)) { + expired = dentry; + goto found; + } + goto next; + } + if (simple_empty(dentry)) goto next; diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index f27c094a191..1e8ea192be2 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -14,6 +14,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino && !autofs4_oz_mode(sbi)) + ino->last_used = jiffies; nd_set_link(nd, dentry->d_inode->i_private); return NULL; } -- cgit v1.2.3-70-g09d2 From 7e775f46a125f894a1d71e96797c776dbec161f0 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 23 Jan 2014 15:55:21 -0800 Subject: fs/pipe.c: skip file_update_time on frozen fs Pipe has no data associated with fs so it is not good idea to block pipe_write() if FS is frozen, but we can not update file's time on such filesystem. Let's use same idea as we use in touch_time(). Addresses https://bugzilla.kernel.org/show_bug.cgi?id=65701 Signed-off-by: Dmitry Monakhov Reviewed-by: Jan Kara Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 0e0752ef271..78fd0d0788d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -663,10 +663,11 @@ out: wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - if (ret > 0) { + if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; + sb_end_write(file_inode(filp)->i_sb); } return ret; } -- cgit v1.2.3-70-g09d2 From 4b15d61718f0d1bd3bc32e15bffb25a31c1d5782 Mon Sep 17 00:00:00 2001 From: Wenliang Fan Date: Thu, 23 Jan 2014 15:55:22 -0800 Subject: fs/nilfs2: fix integer overflow in nilfs_ioctl_wrap_copy() The local variable 'pos' in nilfs_ioctl_wrap_copy function can overflow if a large number was passed to argv->v_index from userspace and the sum of argv->v_index and argv->v_nmembs exceeds the maximum value of __u64 type integer (= ~(__u64)0 = 18446744073709551615). Here, argv->v_index is a 64-bit width argument to specify the start position of target data items (such as segment number, checkpoint number, or virtual block address of nilfs), and argv->v_nmembs gives the total number of the items that userland programs (such as lssu, lscp, or cleanerd) want to get information about, which also gives the maximum element count of argv->v_base[] array. 
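To make the overflow concrete, a small self-contained user-space style sketch of a hostile input pair and of a guard using the same comparison the patch adds (UINT64_MAX here plays the role of ~(__u64)0):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t v_index  = UINT64_MAX - 4;	/* hostile start position */
	uint64_t v_nmembs = 10;			/* requested item count */

	/* Reject pairs whose sum would wrap around 2^64. */
	if (v_index > UINT64_MAX - v_nmembs) {
		puts("rejected: v_index + v_nmembs would overflow");
		return 1;
	}

	/* Otherwise the position can be advanced safely in the loop. */
	printf("last position: %llu\n",
	       (unsigned long long)(v_index + v_nmembs));
	return 0;
}
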
nilfs_ioctl_wrap_copy() calls dofunc() repeatedly and increments the position variable 'pos' at the end of each iteration if dofunc() itself didn't update 'pos': if (pos == ppos) pos += n; This patch prevents the overflow here by rejecting pairs of a start position (argv->v_index) and a total count (argv->v_nmembs) which leads to the overflow. [konishi.ryusuke@lab.ntt.co.jp: fix signedness issue] Signed-off-by: Wenliang Fan Cc: Vyacheslav Dubeyko Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/ioctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index b44bdb291b8..d22281d6007 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -57,6 +57,14 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, if (argv->v_size > PAGE_SIZE) return -EINVAL; + /* + * Reject pairs of a start item position (argv->v_index) and a + * total count (argv->v_nmembs) which leads position 'pos' to + * overflow by the increment at the end of the loop. + */ + if (argv->v_index > ~(__u64)0 - argv->v_nmembs) + return -EINVAL; + buf = (void *)__get_free_pages(GFP_NOFS, 0); if (unlikely(!buf)) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From d623a9420c9ae2b748ba458c0e9d59084419fce0 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Thu, 23 Jan 2014 15:55:23 -0800 Subject: nilfs2: add comments for ioctls Add comments for ioctls in fs/nilfs2/ioctl.c file and describe NILFS2 specific ioctls in Documentation/filesystems/nilfs2.txt. Signed-off-by: Vyacheslav Dubeyko Reviewed-by: Ryusuke Konishi Cc: Wenliang Fan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 56 ++++++ fs/nilfs2/ioctl.c | 363 ++++++++++++++++++++++++++++++++++- 2 files changed, 418 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 873a2ab2e9f..06887d46ccf 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -81,6 +81,62 @@ nodiscard(*) The discard/TRIM commands are sent to the underlying block device when blocks are freed. This is useful for SSD devices and sparse/thinly-provisioned LUNs. +Ioctls +====== + +There is some NILFS2 specific functionality which can be accessed by applications +through the system call interfaces. The list of all NILFS2 specific ioctls are +shown in the table below. + +Table of NILFS2 specific ioctls +.............................................................................. + Ioctl Description + NILFS_IOCTL_CHANGE_CPMODE Change mode of given checkpoint between + checkpoint and snapshot state. This ioctl is + used in chcp and mkcp utilities. + + NILFS_IOCTL_DELETE_CHECKPOINT Remove checkpoint from NILFS2 file system. + This ioctl is used in rmcp utility. + + NILFS_IOCTL_GET_CPINFO Return info about requested checkpoints. This + ioctl is used in lscp utility and by + nilfs_cleanerd daemon. + + NILFS_IOCTL_GET_CPSTAT Return checkpoints statistics. This ioctl is + used by lscp, rmcp utilities and by + nilfs_cleanerd daemon. + + NILFS_IOCTL_GET_SUINFO Return segment usage info about requested + segments. This ioctl is used in lssu, + nilfs_resize utilities and by nilfs_cleanerd + daemon. + + NILFS_IOCTL_GET_SUSTAT Return segment usage statistics. This ioctl + is used in lssu, nilfs_resize utilities and + by nilfs_cleanerd daemon. + + NILFS_IOCTL_GET_VINFO Return information on virtual block addresses. 
+ This ioctl is used by nilfs_cleanerd daemon. + + NILFS_IOCTL_GET_BDESCS Return information about descriptors of disk + block numbers. This ioctl is used by + nilfs_cleanerd daemon. + + NILFS_IOCTL_CLEAN_SEGMENTS Do garbage collection operation in the + environment of requested parameters from + userspace. This ioctl is used by + nilfs_cleanerd daemon. + + NILFS_IOCTL_SYNC Make a checkpoint. This ioctl is used in + mkcp utility. + + NILFS_IOCTL_RESIZE Resize NILFS2 volume. This ioctl is used + by nilfs_resize utility. + + NILFS_IOCTL_SET_ALLOC_RANGE Define lower limit of segments in bytes and + upper limit of segments in bytes. This ioctl + is used by nilfs_resize utility. + NILFS2 usage ============ diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index d22281d6007..2b34021948e 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -37,7 +37,26 @@ #include "sufile.h" #include "dat.h" - +/** + * nilfs_ioctl_wrap_copy - wrapping function of get/set metadata info + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @dir: set of direction flags + * @dofunc: concrete function of get/set metadata info + * + * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of + * calling dofunc() function on the basis of @argv argument. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, struct nilfs_argv *argv, int dir, ssize_t (*dofunc)(struct the_nilfs *, @@ -107,6 +126,9 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_ioctl_getflags - ioctl to support lsattr + */ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) { unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; @@ -114,6 +136,9 @@ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) return put_user(flags, (int __user *)argp); } +/** + * nilfs_ioctl_setflags - ioctl to support chattr + */ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, void __user *argp) { @@ -166,11 +191,33 @@ out: return ret; } +/** + * nilfs_ioctl_getversion - get info about a file's version (generation number) + */ static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) { return put_user(inode->i_generation, (int __user *)argp); } +/** + * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot) + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_change_cpmode() function changes mode of + * given checkpoint between checkpoint and snapshot state. This ioctl + * is used in chcp and mkcp utilities. + * + * Return Value: On success, 0 is returned and mode of a checkpoint is + * changed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint mode changing. 
+ */ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -206,6 +253,25 @@ out: return ret; } +/** + * nilfs_ioctl_delete_checkpoint - remove checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_delete_checkpoint() function removes + * checkpoint from NILFS2 file system. This ioctl is used in rmcp + * utility. + * + * Return Value: On success, 0 is returned and a checkpoint is + * removed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint removing. + */ static int nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -237,6 +303,21 @@ out: return ret; } +/** + * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints + * @nilfs: nilfs object + * @posp: pointer on array of checkpoint's numbers + * @flags: checkpoint mode (checkpoint or snapshot) + * @buf: buffer for storing checkponts' info + * @size: size in bytes of one checkpoint info item in array + * @nmembs: number of checkpoints in array (numbers and infos) + * + * Description: nilfs_ioctl_do_get_cpinfo() function returns info about + * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in + * lscp utility and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_cpinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -250,6 +331,27 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_get_cpstat - get checkpoints statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints. + * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting checkpoints statistics. + */ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -268,6 +370,21 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info + * @nilfs: nilfs object + * @posp: pointer on array of segment numbers + * @flags: *not used* + * @buf: buffer for storing suinfo array + * @size: size in bytes of one suinfo item in array + * @nmembs: count of segment numbers and suinfos in array + * + * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage + * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used + * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_suinfo structures in output buffer. 
+ */ static ssize_t nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -281,6 +398,27 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_get_sustat - get segment usage statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_sustat() returns segment usage statistics. + * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and segment usage information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting segment usage statistics. + */ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -299,6 +437,21 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_vinfo structures + * @size: size in bytes of one vinfo item in array + * @nmembs: count of vinfos in array + * + * Description: nilfs_ioctl_do_get_vinfo() function returns information + * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used + * by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_vinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -311,6 +464,21 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_bdesc structures + * @size: size in bytes of one bdesc item in array + * @nmembs: count of bdescs in array + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_bdescs structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -337,6 +505,29 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, return nmembs; } +/** + * nilfs_ioctl_get_bdescs - get disk block descriptors + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and disk block descriptors are + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ * + * %-EFAULT - Failure during getting disk block descriptors. + */ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -360,6 +551,26 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC + * @inode: inode object + * @vdesc: descriptor of virtual block number + * @buffers: list of moving buffers + * + * Description: nilfs_ioctl_move_inode_block() function registers data/node + * buffer in the GC pagecache and submit read request. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Requested block doesn't exist. + * + * %-EEXIST - Blocks conflict is detected. + */ static int nilfs_ioctl_move_inode_block(struct inode *inode, struct nilfs_vdesc *vdesc, struct list_head *buffers) @@ -405,6 +616,19 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, return 0; } +/** + * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection + * @sb: superblock object + * @argv: vector of arguments from userspace + * @buf: array of nilfs_vdesc structures + * + * Description: nilfs_ioctl_move_blocks() function reads valid data/node + * blocks that garbage collector specified with the array of nilfs_vdesc + * structures and stores them into page caches of GC inodes. + * + * Return Value: Number of processed nilfs_vdesc structures or + * error code, otherwise. + */ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) { @@ -470,6 +694,25 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, return ret; } +/** + * nilfs_ioctl_delete_checkpoints - delete checkpoints + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of periods of checkpoints numbers + * + * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints + * in the period from p_start to p_end, excluding p_end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: Number of processed nilfs_period structures or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -487,6 +730,24 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, return nmembs; } +/** + * nilfs_ioctl_free_vblocknrs - free virtual block numbers + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of virtual block numbers + * + * Description: nilfs_ioctl_free_vblocknrs() function frees + * the virtual block numbers specified by @buf and @argv->v_nmembs. + * + * Return Value: Number of processed virtual block numbers or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. + */ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -498,6 +759,24 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, return (ret < 0) ? 
ret : nmembs; } +/** + * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of block descriptors + * + * Description: nilfs_ioctl_mark_blocks_dirty() function marks + * metadata file or data blocks as dirty. + * + * Return Value: Number of processed block descriptors or + * error code, otherwise. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + */ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -579,6 +858,20 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_ioctl_clean_segments - clean segments + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_clean_segments() function makes garbage + * collection operation in the environment of requested parameters + * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by + * nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -690,6 +983,33 @@ out: return ret; } +/** + * nilfs_ioctl_sync - make a checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_sync() function constructs a logical segment + * for checkpointing. This function guarantees that all modified data + * and metadata are written out to the device when it successfully + * returned. + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -718,6 +1038,14 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, return 0; } +/** + * nilfs_ioctl_resize - resize NILFS2 volume + * @inode: inode object + * @filp: file object + * @argp: pointer on argument from userspace + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, void __user *argp) { @@ -743,6 +1071,17 @@ out: return ret; } +/** + * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit + * of segments in bytes and upper limit of segments in bytes. + * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility. + * + * Return Value: On success, 0 is returned or error code, otherwise. 
+ */ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; @@ -775,6 +1114,28 @@ out: return ret; } +/** + * nilfs_ioctl_get_info - wrapping function of get metadata info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * @membsz: size of an item in bytes + * @dofunc: concrete function of getting metadata info + * + * Description: nilfs_ioctl_get_info() gets metadata info by means of + * calling dofunc() function. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp, size_t membsz, -- cgit v1.2.3-70-g09d2 From d74a054fa4f5a3fc05eae11b3ff0b653b49dd7cb Mon Sep 17 00:00:00 2001 From: Sougata Santra Date: Thu, 23 Jan 2014 15:55:25 -0800 Subject: hfsplus: remove hfsplus_file_lookup() HFS+ resource fork lookup breaks opendir() library function. Since opendir first calls open() with O_DIRECTORY flag set. O_DIRECTORY means "refuse to open if not a directory". The open system call in the kernel does a check for inode->i_op->lookup and returns -ENOTDIR. So if hfsplus_file_lookup is set it allows opendir() for plain files. Also resource fork lookup in HFS+ does not work. Since it is never invoked after VFS permission checking. It will always return with -EACCES. When we call opendir() on a file, it does not return NULL. opendir() library call is based on open with O_DIRECTORY flag passed and then layered on top of getdents() system call. O_DIRECTORY means "refuse to open if not a directory". The open() system call in the kernel does a check for: do_sys_open() -->..--> can_lookup() i.e it only checks inode->i_op->lookup and returns ENOTDIR if this function pointer is not set. In OSX, we can open "file/rsrc" to get the resource fork of "file". This behavior is emulated inside hfsplus on Linux, which means that to some degree every file acts like a directory. That is the reason lookup() inode operations is supported for files, and it is possible to do a lookup on this specific name. As a result of this open succeeds without returning ENOTDIR for HFS+ Please see the LKML discussion thread on this issue: http://marc.info/?l=linux-fsdevel&m=122823343730412&w=2 I tried to test file/rsrc lookup in HFS+ driver and the feature does not work. From OSX: $ touch test $ echo "1234" > test/..namedfork/rsrc $ ls -l test..namedfork/rsrc --rw-r--r-- 1 tuxera staff 5 10 dec 12:59 test/..namedfork/rsrc [sougata@ultrabook tmp]$ id uid=1000(sougata) gid=1000(sougata) groups=1000(sougata),5(tty),18(dialout),1001(vboxusers) [sougata@ultrabook tmp]$ mount /dev/sdb1 on /mnt/tmp type hfsplus (rw,relatime,umask=0,uid=1000,gid=1000,nls=utf8) [sougata@ultrabook tmp]$ ls -l test/rsrc ls: cannot access test/rsrc: Permission denied According to this LKML thread it is expected behavior. http://marc.info/?t=121139033800008&r=1&w=4 I guess now that permission checking happens in vfs generic_permission() ? So it turns out that even though the lookup() inode_operation exists for HFS+ files. It cannot really get invoked ?. 
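As an illustration of the behaviour described above, a minimal userspace probe (the path is hypothetical and error handling is trimmed):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* "test" is assumed to be a plain file on the hfsplus mount */
	int fd = open("test", O_RDONLY | O_DIRECTORY);

	if (fd < 0) {
		/* expected result on most filesystems: ENOTDIR */
		printf("open(O_DIRECTORY): %s\n", strerror(errno));
	} else {
		/* pre-fix hfsplus: succeeds because ->lookup is set on files */
		printf("open(O_DIRECTORY) unexpectedly succeeded (fd %d)\n", fd);
		close(fd);
	}
	return 0;
}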
So if we can disable this feature to make opendir() work for HFS+. Signed-off-by: Sougata Santra Acked-by: Christoph Hellwig Cc: Vyacheslav Dubeyko Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/inode.c | 59 ------------------------------------------------------ 1 file changed, 59 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 37213d075f3..3ebda928229 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -178,64 +178,6 @@ const struct dentry_operations hfsplus_dentry_operations = { .d_compare = hfsplus_compare_dentry, }; -static struct dentry *hfsplus_file_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) -{ - struct hfs_find_data fd; - struct super_block *sb = dir->i_sb; - struct inode *inode = NULL; - struct hfsplus_inode_info *hip; - int err; - - if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) - goto out; - - inode = HFSPLUS_I(dir)->rsrc_inode; - if (inode) - goto out; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - hip = HFSPLUS_I(inode); - inode->i_ino = dir->i_ino; - INIT_LIST_HEAD(&hip->open_dir_list); - mutex_init(&hip->extents_lock); - hip->extent_state = 0; - hip->flags = 0; - hip->userflags = 0; - set_bit(HFSPLUS_I_RSRC, &hip->flags); - - err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); - if (!err) { - err = hfsplus_find_cat(sb, dir->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); - } - if (err) { - iput(inode); - return ERR_PTR(err); - } - hip->rsrc_inode = dir; - HFSPLUS_I(dir)->rsrc_inode = inode; - igrab(dir); - - /* - * __mark_inode_dirty expects inodes to be hashed. Since we don't - * want resource fork inodes in the regular inode space, we make them - * appear hashed, but do not put on any lists. hlist_del() - * will work fine and require no locking. - */ - hlist_add_fake(&inode->i_hash); - - mark_inode_dirty(inode); -out: - d_add(dentry, inode); - return NULL; -} - static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) { @@ -385,7 +327,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } static const struct inode_operations hfsplus_file_inode_operations = { - .lookup = hfsplus_file_lookup, .setattr = hfsplus_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, -- cgit v1.2.3-70-g09d2 From abacd2fe3ca10b3ade57f3634053241a660002c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:31 -0800 Subject: coredump: set_dumpable: fix the theoretical race with itself set_dumpable() updates MMF_DUMPABLE_MASK in a non-trivial way to ensure that get_dumpable() can't observe the intermediate state, but this all can't help if multiple threads call set_dumpable() at the same time. And in theory commit_creds()->set_dumpable(SUID_DUMP_ROOT) racing with sys_prctl()->set_dumpable(SUID_DUMP_DISABLE) can result in SUID_DUMP_USER. Change this code to update both bits atomically via cmpxchg(). Note: this assumes that it is safe to mix bitops and cmpxchg. IOW, if, say, an architecture implements cmpxchg() using the locking (like arch/parisc/lib/bitops.c does), then it should use the same locks for set_bit/etc. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Alex Kelly Cc: "Eric W. 
Biederman" Cc: Josh Triplett Cc: Petr Matousek Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 49 +++++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 7ea097f6b34..f039386499d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1614,43 +1614,24 @@ EXPORT_SYMBOL(set_binfmt); /* * set_dumpable converts traditional three-value dumpable to two flags and - * stores them into mm->flags. It modifies lower two bits of mm->flags, but - * these bits are not changed atomically. So get_dumpable can observe the - * intermediate state. To avoid doing unexpected behavior, get get_dumpable - * return either old dumpable or new one by paying attention to the order of - * modifying the bits. - * - * dumpable | mm->flags (binary) - * old new | initial interim final - * ---------+----------------------- - * 0 1 | 00 01 01 - * 0 2 | 00 10(*) 11 - * 1 0 | 01 00 00 - * 1 2 | 01 11 11 - * 2 0 | 11 10(*) 00 - * 2 1 | 11 11 01 - * - * (*) get_dumpable regards interim value of 10 as 11. + * stores them into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { - switch (value) { - case SUID_DUMP_DISABLE: - clear_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_USER: - set_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_ROOT: - set_bit(MMF_DUMP_SECURELY, &mm->flags); - smp_wmb(); - set_bit(MMF_DUMPABLE, &mm->flags); - break; - } + unsigned long old, new; + + do { + old = ACCESS_ONCE(mm->flags); + new = old & ~MMF_DUMPABLE_MASK; + + switch (value) { + case SUID_DUMP_ROOT: + new |= (1 << MMF_DUMP_SECURELY); + case SUID_DUMP_USER: + new |= (1<< MMF_DUMPABLE); + } + + } while (cmpxchg(&mm->flags, old, new) != old); } int __get_dumpable(unsigned long mm_flags) -- cgit v1.2.3-70-g09d2 From 7288e1187ba935996232246916418c64bb88da30 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:32 -0800 Subject: coredump: kill MMF_DUMPABLE and MMF_DUMP_SECURELY Nobody actually needs MMF_DUMPABLE/MMF_DUMP_SECURELY, they are only used to enforce the encoding of SUID_DUMP_* enum in mm->flags & MMF_DUMPABLE_MASK. Now that set_dumpable() updates both bits atomically we can kill them and simply store the value "as is" in 2 lower bits. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Alex Kelly Cc: "Eric W. Biederman" Cc: Josh Triplett Cc: Petr Matousek Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 21 ++++++--------------- include/linux/sched.h | 4 +--- 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index f039386499d..f798da06aba 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1613,33 +1613,24 @@ void set_binfmt(struct linux_binfmt *new) EXPORT_SYMBOL(set_binfmt); /* - * set_dumpable converts traditional three-value dumpable to two flags and - * stores them into mm->flags. + * set_dumpable stores three-value SUID_DUMP_* into mm->flags. 
*/ void set_dumpable(struct mm_struct *mm, int value) { unsigned long old, new; + if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) + return; + do { old = ACCESS_ONCE(mm->flags); - new = old & ~MMF_DUMPABLE_MASK; - - switch (value) { - case SUID_DUMP_ROOT: - new |= (1 << MMF_DUMP_SECURELY); - case SUID_DUMP_USER: - new |= (1<< MMF_DUMPABLE); - } - + new = (old & ~MMF_DUMPABLE_MASK) | value; } while (cmpxchg(&mm->flags, old, new) != old); } int __get_dumpable(unsigned long mm_flags) { - int ret; - - ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; + return mm_flags & MMF_DUMPABLE_MASK; } /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 485234d2fd4..124430ba569 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -400,10 +400,8 @@ extern int get_dumpable(struct mm_struct *mm); #define SUID_DUMP_ROOT 2 /* Dump as root */ /* mm flags */ -/* dumpable bits */ -#define MMF_DUMPABLE 0 /* core dump is permitted */ -#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ +/* for SUID_DUMP_* above */ #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) -- cgit v1.2.3-70-g09d2 From 942be3875a1931c379bbc37053829dd6847e0f3f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:34 -0800 Subject: coredump: make __get_dumpable/get_dumpable inline, kill fs/coredump.h 1. Remove fs/coredump.h. It is not clear why do we need it, it only declares __get_dumpable(), signal.c includes it for no reason. 2. Now that get_dumpable() and __get_dumpable() are really trivial make them inline in linux/sched.h. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Alex Kelly Cc: "Eric W. Biederman" Cc: Josh Triplett Cc: Petr Matousek Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 1 - fs/coredump.h | 6 ------ fs/exec.c | 18 ------------------ include/linux/sched.h | 21 +++++++++++++++++---- 4 files changed, 17 insertions(+), 29 deletions(-) delete mode 100644 fs/coredump.h (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index bc3fbcd3255..e3ad709a423 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -40,7 +40,6 @@ #include #include "internal.h" -#include "coredump.h" #include diff --git a/fs/coredump.h b/fs/coredump.h deleted file mode 100644 index e39ff072110..00000000000 --- a/fs/coredump.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _FS_COREDUMP_H -#define _FS_COREDUMP_H - -extern int __get_dumpable(unsigned long mm_flags); - -#endif diff --git a/fs/exec.c b/fs/exec.c index f798da06aba..9cbad5b0187 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -62,7 +62,6 @@ #include #include "internal.h" -#include "coredump.h" #include @@ -1609,7 +1608,6 @@ void set_binfmt(struct linux_binfmt *new) if (new) __module_get(new->module); } - EXPORT_SYMBOL(set_binfmt); /* @@ -1628,22 +1626,6 @@ void set_dumpable(struct mm_struct *mm, int value) } while (cmpxchg(&mm->flags, old, new) != old); } -int __get_dumpable(unsigned long mm_flags) -{ - return mm_flags & MMF_DUMPABLE_MASK; -} - -/* - * This returns the actual value of the suid_dumpable flag. For things - * that are using this for checking for privilege transitions, it must - * test against SUID_DUMP_USER rather than treating it as a boolean - * value. 
- */ -int get_dumpable(struct mm_struct *mm) -{ - return __get_dumpable(mm->flags); -} - SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, diff --git a/include/linux/sched.h b/include/linux/sched.h index 124430ba569..cf9e414dbb9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -391,10 +391,6 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif - -extern void set_dumpable(struct mm_struct *mm, int value); -extern int get_dumpable(struct mm_struct *mm); - #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ @@ -405,6 +401,23 @@ extern int get_dumpable(struct mm_struct *mm); #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +extern void set_dumpable(struct mm_struct *mm, int value); +/* + * This returns the actual value of the suid_dumpable flag. For things + * that are using this for checking for privilege transitions, it must + * test against SUID_DUMP_USER rather than treating it as a boolean + * value. + */ +static inline int __get_dumpable(unsigned long mm_flags) +{ + return mm_flags & MMF_DUMPABLE_MASK; +} + +static inline int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 -- cgit v1.2.3-70-g09d2 From 74e37200de8e9c4e09b70c21c3f13c2071e77457 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:35 -0800 Subject: proc: cleanup/simplify get_task_state/task_state_array get_task_state() and task_state_array[] look confusing and suboptimal, it is not clear what it can actually report to user-space and task_state_array[] blows .data for no reason. 1. state = (tsk->state & TASK_REPORT) | tsk->exit_state is not clear. TASK_REPORT is self-documenting but it is not clear what ->exit_state can add. Move the potential exit_state's (EXIT_ZOMBIE and EXIT_DEAD) into TASK_REPORT and use it to calculate the final result. 2. With the change above it is obvious that task_state_array[] has the unused entries just to make BUILD_BUG_ON() happy. Change this BUILD_BUG_ON() to use TASK_REPORT rather than TASK_STATE_MAX and shrink task_state_array[]. 3. Turn the "while (state)" loop into fls(state). 
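A minimal userspace sketch of the fls()-based lookup (the fls() here is hand-rolled, and the bit values are assumptions mirroring TASK_RUNNING..EXIT_DEAD as laid out by the patch):

#include <stdio.h>

static const char * const task_state_array[] = {
	"R (running)",		/*  0 */
	"S (sleeping)",		/*  1 */
	"D (disk sleep)",	/*  2 */
	"T (stopped)",		/*  4 */
	"t (tracing stop)",	/*  8 */
	"Z (zombie)",		/* 16 */
	"X (dead)",		/* 32 */
};

/* position of the most significant set bit, 1-based; 0 when no bit is set */
static int fls_demo(unsigned int x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned int samples[] = { 0, 1, 2, 4, 8, 16, 32 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("state 0x%02x -> %s\n", samples[i],
		       task_state_array[fls_demo(samples[i])]);
	return 0;
}

fls() returns the position of the highest set bit, which is exactly the index the old shift-and-increment loop computed, so the table lookup stays correct while the loop disappears.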
Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: David Laight Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 15 +++------------ include/linux/sched.h | 2 +- 2 files changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 1bd2077187f..554a0b229ac 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -140,24 +140,15 @@ static const char * const task_state_array[] = { "t (tracing stop)", /* 8 */ "Z (zombie)", /* 16 */ "X (dead)", /* 32 */ - "x (dead)", /* 64 */ - "K (wakekill)", /* 128 */ - "W (waking)", /* 256 */ - "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; - const char * const *p = &task_state_array[0]; + unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - while (state) { - p++; - state >>= 1; - } - return *p; + return task_state_array[fls(state)]; } static inline void task_state(struct seq_file *m, struct pid_namespace *ns, diff --git a/include/linux/sched.h b/include/linux/sched.h index cf9e414dbb9..33e4e9e1f62 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -229,7 +229,7 @@ extern char ___assert_task_state[1 - 2*!!( /* get_task_state() */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED) + __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -- cgit v1.2.3-70-g09d2 From 940fe4793a219375c4713a17c61b843720807c9d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:36 -0800 Subject: proc: fix the potential use-after-free in first_tid() proc_task_readdir() verifies that the result of get_proc_task() is pid_alive() and thus its ->group_leader is fine too. However this is not necessarily true after rcu_read_unlock(), we need to recheck this again after first_tid() does rcu_read_lock(). Otherwise leader->thread_group.next (used by next_thread()) can be invalid if the rcu grace period expires in between. The race is subtle and unlikely, but still it is possible afaics. To simplify lets ignore the "likely" case when tid != 0, f_version can be cleared by proc_task_operations->llseek(). Suppose we have a main thread M and its subthread T. Suppose that f_pos == 3, iow first_tid() should return T. Now suppose that the following happens between rcu_read_unlock() and rcu_read_lock(): 1. T execs and becomes the new leader. This removes M from ->thread_group but next_thread(M) is still T. 2. T creates another thread X which does exec as well, T goes away. 3. X creates another subthread, this increments nr_threads. 4. first_tid() does next_thread(M) and returns the already dead T. Note also that we need 2. and 3. only because of get_nr_threads() check, and this check was supposed to be optimization only. Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: Michal Hocko Cc: Sameer Nanda Cc: Sergey Dyasly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 03c8d747be4..f223a56e613 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3109,6 +3109,9 @@ static struct task_struct *first_tid(struct task_struct *leader, pos = NULL; if (nr && nr >= get_nr_threads(leader)) goto out; + /* It could be unhashed before we take rcu lock */ + if (!pid_alive(leader)) + goto out; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. -- cgit v1.2.3-70-g09d2 From c986c14a6a88427946dc77d7018a81b95b3d41b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:38 -0800 Subject: proc: change first_tid() to use while_each_thread() rather than next_thread() Rerwrite the main loop to use while_each_thread() instead of next_thread(). We are going to fix or replace while_each_thread(), next_thread() should be avoided whenever possible. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Michal Hocko Cc: Sameer Nanda Cc: Sergey Dyasly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index f223a56e613..be8e17cabfc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3106,23 +3106,23 @@ static struct task_struct *first_tid(struct task_struct *leader, } /* If nr exceeds the number of threads there is nothing todo */ - pos = NULL; if (nr && nr >= get_nr_threads(leader)) - goto out; + goto fail; /* It could be unhashed before we take rcu lock */ if (!pid_alive(leader)) - goto out; + goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - for (pos = leader; nr > 0; --nr) { - pos = next_thread(pos); - if (pos == leader) { - pos = NULL; - goto out; - } - } + pos = leader; + do { + if (nr-- <= 0) + goto found; + } while_each_thread(leader, pos); +fail: + pos = NULL; + goto out; found: get_task_struct(pos); out: -- cgit v1.2.3-70-g09d2 From d855a4b79f49ea07d1827fc0591490a6a324148b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:39 -0800 Subject: proc: don't (ab)use ->group_leader in proc_task_readdir() paths proc_task_readdir() does not really need "leader", first_tid() has to revalidate it anyway. Just pass proc_pid(inode) to first_tid() instead, it can do pid_task(PIDTYPE_PID) itself and read ->group_leader only if necessary. The patch also extracts the "inode is dead" code from pid_delete_dentry(dentry) into the new trivial helper, proc_inode_is_dead(inode), proc_task_readdir() uses it to return -ENOENT if this dir was removed. This is a bit racy, but the race is very inlikely and the getdents() after openndir() can see the empty "." + ".." dir only once. Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: Michal Hocko Cc: Sameer Nanda Cc: Sergey Dyasly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 52 ++++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index be8e17cabfc..9b423fec973 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1658,13 +1658,18 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } +static inline bool proc_inode_is_dead(struct inode *inode) +{ + return !proc_pid(inode)->tasks[PIDTYPE_PID].first; +} + int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; + return proc_inode_is_dead(dentry->d_inode); } const struct dentry_operations pid_dentry_operations = @@ -3092,34 +3097,35 @@ out_no_task: * In the case of a seek we start with the leader and walk nr * threads past it. */ -static struct task_struct *first_tid(struct task_struct *leader, - int tid, int nr, struct pid_namespace *ns) +static struct task_struct *first_tid(struct pid *pid, int tid, + int nr, struct pid_namespace *ns) { - struct task_struct *pos; + struct task_struct *pos, *task; rcu_read_lock(); - /* Attempt to start with the pid of a thread */ + task = pid_task(pid, PIDTYPE_PID); + if (!task) + goto fail; + + /* Attempt to start with the tid of a thread */ if (tid && (nr > 0)) { pos = find_task_by_pid_ns(tid, ns); - if (pos && (pos->group_leader == leader)) + if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ - if (nr && nr >= get_nr_threads(leader)) - goto fail; - /* It could be unhashed before we take rcu lock */ - if (!pid_alive(leader)) + if (nr && nr >= get_nr_threads(task)) goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - pos = leader; + pos = task = task->group_leader; do { if (nr-- <= 0) goto found; - } while_each_thread(leader, pos); + } while_each_thread(task, pos); fail: pos = NULL; goto out; @@ -3155,25 +3161,16 @@ static struct task_struct *next_tid(struct task_struct *start) /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file *file, struct dir_context *ctx) { - struct task_struct *leader = NULL; - struct task_struct *task = get_proc_task(file_inode(file)); + struct inode *inode = file_inode(file); + struct task_struct *task; struct pid_namespace *ns; int tid; - if (!task) - return -ENOENT; - rcu_read_lock(); - if (pid_alive(task)) { - leader = task->group_leader; - get_task_struct(leader); - } - rcu_read_unlock(); - put_task_struct(task); - if (!leader) + if (proc_inode_is_dead(inode)) return -ENOENT; if (!dir_emit_dots(file, ctx)) - goto out; + return 0; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. 
@@ -3181,7 +3178,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) ns = file->f_dentry->d_sb->s_fs_info; tid = (int)file->f_version; file->f_version = 0; - for (task = first_tid(leader, tid, ctx->pos - 2, ns); + for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[PROC_NUMBUF]; @@ -3197,8 +3194,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) break; } } -out: - put_task_struct(leader); + return 0; } -- cgit v1.2.3-70-g09d2 From 9f6e963f06c19a57a876cb77a9c87f6a56295b13 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:40 -0800 Subject: proc: fix ->f_pos overflows in first_tid() 1. proc_task_readdir()->first_tid() path truncates f_pos to int, this is wrong even on 64bit. We could check that f_pos < PID_MAX or even INT_MAX in proc_task_readdir(), but this patch simply checks the potential overflow in first_tid(), this check is nop on 64bit. We do not care if it was negative and the new unsigned value is huge, all we need to ensure is that we never wrongly return !NULL. 2. Remove the 2nd "nr != 0" check before get_nr_threads(), nr_threads == 0 is not distinguishable from !pid_task() above. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Michal Hocko Cc: Sameer Nanda Cc: Sergey Dyasly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 9b423fec973..51507065263 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3097,10 +3097,14 @@ out_no_task: * In the case of a seek we start with the leader and walk nr * threads past it. */ -static struct task_struct *first_tid(struct pid *pid, int tid, - int nr, struct pid_namespace *ns) +static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, + struct pid_namespace *ns) { struct task_struct *pos, *task; + unsigned long nr = f_pos; + + if (nr != f_pos) /* 32bit overflow? */ + return NULL; rcu_read_lock(); task = pid_task(pid, PIDTYPE_PID); @@ -3108,14 +3112,14 @@ static struct task_struct *first_tid(struct pid *pid, int tid, goto fail; /* Attempt to start with the tid of a thread */ - if (tid && (nr > 0)) { + if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ - if (nr && nr >= get_nr_threads(task)) + if (nr >= get_nr_threads(task)) goto fail; /* If we haven't found our starting place yet start @@ -3123,7 +3127,7 @@ static struct task_struct *first_tid(struct pid *pid, int tid, */ pos = task = task->group_leader; do { - if (nr-- <= 0) + if (!nr--) goto found; } while_each_thread(task, pos); fail: -- cgit v1.2.3-70-g09d2 From cdf7e8dded6212cb29f758017a613e4eefc4ce9e Mon Sep 17 00:00:00 2001 From: Rui Xiang Date: Thu, 23 Jan 2014 15:55:41 -0800 Subject: proc: set attributes of pde using accessor functions Use existing accessors proc_set_user() and proc_set_size() to set attributes. Just a cleanup. 
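For context, the accessors do nothing more than assign the corresponding proc_dir_entry fields; a sketch of their bodies (not a verbatim copy):

void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
	de->size = size;
}

void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
{
	de->uid = uid;
	de->gid = gid;
}

So the patch changes no behaviour; it only routes the assignments through the helpers instead of poking the fields directly.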
Signed-off-by: Rui Xiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 3 +-- fs/proc/proc_devtree.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index cca93b6fb9a..b7f268eb5f4 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -49,8 +49,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) setattr_copy(inode, iattr); mark_inode_dirty(inode); - de->uid = inode->i_uid; - de->gid = inode->i_gid; + proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; return 0; } diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 70779b2fc20..c824187251f 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -74,9 +74,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, return NULL; if (!strncmp(name, "security-", 9)) - ent->size = 0; /* don't leak number of password chars */ + proc_set_size(ent, 0); /* don't leak number of password chars */ else - ent->size = pp->length; + proc_set_size(ent, pp->length); return ent; } -- cgit v1.2.3-70-g09d2 From c1d867a54d426b45da017fbe8e585f8a3064ce8d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 23 Jan 2014 15:55:43 -0800 Subject: fs/proc/proc_devtree.c: remove empty /proc/device-tree when no openfirmware exists. Distribution kernels might want to build in support for /proc/device-tree for kernels that might end up running on hardware that doesn't support openfirmware. This results in an empty /proc/device-tree existing. Remove it if the OFW root node doesn't exist. This situation actually confuses grub2, resulting in install failures. grub2 sees the /proc/device-tree and picks the wrong install target cf. http://bzr.savannah.gnu.org/lh/grub/trunk/grub/annotate/4300/util/grub-install.in#L311 grub should be more robust, but still, leaving an empty proc dir seems pointless. Addresses https://bugzilla.redhat.com/show_bug.cgi?id=818378. Signed-off-by: Dave Jones Cc: Al Viro Cc: Paul Mackerras Cc: Josh Boyer Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_devtree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index c824187251f..c82dd514784 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -232,6 +232,7 @@ void __init proc_device_tree_init(void) return; root = of_find_node_by_path("/"); if (root == NULL) { + remove_proc_entry("device-tree", NULL); pr_debug("/proc/device-tree: can't find root\n"); return; } -- cgit v1.2.3-70-g09d2 From 3d93116cef306bd516a7645e7b4895d1d0ceec2b Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Thu, 23 Jan 2014 15:55:44 -0800 Subject: fs/proc_namespace.c: simplify testing nsp and nsp->mnt_ns Trivial cleanup to eliminate a goto. 
Signed-off-by: Axel Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc_namespace.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 439406e081a..7be26f03a3f 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -234,17 +234,12 @@ static int mounts_open_common(struct inode *inode, struct file *file, rcu_read_lock(); nsp = task_nsproxy(task); - if (!nsp) { + if (!nsp || !nsp->mnt_ns) { rcu_read_unlock(); put_task_struct(task); goto err; } ns = nsp->mnt_ns; - if (!ns) { - rcu_read_unlock(); - put_task_struct(task); - goto err; - } get_mnt_ns(ns); rcu_read_unlock(); task_lock(task); -- cgit v1.2.3-70-g09d2 From abaf3787ac26ba33e2f75e76b1174c32254c25b0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 23 Jan 2014 15:55:45 -0800 Subject: fs/proc: don't use module_init for non-modular core code PROC_FS is a bool, so this code is either present or absent. It will never be modular, so using module_init as an alias for __initcall is rather misleading. Fix this up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be ugly at best. Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of fs_initcall (which makes sense for fs code) will thus change these registrations from level 6-device to level 5-fs (i.e. slightly earlier). However no observable impact of that small difference has been observed during testing, or is expected. Also note that this change uncovers a missing semicolon bug in the registration of vmcore_init as an initcall. 
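A rough sketch of the macro chain involved, paraphrasing the init/module headers of that era (built-in, non-modular case; exact definitions vary by kernel version):

#define module_init(fn)		__initcall(fn)
#define __initcall(fn)		device_initcall(fn)		/* level 6 */
#define device_initcall(fn)	__define_initcall(fn, 6)
#define fs_initcall(fn)		__define_initcall(fn, 5)	/* one level earlier */

This is why a built-in module_init() registration and a device_initcall() one are interchangeable, and why switching to fs_initcall() simply moves the same callback from the level-6 (device) group to the level-5 (fs) group.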
Signed-off-by: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/cmdline.c | 2 +- fs/proc/consoles.c | 2 +- fs/proc/cpuinfo.c | 2 +- fs/proc/devices.c | 2 +- fs/proc/interrupts.c | 2 +- fs/proc/kcore.c | 2 +- fs/proc/kmsg.c | 2 +- fs/proc/loadavg.c | 2 +- fs/proc/meminfo.c | 2 +- fs/proc/nommu.c | 2 +- fs/proc/page.c | 2 +- fs/proc/softirqs.c | 2 +- fs/proc/stat.c | 2 +- fs/proc/uptime.c | 2 +- fs/proc/version.c | 2 +- fs/proc/vmcore.c | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 82676e3fcd1..cbd82dff7e8 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void) proc_create("cmdline", 0, NULL, &cmdline_proc_fops); return 0; } -module_init(proc_cmdline_init); +fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 51942d5abce..290ba85cb90 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -109,4 +109,4 @@ static int __init proc_consoles_init(void) proc_create("consoles", 0, NULL, &proc_consoles_operations); return 0; } -module_init(proc_consoles_init); +fs_initcall(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 5a1e539a234..06f4d31e039 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void) proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); return 0; } -module_init(proc_cpuinfo_init); +fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index b14347167c3..50493edc30e 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -67,4 +67,4 @@ static int __init proc_devices_init(void) proc_create("devices", 0, NULL, &proc_devinfo_operations); return 0; } -module_init(proc_devices_init); +fs_initcall(proc_devices_init); diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index 05029c0e2f2..a352d5703b4 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void) proc_create("interrupts", 0, NULL, &proc_interrupts_operations); return 0; } -module_init(proc_interrupts_init); +fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 5ed0e52d6aa..39e6ef32f0b 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -639,4 +639,4 @@ static int __init proc_kcore_init(void) return 0; } -module_init(proc_kcore_init); +fs_initcall(proc_kcore_init); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bdfabdaefdc..05f8dcdb086 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -61,4 +61,4 @@ static int __init proc_kmsg_init(void) proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); return 0; } -module_init(proc_kmsg_init); +fs_initcall(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 1afa4dd4cae..aec66e6c206 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void) proc_create("loadavg", 0, NULL, &loadavg_proc_fops); return 0; } -module_init(proc_loadavg_init); +fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 24270eceddb..136e548d956 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -220,4 +220,4 @@ static int __init proc_meminfo_init(void) proc_create("meminfo", 0, NULL, &meminfo_proc_fops); return 0; } -module_init(proc_meminfo_init); +fs_initcall(proc_meminfo_init); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 
5f9bc8a746c..d4a35746cab 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -131,4 +131,4 @@ static int __init proc_nommu_init(void) return 0; } -module_init(proc_nommu_init); +fs_initcall(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index cab84b6272e..02174a61031 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -219,4 +219,4 @@ static int __init proc_page_init(void) proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); return 0; } -module_init(proc_page_init); +fs_initcall(proc_page_init); diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 62604be9f58..ad8a77f94be 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void) proc_create("softirqs", 0, NULL, &proc_softirqs_operations); return 0; } -module_init(proc_softirqs_init); +fs_initcall(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 1cf86c0e868..6f599c62f0c 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -221,4 +221,4 @@ static int __init proc_stat_init(void) proc_create("stat", 0, NULL, &proc_stat_operations); return 0; } -module_init(proc_stat_init); +fs_initcall(proc_stat_init); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 06189462590..7141b8d0ca9 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -49,4 +49,4 @@ static int __init proc_uptime_init(void) proc_create("uptime", 0, NULL, &uptime_proc_fops); return 0; } -module_init(proc_uptime_init); +fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index 76817a60678..d2154eb6d78 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -31,4 +31,4 @@ static int __init proc_version_init(void) proc_create("version", 0, NULL, &version_proc_fops); return 0; } -module_init(proc_version_init); +fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9100d695988..2ca7ba047f0 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1082,7 +1082,7 @@ static int __init vmcore_init(void) proc_vmcore->size = vmcore_size; return 0; } -module_init(vmcore_init) +fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) -- cgit v1.2.3-70-g09d2 From 83f62a2eacb1d6945c78523f20e0c34b5d94913c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:49 -0800 Subject: exec:check_unsafe_exec: use while_each_thread() rather than next_thread() next_thread() should be avoided, change check_unsafe_exec() to use while_each_thread(). Nobody except signal->curr_target actually needs next_thread-like code, and we need to change (fix) this interface. This particular code is fine, p == current. But in general the code like this can loop forever if p exits and next_thread(t) can't reach the unhashed thread. This also saves 32 bytes. 
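For reference, the two loop shapes are equivalent here, assuming the classic definition of the macro as a thin wrapper around next_thread():

/* assumed macro shape:
 *	#define while_each_thread(g, t) \
 *		while ((t = next_thread(t)) != g)
 */

/* old: start one thread past p, stop when the walk wraps back to p */
for (t = next_thread(p); t != p; t = next_thread(t)) {
	if (t->fs == p->fs)
		n_fs++;
}

/* new: t is primed to p, the macro advances first and stops back at p */
t = p;
while_each_thread(p, t) {
	if (t->fs == p->fs)
		n_fs++;
}

Both forms visit every thread in p's group except p itself, which is why n_fs starts at 1 in either version.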
Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 9cbad5b0187..81ae6212187 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1243,10 +1243,11 @@ static int check_unsafe_exec(struct linux_binprm *bprm) if (current->no_new_privs) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - for (t = next_thread(p); t != p; t = next_thread(t)) { + while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } -- cgit v1.2.3-70-g09d2 From 9e00cdb091b008cb3c78192651180896de412a63 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:50 -0800 Subject: exec:check_unsafe_exec: kill the dead -EAGAIN and clear_in_exec logic fs_struct->in_exec == T means that this ->fs is used by a single process (thread group), and one of the treads does do_execve(). To avoid the mt-exec races this code has the following complications: 1. check_unsafe_exec() returns -EBUSY if ->in_exec was already set by another thread. 2. do_execve_common() records "clear_in_exec" to ensure that the error path can only clear ->in_exec if it was set by current. However, after 9b1bf12d5d51 "signals: move cred_guard_mutex from task_struct to signal_struct" we do not need these complications: 1. We can't race with our sub-thread, this is called under per-process ->cred_guard_mutex. And we can't race with another CLONE_FS task, we already checked that this fs is not shared. We can remove the dead -EAGAIN logic. 2. "out_unmark:" in do_execve_common() is either called under ->cred_guard_mutex, or after de_thread() which kills other threads, so we can't race with sub-thread which could set ->in_exec. And if ->fs is shared with another process ->in_exec should be false anyway. We can clear in_exec unconditionally. This also means that check_unsafe_exec() can be void. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 81ae6212187..389fe7b0ba1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1223,11 +1223,10 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ -static int check_unsafe_exec(struct linux_binprm *bprm) +static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; - int res = 0; if (p->ptrace) { if (p->ptrace & PT_PTRACE_CAP) @@ -1253,22 +1252,15 @@ static int check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); - if (p->fs->users > n_fs) { + if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; - } else { - res = -EAGAIN; - if (!p->fs->in_exec) { - p->fs->in_exec = 1; - res = 1; - } - } + else + p->fs->in_exec = 1; spin_unlock(&p->fs->lock); - - return res; } -/* - * Fill the binprm structure from the inode. +/* + * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes * * This may be called multiple times for binary chains (scripts for example). 
@@ -1453,7 +1445,6 @@ static int do_execve_common(const char *filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; - bool clear_in_exec; int retval; /* @@ -1485,10 +1476,7 @@ static int do_execve_common(const char *filename, if (retval) goto out_free; - retval = check_unsafe_exec(bprm); - if (retval < 0) - goto out_free; - clear_in_exec = retval; + check_unsafe_exec(bprm); current->in_execve = 1; file = open_exec(filename); @@ -1558,8 +1546,7 @@ out_file: } out_unmark: - if (clear_in_exec) - current->fs->in_exec = 0; + current->fs->in_exec = 0; current->in_execve = 0; out_free: -- cgit v1.2.3-70-g09d2 From 63e46b95e9eae1161832bf45cb40bbad37bfb182 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:51 -0800 Subject: exec: move the final allow_write_access/fput into free_bprm() Both success/failure paths cleanup bprm->file, we can move this code into free_bprm() to simlify and cleanup this logic. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 389fe7b0ba1..f860866e04b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1138,9 +1138,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; - flush_signal_handlers(current, 0); do_close_on_exec(current->files); } @@ -1172,6 +1170,10 @@ void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); @@ -1424,12 +1426,6 @@ static int exec_binprm(struct linux_binprm *bprm) ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; proc_exec_connector(current); - - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; /* to catch use-after-free */ - } } return ret; @@ -1492,7 +1488,7 @@ static int do_execve_common(const char *filename, retval = bprm_mm_init(bprm); if (retval) - goto out_file; + goto out_unmark; bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) @@ -1539,12 +1535,6 @@ out: mmput(bprm->mm); } -out_file: - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } - out_unmark: current->fs->in_exec = 0; current->in_execve = 0; -- cgit v1.2.3-70-g09d2 From 98611e4e6a2b4a03fd2d4750cce8e4455a995c8d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:52 -0800 Subject: exec: kill task_struct->did_exec We can kill either task->did_exec or PF_FORKNOEXEC, they are mutually exclusive. The patch kills ->did_exec because it has a single user. 
Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - include/linux/sched.h | 1 - kernel/fork.c | 1 - kernel/sys.c | 5 ++--- 4 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index f860866e04b..493b102a27c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1424,7 +1424,6 @@ static int exec_binprm(struct linux_binprm *bprm) audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - current->did_exec = 1; proc_exec_connector(current); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 66a17ad55bc..68a0e84463a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1239,7 +1239,6 @@ struct task_struct { /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; - unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ unsigned in_iowait:1; diff --git a/kernel/fork.c b/kernel/fork.c index b6dd0bbf424..a17621c6cd4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1226,7 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); diff --git a/kernel/sys.c b/kernel/sys.c index c72311324ea..ecd3ea12f72 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -895,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 + * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { @@ -932,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; - if (p->did_exec) + if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; -- cgit v1.2.3-70-g09d2 From 185ee40ee7fd1ecfc6575e8cefa2331218d1eca2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jan 2014 15:55:53 -0800 Subject: fs/proc/array.c: change do_task_stat() to use while_each_thread() Change the remaining next_thread (ab)users to use while_each_thread(). The last user which should be changed is next_tid(), but we can't do this now. __exit_signal() and complete_signal() are fine, they actually need next_thread() logic. This patch (of 3): do_task_stat() can use while_each_thread(), no changes in the compiled code. 
Signed-off-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Al Viro Cc: Kees Cook Reviewed-by: Sameer Nanda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 554a0b229ac..656e401794d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -444,8 +444,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += t->min_flt; maj_flt += t->maj_flt; gtime += task_gtime(t); - t = next_thread(t); - } while (t != task); + } while_each_thread(task, t); min_flt += sig->min_flt; maj_flt += sig->maj_flt; -- cgit v1.2.3-70-g09d2 From b88fae644e5e3922251a4b242f435f5e3b49c381 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 23 Jan 2014 15:55:57 -0800 Subject: exec: avoid propagating PF_NO_SETAFFINITY into userspace child A userspace process doesn't want PF_NO_SETAFFINITY, but its parent may be a kernel worker thread which has PF_NO_SETAFFINITY set, and this worker thread can do kernel_thread() to create the child. Clear this flag in the userspace child to enable its migrating capability. Signed-off-by: Zhang Yi Acked-by: Oleg Nesterov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 493b102a27c..44218a7267b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1087,8 +1087,8 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ set_fs(USER_DS); - current->flags &= - ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE); + current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); current->personality &= ~bprm->per_clear; -- cgit v1.2.3-70-g09d2 From 3b96d7db3b6dc99d207bca50037274d22e48dea5 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 23 Jan 2014 15:55:58 -0800 Subject: fs/exec.c: call arch_pick_mmap_layout() only once Currently both setup_new_exec() and flush_old_exec() issue a call to arch_pick_mmap_layout(). As setup_new_exec() and flush_old_exec() are always called pairwise, arch_pick_mmap_layout() is called twice. This patch removes one call from setup_new_exec() to have it only called once.
Signed-off-by: Richard Weinberger Tested-by: Pat Erley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 44218a7267b..e1529b4c79b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -842,7 +842,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); -- cgit v1.2.3-70-g09d2 From bb25e49ff8ab0ef0b3c073c09d55cf10ef8a2aa0 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 Jan 2014 15:56:08 -0800 Subject: fs/ubifs: use rbtree postorder iteration helper instead of opencoding Use rbtree_postorder_for_each_entry_safe() to destroy the rbtree instead of opencoding an alternate postorder iteration that modifies the tree Signed-off-by: Cody P Schafer Cc: Michel Lespinasse Cc: Jan Kara Cc: Artem Bityutskiy Cc: Adrian Hunter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ubifs/debug.c | 22 +++------------------- fs/ubifs/log.c | 21 ++------------------- fs/ubifs/orphan.c | 21 ++------------------- fs/ubifs/recovery.c | 21 +++------------------ fs/ubifs/super.c | 24 ++++-------------------- fs/ubifs/tnc.c | 22 +++------------------- 6 files changed, 17 insertions(+), 114 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index cc1febd8fad..5157b866a85 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2118,26 +2118,10 @@ out_free: */ static void free_inodes(struct fsck_data *fsckd) { - struct rb_node *this = fsckd->inodes.rb_node; - struct fsck_inode *fscki; + struct fsck_inode *fscki, *n; - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - fscki = rb_entry(this, struct fsck_inode, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &fscki->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(fscki); - } - } + rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb) + kfree(fscki); } /** diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36bd4efd081..a902c5919e4 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -574,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum) */ static void destroy_done_tree(struct rb_root *done_tree) { - struct rb_node *this = done_tree->rb_node; - struct done_ref *dr; + struct done_ref *dr, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - dr = rb_entry(this, struct done_ref, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &dr->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb) kfree(dr); - } } /** diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index ba32da3fe08..f1c3e5a1b31 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -815,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) static void dbg_free_check_tree(struct rb_root *root) { - struct rb_node *this = root->rb_node; - struct check_orphan *o; + struct check_orphan *o, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - o = rb_entry(this, struct check_orphan, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left 
== &o->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(o, n, root, rb) kfree(o); - } } static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 065096e36ed..c14adb2f420 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -1335,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum) */ void ubifs_destroy_size_tree(struct ubifs_info *c) { - struct rb_node *this = c->size_tree.rb_node; - struct size_entry *e; + struct size_entry *e, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - e = rb_entry(this, struct size_entry, rb); + rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) { if (e->inode) iput(e->inode); - this = rb_parent(this); - if (this) { - if (this->rb_left == &e->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } kfree(e); } + c->size_tree = RB_ROOT; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f69daa514a5..5ded8490c0c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -873,26 +873,10 @@ static void free_orphans(struct ubifs_info *c) */ static void free_buds(struct ubifs_info *c) { - struct rb_node *this = c->buds.rb_node; - struct ubifs_bud *bud; - - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - bud = rb_entry(this, struct ubifs_bud, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &bud->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(bud); - } - } + struct ubifs_bud *bud, *n; + + rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) + kfree(bud); } /** diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 349f31a30f4..9083bc7ed4a 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -178,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c, */ void destroy_old_idx(struct ubifs_info *c) { - struct rb_node *this = c->old_idx.rb_node; - struct ubifs_old_idx *old_idx; + struct ubifs_old_idx *old_idx, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - old_idx = rb_entry(this, struct ubifs_old_idx, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &old_idx->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb) kfree(old_idx); - } + c->old_idx = RB_ROOT; } -- cgit v1.2.3-70-g09d2 From d1866bd06101eb8ab2bb9d180b47c052c04b7cee Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 Jan 2014 15:56:10 -0800 Subject: fs/ext4: use rbtree postorder iteration helper instead of opencoding Use rbtree_postorder_for_each_entry_safe() to destroy the rbtree instead of opencoding an alternate postorder iteration that modifies the tree Signed-off-by: Cody P Schafer Reviewed-by: Jan Kara Cc: Michel Lespinasse Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/block_validity.c | 33 ++++----------------------------- fs/ext4/dir.c | 35 +++++------------------------------ 2 files changed, 9 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3f11656bd72..41eb9dcfac7 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -180,37 +180,12 @@ int 
ext4_setup_system_zone(struct super_block *sb) /* Called when the filesystem is unmounted */ void ext4_release_system_zone(struct super_block *sb) { - struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; - struct rb_node *parent; - struct ext4_system_zone *entry; + struct ext4_system_zone *entry, *n; - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - entry = rb_entry(n, struct ext4_system_zone, node); + rbtree_postorder_for_each_entry_safe(entry, n, + &EXT4_SB(sb)->system_blks, node) kmem_cache_free(ext4_system_zone_cachep, entry); - if (!parent) - EXT4_SB(sb)->system_blks = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + EXT4_SB(sb)->system_blks = RB_ROOT; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 680bb338891..d638c57e996 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -353,41 +353,16 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + + *root = RB_ROOT; } -- cgit v1.2.3-70-g09d2 From e8bbeeb755a077cfc0f814b07739f9225642d65c Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 Jan 2014 15:56:11 -0800 Subject: fs/jffs2: use rbtree postorder iteration helper instead of opencoding Use rbtree_postorder_for_each_entry_safe() to destroy the rbtree instead of opencoding an alternate postorder iteration that modifies the tree Signed-off-by: Cody P Schafer Cc: Michel Lespinasse Cc: Jan Kara Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/nodelist.c | 28 ++-------------------------- fs/jffs2/readinode.c | 26 +++----------------------- 2 files changed, 5 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 975a1f562c1..9a5449bc3af 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -564,25 +564,10 @@ struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_ they're killed. 
*/ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) { - struct jffs2_node_frag *frag; - struct jffs2_node_frag *parent; - - if (!root->rb_node) - return; + struct jffs2_node_frag *frag, *next; dbg_fragtree("killing\n"); - - frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb)); - while(frag) { - if (frag->rb.rb_left) { - frag = frag_left(frag); - continue; - } - if (frag->rb.rb_right) { - frag = frag_right(frag); - continue; - } - + rbtree_postorder_for_each_entry_safe(frag, next, root, rb) { if (frag->node && !(--frag->node->frags)) { /* Not a hole, and it's the final remaining frag of this node. Free the node */ @@ -591,17 +576,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) jffs2_free_full_dnode(frag->node); } - parent = frag_parent(frag); - if (parent) { - if (frag_left(parent) == frag) - parent->rb.rb_left = NULL; - else - parent->rb.rb_right = NULL; - } jffs2_free_node_frag(frag); - frag = parent; - cond_resched(); } } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index ae81b01e6fd..386303dca38 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -543,33 +543,13 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) { - struct rb_node *this; - struct jffs2_tmp_dnode_info *tn; - - this = list->rb_node; + struct jffs2_tmp_dnode_info *tn, *next; - /* Now at bottom of tree */ - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb); + rbtree_postorder_for_each_entry_safe(tn, next, list, rb) { jffs2_free_full_dnode(tn->fn); jffs2_free_tmp_dnode_info(tn); - - this = rb_parent(this); - if (!this) - break; - - if (this->rb_left == &tn->rb) - this->rb_left = NULL; - else if (this->rb_right == &tn->rb) - this->rb_right = NULL; - else BUG(); - } } + *list = RB_ROOT; } -- cgit v1.2.3-70-g09d2 From b1c8047c6b474c639d923122ab84732cbfeb7225 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Thu, 23 Jan 2014 15:56:12 -0800 Subject: fs/ext3: use rbtree postorder iteration helper instead of opencoding Use rbtree_postorder_for_each_entry_safe() to destroy the rbtree instead of opencoding an alternate postorder iteration that modifies the tree Signed-off-by: Cody P Schafer Cc: Michel Lespinasse Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/dir.c | 36 +++++------------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index bafdd48eefd..a331ad1c23f 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -309,43 +309,17 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. 
- */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname * old = fname; fname = fname->next; kfree (old); } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } -} + *root = RB_ROOT; +} static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, loff_t pos) -- cgit v1.2.3-70-g09d2 From ed8f68669a27287a3b15882e8d88ebccae75ec59 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Jan 2014 15:56:13 -0800 Subject: fs-ext3-use-rbtree-postorder-iteration-helper-instead-of-opencoding-fix use do{}while - more efficient and it squishes a coccinelle warning Reported-by: Fengguang Wu Cc: Cody P Schafer Cc: Jan Kara Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index a331ad1c23f..e66e4808719 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -312,11 +312,11 @@ static void free_rb_tree_fname(struct rb_root *root) struct fname *fname, *next; rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) - while (fname) { - struct fname * old = fname; + do { + struct fname *old = fname; fname = fname->next; - kfree (old); - } + kfree(old); + } while (fname); *root = RB_ROOT; } -- cgit v1.2.3-70-g09d2 From 949b9c3d4263c9b7c2448588afce37becd58e1ad Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 23 Jan 2014 15:56:15 -0800 Subject: userns: relax the posix_acl_valid() checks So far, POSIX ACLs are using a canonical representation that keeps all ACL entries in a strict order; the ACL_USER and ACL_GROUP entries for specific users and groups are ordered by user and group identifier, respectively. The user-space code provides ACL entries in this order; the kernel verifies that the ACL entry order is correct in posix_acl_valid(). User namespaces allow to arbitrary map user and group identifiers which can cause the ACL_USER and ACL_GROUP entry order to differ between user space and the kernel; posix_acl_valid() would then fail. Work around this by allowing ACL_USER and ACL_GROUP entries to be in any order in the kernel. The effect is only minor: file permission checks will pick the first matching ACL_USER entry, and check all matching ACL_GROUP entries. (The libacl user-space library and getfacl / setfacl tools will not create ACLs with duplicate user or group idenfifiers; they will handle ACLs with entries in an arbitrary order correctly.) Signed-off-by: Andreas Gruenbacher Cc: Eric W. 
Biederman Cc: Theodore Tso Cc: Christoph Hellwig Cc: Andreas Dilger Cc: Jan Kara Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/posix_acl.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 021e7c069b8..551e61ba15b 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -149,8 +149,6 @@ posix_acl_valid(const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; - kuid_t prev_uid = INVALID_UID; - kgid_t prev_gid = INVALID_GID; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -169,10 +167,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!uid_valid(pa->e_uid)) return -EINVAL; - if (uid_valid(prev_uid) && - uid_lte(pa->e_uid, prev_uid)) - return -EINVAL; - prev_uid = pa->e_uid; needs_mask = 1; break; @@ -188,10 +182,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!gid_valid(pa->e_gid)) return -EINVAL; - if (gid_valid(prev_gid) && - gid_lte(pa->e_gid, prev_gid)) - return -EINVAL; - prev_gid = pa->e_gid; needs_mask = 1; break; -- cgit v1.2.3-70-g09d2 From 40e2c71d57565a82970a5a2b75f7eb67bb3252f4 Mon Sep 17 00:00:00 2001 From: Rui Xiang Date: Thu, 23 Jan 2014 15:56:19 -0800 Subject: romfs: fix return err while getting inode in fill_super Getting an inode by romfs_iget may lead to an err in fill_super, and the err value should be returned. It should return -ENOMEM instead when d_make_root fails; fix that too. Signed-off-by: Rui Xiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/romfs/super.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/romfs/super.c b/fs/romfs/super.c index ff1d3d42e72..d8418782862 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -533,16 +533,14 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) root = romfs_iget(sb, pos); if (IS_ERR(root)) - goto error; + return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) - goto error; + return -ENOMEM; return 0; -error: - return -EINVAL; error_rsb_inval: ret = -EINVAL; error_rsb: -- cgit v1.2.3-70-g09d2 From f0bc9985fe8bf4377d5557cd7957d9be43ec8861 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 21 Jan 2014 16:44:57 -0600 Subject: xfs: clean up xfs_buftarg Clean up the xfs_buftarg structure a bit: - remove bt_bsize which is never used - replace bt_sshift with bt_ssize; we only ever shift it back Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 5 ++--- fs/xfs/xfs_buf.h | 3 +-- fs/xfs/xfs_ioctl.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9fccfb59429..b664bce57bf 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -445,7 +445,7 @@ _xfs_buf_find( numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < (1 << btp->bt_sshift))); + ASSERT(!(numbytes < btp->bt_ssize)); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); /* @@ -1599,8 +1599,7 @@ xfs_setsize_buftarg( unsigned int blocksize, unsigned int sectorsize) { - btp->bt_bsize = blocksize; - btp->bt_sshift = ffs(sectorsize) - 1; + btp->bt_ssize = sectorsize; btp->bt_smask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1cf21a4a9f2..4ef949aebb0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h
@@ -93,8 +93,7 @@ typedef struct xfs_buftarg { struct block_device *bt_bdev; struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; - unsigned int bt_bsize; - unsigned int bt_sshift; + unsigned int bt_ssize; size_t bt_smask; /* LRU control structures */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 518aa56b8f2..584e092415d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1583,7 +1583,7 @@ xfs_file_ioctl( XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_mem = da.d_miniosz = target->bt_ssize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) -- cgit v1.2.3-70-g09d2 From 6da54179b3f1bb6a302fd5f3b38fae32ee463ed1 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 21 Jan 2014 16:45:52 -0600 Subject: xfs: rename xfs_buftarg structure members In preparation for adding new members to the structure, give these old ones more descriptive names: bt_ssize -> bt_meta_sectorsize bt_smask -> bt_meta_sectormask Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 8 ++++---- fs/xfs/xfs_buf.h | 4 ++-- fs/xfs/xfs_file.c | 4 ++-- fs/xfs/xfs_ioctl.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b664bce57bf..a526f8d2dc6 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -445,8 +445,8 @@ _xfs_buf_find( numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < btp->bt_ssize)); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we @@ -1599,8 +1599,8 @@ xfs_setsize_buftarg( unsigned int blocksize, unsigned int sectorsize) { - btp->bt_ssize = sectorsize; - btp->bt_smask = sectorsize - 1; + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { char name[BDEVNAME_SIZE]; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4ef949aebb0..d5d88dda4d3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -93,8 +93,8 @@ typedef struct xfs_buftarg { struct block_device *bt_bdev; struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; - unsigned int bt_ssize; - size_t bt_smask; + unsigned int bt_meta_sectorsize; + size_t bt_meta_sectormask; /* LRU control structures */ struct shrinker bt_shrinker; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e0012159263..d01745f748a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -261,7 +261,7 @@ xfs_file_aio_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (size & target->bt_smask)) { + if ((pos | size) & target->bt_meta_sectormask) { if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); @@ -641,7 +641,7 @@ xfs_file_dio_aio_write( struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 
mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (count & target->bt_smask)) + if ((pos | count) & target->bt_meta_sectormask) return -XFS_ERROR(EINVAL); if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 584e092415d..3dc60ed9572 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1583,7 +1583,7 @@ xfs_file_ioctl( XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = target->bt_ssize; + da.d_mem = da.d_miniosz = target->bt_meta_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) -- cgit v1.2.3-70-g09d2 From 7c71ee78031c248dca13fc94dea9a4cc217db6cf Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 21 Jan 2014 16:46:23 -0600 Subject: xfs: allow logical-sector sized O_DIRECT Some time ago, mkfs.xfs started picking the storage physical sector size as the default filesystem "sector size" in order to avoid RMW costs incurred by doing IOs at logical sector size alignments. However, this means that for a filesystem made with i.e. a 4k sector size on an "advanced format" 4k/512 disk, 512-byte direct IOs are no longer allowed. This means that XFS has essentially turned this AF drive into a hard 4K device, from the filesystem on up. XFS's mkfs-specified "sector size" is really just controlling the minimum size & alignment of filesystem metadata. There is no real need to tightly couple XFS's minimal metadata size to the minimum allowed direct IO size; XFS can continue doing metadata in optimal sizes, but still allow smaller DIOs for apps which issue them, for whatever reason. This patch adds a new field to the xfs_buftarg, so that we now track 2 sizes: 1) The metadata sector size, which is the minimum unit and alignment of IO which will be performed by metadata operations. 2) The device logical sector size The first is used internally by the file system for metadata alignment and IOs. The second is used for the minimum allowed direct IO alignment. This has passed xfstests on filesystems made with 4k sectors, including when run under the patch I sent to ignore XFS_IOC_DIOINFO, and issue 512 DIOs anyway. I also directly tested end of block behavior on preallocated, sparse, and existing files when we do a 512 IO into a 4k file on a 4k-sector filesystem, to be sure there were no unexpected behaviors. 
Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 5 +++++ fs/xfs/xfs_buf.h | 15 +++++++++++++++ fs/xfs/xfs_file.c | 7 +++++-- fs/xfs/xfs_ioctl.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a526f8d2dc6..51757113a82 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1599,6 +1599,7 @@ xfs_setsize_buftarg( unsigned int blocksize, unsigned int sectorsize) { + /* Set up metadata sector size info */ btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; @@ -1613,6 +1614,10 @@ xfs_setsize_buftarg( return EINVAL; } + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + return 0; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d5d88dda4d3..995339534db 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -88,6 +88,19 @@ typedef unsigned int xfs_buf_flags_t; */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +/* + * The xfs_buftarg contains 2 notions of "sector size" - + * + * 1) The metadata sector size, which is the minimum unit and + * alignment of IO which will be performed by metadata operations. + * 2) The device logical sector size + * + * The first is specified at mkfs time, and is stored on-disk in the + * superblock's sb_sectsize. + * + * The latter is derived from the underlying device, and controls direct IO + * alignment constraints. + */ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; @@ -95,6 +108,8 @@ typedef struct xfs_buftarg { struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; + size_t bt_logical_sectorsize; + size_t bt_logical_sectormask; /* LRU control structures */ struct shrinker bt_shrinker; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d01745f748a..2e7989e3a2d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -261,7 +261,8 @@ xfs_file_aio_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos | size) & target->bt_meta_sectormask) { + /* DIO must be aligned to device logical sector size */ + if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); @@ -641,9 +642,11 @@ xfs_file_dio_aio_write( struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos | count) & target->bt_meta_sectormask) + /* DIO must be aligned to device logical sector size */ + if ((pos | count) & target->bt_logical_sectormask) return -XFS_ERROR(EINVAL); + /* "unaligned" here means not aligned to a filesystem block */ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3dc60ed9572..bcfe6120211 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1583,7 +1583,7 @@ xfs_file_ioctl( XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = target->bt_meta_sectorsize; + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) -- cgit v1.2.3-70-g09d2 From d50e61361c68a05a9cd7d54617522f99f278ac8a Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Wed, 15 Jan 2014 12:21:12 -0500 Subject: nfsd4: decrease nfsd4_encode_fattr stack usage A struct svc_fh is 320 bytes on x86_64, it'd be better not to have these on the stack. kmalloc'ing them probably isn't ideal either, but this is the simplest thing to do. If it turns out to be a problem in the readdir case then we could add a svc_fh to nfsd4_readdir and pass that in. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8198ecf3c03..63f2395c57e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2058,7 +2058,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, u32 bmval1 = bmval[1]; u32 bmval2 = bmval[2]; struct kstat stat; - struct svc_fh tempfh; + struct svc_fh *tempfh = NULL; struct kstatfs statfs; int buflen = count << 2; __be32 *attrlenp; @@ -2105,11 +2105,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_nfserr; } if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { - fh_init(&tempfh, NFS4_FHSIZE); - status = fh_compose(&tempfh, exp, dentry, NULL); + tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + status = nfserr_jukebox; + if (!tempfh) + goto out; + fh_init(tempfh, NFS4_FHSIZE); + status = fh_compose(tempfh, exp, dentry, NULL); if (status) goto out; - fhp = &tempfh; + fhp = tempfh; } if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_SUPPORTED_ATTRS)) { @@ -2495,8 +2499,8 @@ out: security_release_secctx(context, contextlen); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(acl); - if (fhp == &tempfh) - fh_put(&tempfh); + if (tempfh) + fh_put(tempfh); return status; out_nfserr: status = nfserrno(err); -- cgit v1.2.3-70-g09d2 From b22e8fedc19588864a6ba0acefbbed06f05ba713 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Nov 2013 22:51:47 -0500 Subject: ecryptfs: fix failure handling in ->readlink() If ecryptfs_readlink_lower() fails, buf remains an uninitialized pointer and passing it nd_set_link() won't do anything good. Fixed by switching ecryptfs_readlink_lower() to saner API - make it return buf or ERR_PTR(...) and update callers. Signed-off-by: Al Viro --- fs/ecryptfs/inode.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c36c4482447..b167ca48b8e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -659,19 +659,17 @@ out_lock: return rc; } -static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, - size_t *bufsiz) +static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); char *lower_buf; + char *buf; mm_segment_t old_fs; int rc; lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!lower_buf) { - rc = -ENOMEM; - goto out; - } + if (!lower_buf) + return ERR_PTR(-ENOMEM); old_fs = get_fs(); set_fs(get_ds()); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, @@ -680,21 +678,18 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, set_fs(old_fs); if (rc < 0) goto out; - rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb, + rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb, lower_buf, rc); out: kfree(lower_buf); - return rc; + return rc ? 
ERR_PTR(rc) : buf; } static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - char *buf; - size_t len = PATH_MAX; - int rc; - - rc = ecryptfs_readlink_lower(dentry, &buf, &len); - if (rc) + size_t len; + char *buf = ecryptfs_readlink_lower(dentry, &len); + if (IS_ERR(buf)) goto out; fsstack_copy_attr_atime(dentry->d_inode, ecryptfs_dentry_to_lower(dentry)->d_inode); @@ -1003,10 +998,12 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, char *target; size_t targetsiz; - rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); - if (!rc) { + target = ecryptfs_readlink_lower(dentry, &targetsiz); + if (!IS_ERR(target)) { kfree(target); stat->size = targetsiz; + } else { + rc = PTR_ERR(target); } } return rc; -- cgit v1.2.3-70-g09d2 From 96c8c442117859cd95b5b57836ff374ff43f0564 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Nov 2013 22:54:55 -0500 Subject: xfs: switch to kfree_put_link() don't bother open-coding it... Signed-off-by: Al Viro --- fs/xfs/xfs_iops.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 104455b8046..a3dad17b135 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -391,18 +391,6 @@ xfs_vn_follow_link( return NULL; } -STATIC void -xfs_vn_put_link( - struct dentry *dentry, - struct nameidata *nd, - void *p) -{ - char *s = nd_get_link(nd); - - if (!IS_ERR(s)) - kfree(s); -} - STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -1118,7 +1106,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, - .put_link = xfs_vn_put_link, + .put_link = kfree_put_link, .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, -- cgit v1.2.3-70-g09d2 From 842a859db26b707f06fc9fbbb9137a9b90910e49 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Dec 2013 14:43:54 -0500 Subject: affs: use ->kill_sb() to simplify ->put_super() and failure exits of ->mount() ... and return saner errors from ->mount(), while we are at it Signed-off-by: Al Viro --- fs/affs/super.c | 57 +++++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/affs/super.c b/fs/affs/super.c index 45161a832bb..d098731b82f 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -49,11 +49,6 @@ affs_put_super(struct super_block *sb) pr_debug("AFFS: put_super()\n"); cancel_delayed_work_sync(&sbi->sb_work); - kfree(sbi->s_prefix); - affs_free_bitmap(sb); - affs_brelse(sbi->s_root_bh); - kfree(sbi); - sb->s_fs_info = NULL; } static int @@ -316,7 +311,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ u8 sig[4]; - int ret = -EINVAL; + int ret; save_mount_options(sb, data); @@ -412,17 +407,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) if (!silent) printk(KERN_ERR "AFFS: No valid root block on device %s\n", sb->s_id); - goto out_error; + return -EINVAL; /* N.B. 
after this point bh must be released */ got_root: + /* Keep super block in cache */ + sbi->s_root_bh = root_bh; root_block = sbi->s_root_block; /* Find out which kind of FS we have */ boot_bh = sb_bread(sb, 0); if (!boot_bh) { printk(KERN_ERR "AFFS: Cannot read boot block\n"); - goto out_error; + return -EINVAL; } memcpy(sig, boot_bh->b_data, 4); brelse(boot_bh); @@ -471,7 +468,7 @@ got_root: default: printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n", sb->s_id, chksum); - goto out_error; + return -EINVAL; } if (mount_flags & SF_VERBOSE) { @@ -488,22 +485,17 @@ got_root: if (sbi->s_flags & SF_OFS) sbi->s_data_blksize -= 24; - /* Keep super block in cache */ - sbi->s_root_bh = root_bh; - /* N.B. after this point s_root_bh must be released */ - tmp_flags = sb->s_flags; - if (affs_init_bitmap(sb, &tmp_flags)) - goto out_error; + ret = affs_init_bitmap(sb, &tmp_flags); + if (ret) + return ret; sb->s_flags = tmp_flags; /* set up enough so that it can read an inode */ root_inode = affs_iget(sb, root_block); - if (IS_ERR(root_inode)) { - ret = PTR_ERR(root_inode); - goto out_error; - } + if (IS_ERR(root_inode)) + return PTR_ERR(root_inode); if (AFFS_SB(sb)->s_flags & SF_INTL) sb->s_d_op = &affs_intl_dentry_operations; @@ -513,22 +505,11 @@ got_root: sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); - goto out_error; + return -ENOMEM; } pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); return 0; - - /* - * Begin the cascaded cleanup ... - */ -out_error: - kfree(sbi->s_bitmap); - affs_brelse(root_bh); - kfree(sbi->s_prefix); - kfree(sbi); - sb->s_fs_info = NULL; - return ret; } static int @@ -615,11 +596,23 @@ static struct dentry *affs_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); } +static void affs_kill_sb(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + kill_block_super(sb); + if (sbi) { + affs_free_bitmap(sb); + affs_brelse(sbi->s_root_bh); + kfree(sbi->s_prefix); + kfree(sbi); + } +} + static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", .mount = affs_mount, - .kill_sb = kill_block_super, + .kill_sb = affs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("affs"); -- cgit v1.2.3-70-g09d2 From 2309fb8ef40e82c4175100c37eb3d9db9e572ca5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Dec 2013 16:35:14 -0500 Subject: cramfs: get rid of ->put_super() failure exits are simpler that way Signed-off-by: Al Viro --- fs/cramfs/inode.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e501ac3a49f..508a7524fe3 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -219,10 +219,11 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i return read_buffers[buffer] + offset; } -static void cramfs_put_super(struct super_block *sb) +static void cramfs_kill_sb(struct super_block *sb) { - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; + struct cramfs_sb_info *sbi = sb->s_fs_info; + kill_block_super(sb); + kfree(sbi); } static int cramfs_remount(struct super_block *sb, int *flags, char *data) @@ -261,7 +262,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) if (super.magic == CRAMFS_MAGIC_WEND) { if (!silent) printk(KERN_ERR "cramfs: wrong endianness\n"); - goto out; + return -EINVAL; } /* check at 512 byte offset */ @@ -273,20 +274,20 @@ static int 
cramfs_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "cramfs: wrong endianness\n"); else if (!silent) printk(KERN_ERR "cramfs: wrong magic\n"); - goto out; + return -EINVAL; } } /* get feature flags first */ if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { printk(KERN_ERR "cramfs: unsupported filesystem features\n"); - goto out; + return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super.root.mode)) { printk(KERN_ERR "cramfs: root is not a directory\n"); - goto out; + return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); @@ -310,22 +311,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) (root_offset != 512 + sizeof(struct cramfs_super)))) { printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); - goto out; + return -EINVAL; } /* Set it all up.. */ sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, &super.root, 0); if (IS_ERR(root)) - goto out; + return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) - goto out; + return -ENOMEM; return 0; -out: - kfree(sbi); - sb->s_fs_info = NULL; - return -EINVAL; } static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -550,7 +547,6 @@ static const struct inode_operations cramfs_dir_inode_operations = { }; static const struct super_operations cramfs_ops = { - .put_super = cramfs_put_super, .remount_fs = cramfs_remount, .statfs = cramfs_statfs, }; @@ -565,7 +561,7 @@ static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", .mount = cramfs_mount, - .kill_sb = kill_block_super, + .kill_sb = cramfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("cramfs"); -- cgit v1.2.3-70-g09d2 From f7f4f4dd6948e3bca0e04e5217c825052ad88f5a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Dec 2013 16:54:28 -0500 Subject: cramfs: take headers to fs/cramfs Signed-off-by: Al Viro --- arch/blackfin/kernel/setup.c | 2 +- arch/cris/arch-v32/drivers/axisflashmap.c | 2 -- fs/cramfs/inode.c | 24 ++++++++++++++++++++---- fs/cramfs/internal.h | 4 ++++ fs/cramfs/uncompress.c | 2 +- include/linux/cramfs_fs.h | 10 ---------- include/linux/cramfs_fs_sb.h | 20 -------------------- init/do_mounts_rd.c | 2 +- 8 files changed, 27 insertions(+), 39 deletions(-) create mode 100644 fs/cramfs/internal.h delete mode 100644 include/linux/cramfs_fs.h delete mode 100644 include/linux/cramfs_fs_sb.h (limited to 'fs') diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c index 39619304212..4f424ae3b36 100644 --- a/arch/blackfin/kernel/setup.c +++ b/arch/blackfin/kernel/setup.c @@ -17,7 +17,7 @@ #ifdef CONFIG_MTD_UCLINUX #include #include -#include +#include #include #endif diff --git a/arch/cris/arch-v32/drivers/axisflashmap.c b/arch/cris/arch-v32/drivers/axisflashmap.c index 1b6ad624720..28dd77144e8 100644 --- a/arch/cris/arch-v32/drivers/axisflashmap.c +++ b/arch/cris/arch-v32/drivers/axisflashmap.c @@ -24,8 +24,6 @@ #include #include -#include - #include #include diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 508a7524fe3..06610cf94d5 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -17,14 +17,30 @@ #include #include #include -#include #include -#include #include #include - +#include #include +#include "internal.h" + +/* + * cramfs super-block data in memory + */ +struct cramfs_sb_info { + unsigned long magic; + unsigned long size; + unsigned long blocks; + unsigned long files; + unsigned 
long flags; +}; + +static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; @@ -221,7 +237,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i static void cramfs_kill_sb(struct super_block *sb) { - struct cramfs_sb_info *sbi = sb->s_fs_info; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); kill_block_super(sb); kfree(sbi); } diff --git a/fs/cramfs/internal.h b/fs/cramfs/internal.h new file mode 100644 index 00000000000..349d7127215 --- /dev/null +++ b/fs/cramfs/internal.h @@ -0,0 +1,4 @@ +/* Uncompression interfaces to the underlying zlib */ +int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen); +int cramfs_uncompress_init(void); +void cramfs_uncompress_exit(void); diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index 023329800d2..1760c1b84d9 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include "internal.h" static z_stream stream; static int initialized; diff --git a/include/linux/cramfs_fs.h b/include/linux/cramfs_fs.h deleted file mode 100644 index 133789609f2..00000000000 --- a/include/linux/cramfs_fs.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __CRAMFS_H -#define __CRAMFS_H - -#include - -/* Uncompression interfaces to the underlying zlib */ -int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen); -int cramfs_uncompress_init(void); -void cramfs_uncompress_exit(void); -#endif diff --git a/include/linux/cramfs_fs_sb.h b/include/linux/cramfs_fs_sb.h deleted file mode 100644 index 8390693568f..00000000000 --- a/include/linux/cramfs_fs_sb.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _CRAMFS_FS_SB -#define _CRAMFS_FS_SB - -/* - * cramfs super-block data in memory - */ -struct cramfs_sb_info { - unsigned long magic; - unsigned long size; - unsigned long blocks; - unsigned long files; - unsigned long flags; -}; - -static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - -#endif diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 7c098ac9068..a8227022e3a 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 5a9ed6f5e7b80c95b133818aa96b1fba8e1216a0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Dec 2013 17:05:05 -0500 Subject: efs: get rid of ->put_super() simplifies failure exits in ->mount()... 
Signed-off-by: Al Viro --- fs/efs/super.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/efs/super.c b/fs/efs/super.c index c6f57a74a55..50215bbd646 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -26,11 +26,18 @@ static struct dentry *efs_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); } +static void efs_kill_sb(struct super_block *s) +{ + struct efs_sb_info *sbi = SUPER_INFO(s); + kill_block_super(s); + kfree(sbi); +} + static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", .mount = efs_mount, - .kill_sb = kill_block_super, + .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("efs"); @@ -105,12 +112,6 @@ static void destroy_inodecache(void) kmem_cache_destroy(efs_inode_cachep); } -static void efs_put_super(struct super_block *s) -{ - kfree(s->s_fs_info); - s->s_fs_info = NULL; -} - static int efs_remount(struct super_block *sb, int *flags, char *data) { *flags |= MS_RDONLY; @@ -120,7 +121,6 @@ static int efs_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .destroy_inode = efs_destroy_inode, - .put_super = efs_put_super, .statfs = efs_statfs, .remount_fs = efs_remount, }; @@ -259,7 +259,6 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; - int ret = -EINVAL; sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) @@ -270,7 +269,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { printk(KERN_ERR "EFS: device does not support %d byte blocks\n", EFS_BLOCKSIZE); - goto out_no_fs_ul; + return -EINVAL; } /* read the vh (volume header) block */ @@ -278,7 +277,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!bh) { printk(KERN_ERR "EFS: cannot read volume header\n"); - goto out_no_fs_ul; + return -EINVAL; } /* @@ -290,13 +289,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) brelse(bh); if (sb->fs_start == -1) { - goto out_no_fs_ul; + return -EINVAL; } bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { printk(KERN_ERR "EFS: cannot read superblock\n"); - goto out_no_fs_ul; + return -EINVAL; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { @@ -304,7 +303,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); #endif brelse(bh); - goto out_no_fs_ul; + return -EINVAL; } brelse(bh); @@ -319,24 +318,16 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { printk(KERN_ERR "EFS: get root inode failed\n"); - ret = PTR_ERR(root); - goto out_no_fs; + return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { printk(KERN_ERR "EFS: get root dentry failed\n"); - ret = -ENOMEM; - goto out_no_fs; + return -ENOMEM; } return 0; - -out_no_fs_ul: -out_no_fs: - s->s_fs_info = NULL; - kfree(sb); - return ret; } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { -- cgit v1.2.3-70-g09d2 From 208adb6403e079ceeb8e731696615d22db6f397b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Dec 2013 19:48:58 -0500 Subject: qnx4: clean qnx4_fill_super() up * pass on-disk superblock to 
qnx4_chkroot() explicitly * don't leave stale (and unused) pointers in qnx4_super_block * free stuff in ->kill_sb(); ->put_super() becomes empty and dies * simplify failure exits Signed-off-by: Al Viro --- fs/qnx4/inode.c | 63 ++++++++++++++++++++------------------------------------- fs/qnx4/qnx4.h | 2 -- 2 files changed, 22 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2e8caa62da7..89558810381 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -27,7 +27,6 @@ static const struct super_operations qnx4_sops; -static void qnx4_put_super(struct super_block *sb); static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_destroy_inode(struct inode *inode); static int qnx4_remount(struct super_block *sb, int *flags, char *data); @@ -37,7 +36,6 @@ static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, .destroy_inode = qnx4_destroy_inode, - .put_super = qnx4_put_super, .statfs = qnx4_statfs, .remount_fs = qnx4_remount, }; @@ -148,18 +146,19 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) * it really _is_ a qnx4 filesystem, and to check the size * of the directory entry. */ -static const char *qnx4_checkroot(struct super_block *sb) +static const char *qnx4_checkroot(struct super_block *sb, + struct qnx4_super_block *s) { struct buffer_head *bh; struct qnx4_inode_entry *rootdir; int rd, rl; int i, j; - if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') + if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0') return "no qnx4 filesystem (no root dir)."; QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); - rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; - rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); + rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1; + rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size); for (j = 0; j < rl; j++) { bh = sb_bread(sb, rd + j); /* root dir, first block */ if (bh == NULL) @@ -189,7 +188,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) struct inode *root; const char *errmsg; struct qnx4_sb_info *qs; - int ret = -EINVAL; qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) @@ -198,67 +196,50 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) sb_set_blocksize(s, QNX4_BLOCK_SIZE); + s->s_op = &qnx4_sops; + s->s_magic = QNX4_SUPER_MAGIC; + s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible if we don't belong here... */ bh = sb_bread(s, 1); if (!bh) { printk(KERN_ERR "qnx4: unable to read the superblock\n"); - goto outnobh; + return -EINVAL; } - if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { - if (!silent) - printk(KERN_ERR "qnx4: wrong fsid in superblock.\n"); - goto out; - } - s->s_op = &qnx4_sops; - s->s_magic = QNX4_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ - qnx4_sb(s)->sb_buf = bh; - qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; - /* check before allocating dentries, inodes, .. */ - errmsg = qnx4_checkroot(s); + errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); + brelse(bh); if (errmsg != NULL) { if (!silent) printk(KERN_ERR "qnx4: %s\n", errmsg); - goto out; + return -EINVAL; } /* does root not have inode number QNX4_ROOT_INO ?? 
*/ root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); if (IS_ERR(root)) { printk(KERN_ERR "qnx4: get inode failed\n"); - ret = PTR_ERR(root); - goto outb; + return PTR_ERR(root); } - ret = -ENOMEM; s->s_root = d_make_root(root); if (s->s_root == NULL) - goto outb; + return -ENOMEM; - brelse(bh); return 0; - - outb: - kfree(qs->BitMap); - out: - brelse(bh); - outnobh: - kfree(qs); - s->s_fs_info = NULL; - return ret; } -static void qnx4_put_super(struct super_block *sb) +static void qnx4_kill_sb(struct super_block *sb) { struct qnx4_sb_info *qs = qnx4_sb(sb); - kfree( qs->BitMap ); - kfree( qs ); - sb->s_fs_info = NULL; - return; + kill_block_super(sb); + if (qs) { + kfree(qs->BitMap); + kfree(qs); + } } static int qnx4_readpage(struct file *file, struct page *page) @@ -409,7 +390,7 @@ static struct file_system_type qnx4_fs_type = { .owner = THIS_MODULE, .name = "qnx4", .mount = qnx4_mount, - .kill_sb = kill_block_super, + .kill_sb = qnx4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("qnx4"); diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 34e2d329c97..c9b1be2c164 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -10,8 +10,6 @@ #endif struct qnx4_sb_info { - struct buffer_head *sb_buf; /* superblock buffer */ - struct qnx4_super_block *sb; /* our superblock */ unsigned int Version; /* may be useful */ struct qnx4_inode_entry *BitMap; /* useful */ }; -- cgit v1.2.3-70-g09d2 From 1c1c8747cd0528fe1d225badf25bf5346d799ea3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 11 Dec 2013 23:07:51 -0500 Subject: btrfs: sanitize BTRFS_IOC_FILE_EXTENT_SAME * don't assume that ->dest_count won't change between copy_from_user() and memdup_user() * use fdget instead of fget * don't bother comparing superblocks when we'd already compared vfsmounts * get rid of excessive goto * use file_inode() instead of open-coding the sucker Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 70 +++++++++++++++++++------------------------------------- 1 file changed, 24 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 21da5762b0b..ad27dcea319 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2686,14 +2686,11 @@ out_unlock: #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) static long btrfs_ioctl_file_extent_same(struct file *file, - void __user *argp) + struct btrfs_ioctl_same_args __user *argp) { - struct btrfs_ioctl_same_args tmp; struct btrfs_ioctl_same_args *same; struct btrfs_ioctl_same_extent_info *info; - struct inode *src = file->f_dentry->d_inode; - struct file *dst_file = NULL; - struct inode *dst; + struct inode *src = file_inode(file); u64 off; u64 len; int i; @@ -2701,6 +2698,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, unsigned long size; u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; bool is_admin = capable(CAP_SYS_ADMIN); + u16 count; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2709,17 +2707,14 @@ static long btrfs_ioctl_file_extent_same(struct file *file, if (ret) return ret; - if (copy_from_user(&tmp, - (struct btrfs_ioctl_same_args __user *)argp, - sizeof(tmp))) { + if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } - size = sizeof(tmp) + - tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info); + size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); - same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size); + same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); @@ -2756,52 +2751,35 @@ static long 
btrfs_ioctl_file_extent_same(struct file *file, goto out; /* pre-format output fields to sane values */ - for (i = 0; i < same->dest_count; i++) { + for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; same->info[i].status = 0; } - ret = 0; - for (i = 0; i < same->dest_count; i++) { - info = &same->info[i]; - - dst_file = fget(info->fd); - if (!dst_file) { + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_file = fdget(info->fd); + if (!dst_file.file) { info->status = -EBADF; - goto next; + continue; } + dst = file_inode(dst_file.file); - if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { info->status = -EINVAL; - goto next; - } - - info->status = -EXDEV; - if (file->f_path.mnt != dst_file->f_path.mnt) - goto next; - - dst = dst_file->f_dentry->d_inode; - if (src->i_sb != dst->i_sb) - goto next; - - if (S_ISDIR(dst->i_mode)) { + } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { info->status = -EISDIR; - goto next; - } - - if (!S_ISREG(dst->i_mode)) { + } else if (!S_ISREG(dst->i_mode)) { info->status = -EACCES; - goto next; + } else { + info->status = btrfs_extent_same(src, off, len, dst, + info->logical_offset); + if (info->status == 0) + info->bytes_deduped += len; } - - info->status = btrfs_extent_same(src, off, len, dst, - info->logical_offset); - if (info->status == 0) - info->bytes_deduped += len; - -next: - if (dst_file) - fput(dst_file); + fdput(dst_file); } ret = copy_to_user(argp, same, size); -- cgit v1.2.3-70-g09d2 From 36a7411724b1caf2fa92b5e4a41576ee8f16769e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 23 Dec 2013 16:51:33 -0500 Subject: eventfd_ctx_fdget(): use fdget() instead of fget() Signed-off-by: Al Viro --- fs/eventfd.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/eventfd.c b/fs/eventfd.c index 35470d9b96e..d6a88e7812f 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -349,15 +349,12 @@ EXPORT_SYMBOL_GPL(eventfd_fget); */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { - struct file *file; struct eventfd_ctx *ctx; - - file = eventfd_fget(fd); - if (IS_ERR(file)) - return (struct eventfd_ctx *) file; - ctx = eventfd_ctx_get(file->private_data); - fput(file); - + struct fd f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + ctx = eventfd_ctx_fileget(f.file); + fdput(f); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); -- cgit v1.2.3-70-g09d2 From 479e64c21038326f4fe429b4ffb7ea6d3175c2dc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Nov 2013 01:45:04 -0500 Subject: nls: have register_nls() set ->owner pass owner explicitly to __register_nls(), make register_nls() a macro passing THIS_MODULE as the owner argument to __register_nls(). 
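For illustration, a minimal sketch of the effect on one charset module (the charset and function names are placeholders, not taken from the patch): the static table drops its .owner initializer, and the registration call is unchanged because register_nls() now expands to __register_nls(..., THIS_MODULE) in the registering module.

static struct nls_table table = {
        .charset        = "example",            /* placeholder charset name */
        .uni2char       = uni2char,
        .char2uni       = char2uni,
        /* no .owner = THIS_MODULE needed any more: the register_nls()
         * macro supplies the owner from the module that calls it */
};

static int __init init_nls_example(void)
{
        return register_nls(&table);    /* == __register_nls(&table, THIS_MODULE) */
}
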
Signed-off-by: Al Viro --- fs/nls/mac-celtic.c | 1 - fs/nls/mac-centeuro.c | 1 - fs/nls/mac-croatian.c | 1 - fs/nls/mac-cyrillic.c | 1 - fs/nls/mac-gaelic.c | 1 - fs/nls/mac-greek.c | 1 - fs/nls/mac-iceland.c | 1 - fs/nls/mac-inuit.c | 1 - fs/nls/mac-roman.c | 1 - fs/nls/mac-romanian.c | 1 - fs/nls/mac-turkish.c | 1 - fs/nls/nls_ascii.c | 1 - fs/nls/nls_base.c | 5 +++-- fs/nls/nls_cp1250.c | 1 - fs/nls/nls_cp1251.c | 1 - fs/nls/nls_cp1255.c | 1 - fs/nls/nls_cp437.c | 1 - fs/nls/nls_cp737.c | 1 - fs/nls/nls_cp775.c | 1 - fs/nls/nls_cp850.c | 1 - fs/nls/nls_cp852.c | 1 - fs/nls/nls_cp855.c | 1 - fs/nls/nls_cp857.c | 1 - fs/nls/nls_cp860.c | 1 - fs/nls/nls_cp861.c | 1 - fs/nls/nls_cp862.c | 1 - fs/nls/nls_cp863.c | 1 - fs/nls/nls_cp864.c | 1 - fs/nls/nls_cp865.c | 1 - fs/nls/nls_cp866.c | 1 - fs/nls/nls_cp869.c | 1 - fs/nls/nls_cp874.c | 1 - fs/nls/nls_cp932.c | 1 - fs/nls/nls_cp936.c | 1 - fs/nls/nls_cp949.c | 1 - fs/nls/nls_cp950.c | 1 - fs/nls/nls_euc-jp.c | 1 - fs/nls/nls_iso8859-1.c | 1 - fs/nls/nls_iso8859-13.c | 1 - fs/nls/nls_iso8859-14.c | 1 - fs/nls/nls_iso8859-15.c | 1 - fs/nls/nls_iso8859-2.c | 1 - fs/nls/nls_iso8859-3.c | 1 - fs/nls/nls_iso8859-4.c | 1 - fs/nls/nls_iso8859-5.c | 1 - fs/nls/nls_iso8859-6.c | 1 - fs/nls/nls_iso8859-7.c | 1 - fs/nls/nls_iso8859-9.c | 1 - fs/nls/nls_koi8-r.c | 1 - fs/nls/nls_koi8-ru.c | 1 - fs/nls/nls_koi8-u.c | 1 - fs/nls/nls_utf8.c | 1 - include/linux/nls.h | 3 ++- 53 files changed, 5 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c index 634a8b717b0..266c2d7d50b 100644 --- a/fs/nls/mac-celtic.c +++ b/fs/nls/mac-celtic.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macceltic(void) diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c index 979e6265ac5..9789c605755 100644 --- a/fs/nls/mac-centeuro.c +++ b/fs/nls/mac-centeuro.c @@ -513,7 +513,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccenteuro(void) diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c index dd3f675911e..bb19e7a07d4 100644 --- a/fs/nls/mac-croatian.c +++ b/fs/nls/mac-croatian.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccroatian(void) diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c index 1112c84dd8b..2a7dea36acb 100644 --- a/fs/nls/mac-cyrillic.c +++ b/fs/nls/mac-cyrillic.c @@ -478,7 +478,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccyrillic(void) diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c index 2de9158409c..77b00165358 100644 --- a/fs/nls/mac-gaelic.c +++ b/fs/nls/mac-gaelic.c @@ -548,7 +548,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macgaelic(void) diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c index a8631008280..1eccf499e2e 100644 --- a/fs/nls/mac-greek.c +++ b/fs/nls/mac-greek.c @@ -478,7 +478,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, 
.charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macgreek(void) diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c index babe2998d5c..cbd0875c6d6 100644 --- a/fs/nls/mac-iceland.c +++ b/fs/nls/mac-iceland.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maciceland(void) diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c index 312364f010d..fba8357aaf0 100644 --- a/fs/nls/mac-inuit.c +++ b/fs/nls/mac-inuit.c @@ -513,7 +513,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macinuit(void) diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c index 53ce0809cbd..b6a98a5208c 100644 --- a/fs/nls/mac-roman.c +++ b/fs/nls/mac-roman.c @@ -618,7 +618,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macroman(void) diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c index add6f7a0c66..25547f02363 100644 --- a/fs/nls/mac-romanian.c +++ b/fs/nls/mac-romanian.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macromanian(void) diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c index dffa96d5de0..b5454bc7b7f 100644 --- a/fs/nls/mac-turkish.c +++ b/fs/nls/mac-turkish.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macturkish(void) diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index 7020e940f74..a2620650d5e 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -148,7 +148,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_ascii(void) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index fea6bd5831d..52ccd34b1e7 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -232,13 +232,14 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, } EXPORT_SYMBOL(utf16s_to_utf8s); -int register_nls(struct nls_table * nls) +int __register_nls(struct nls_table *nls, struct module *owner) { struct nls_table ** tmp = &tables; if (nls->next) return -EBUSY; + nls->owner = owner; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { @@ -252,6 +253,7 @@ int register_nls(struct nls_table * nls) spin_unlock(&nls_lock); return 0; } +EXPORT_SYMBOL(__register_nls); int unregister_nls(struct nls_table * nls) { @@ -538,7 +540,6 @@ struct nls_table *load_nls_default(void) return &default_table; } -EXPORT_SYMBOL(register_nls); EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index c8471fe78e4..ace3e19d340 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -329,7 +329,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1250(void) diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index 
1939b46e772..9273ddfd08a 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -283,7 +283,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1251(void) diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index 8120ae2e091..1caf5dfed85 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -365,7 +365,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1255(void) diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index ff37a4628ce..7ddb830da3f 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp437(void) diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index f5576b8be1b..c593f683a0c 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -332,7 +332,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp737(void) diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index 4905635d1c0..554c863745f 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -301,7 +301,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp775(void) diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index fe5bdad50e2..56cccd14b40 100644 --- a/fs/nls/nls_cp850.c +++ b/fs/nls/nls_cp850.c @@ -297,7 +297,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp850(void) diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index ceb1c0166dd..7cdc05ac1d4 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -319,7 +319,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp852(void) diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index cc7f5fb2e0c..7426eea0566 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -281,7 +281,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp855(void) diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c index e418e198e8d..098309733eb 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -283,7 +283,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp857(void) diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index a86c97d1aa3..84224478e73 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -346,7 +346,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp860(void) diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index bd920227acd..dc873e4be09 100644 --- a/fs/nls/nls_cp861.c +++ 
b/fs/nls/nls_cp861.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp861(void) diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index e9b68eb3daf..d5263e3c556 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -403,7 +403,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp862(void) diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index f8a9b07ab4e..051c9832e36 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -363,7 +363,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp863(void) diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index 8d31f435fc6..97eb1273b2f 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -389,7 +389,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp864(void) diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 4bd902fe3ec..11121422852 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp865(void) diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index bdc7cb39139..ffdcbc3fc38 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -287,7 +287,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp866(void) diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c index 9f283a2b151..3b5a3458935 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -297,7 +297,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp869(void) diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index 0b3c4886f8c..8dfaa10710f 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -256,7 +256,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp874(void) diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 0ffed6f1ceb..67b7398e848 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -7914,7 +7914,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp932(void) diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index 82770301bc3..c96546cfec9 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -11092,7 +11092,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp936(void) diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 8a7a2fe85c6..199171e97aa 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -13927,7 +13927,6 @@ static struct nls_table table = { 
.char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp949(void) diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index ef2536829aa..8e141870820 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -9463,7 +9463,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp950(void) diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 7424929a278..162b3f16035 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -553,7 +553,6 @@ static struct nls_table table = { .charset = "euc-jp", .uni2char = uni2char, .char2uni = char2uni, - .owner = THIS_MODULE, }; static int __init init_nls_euc_jp(void) diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 7b951bb5849..69ac020d43b 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -239,7 +239,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_1(void) diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index c4d52ea9f09..afb3f8f275f 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -267,7 +267,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_13(void) diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index dc02600c7fe..046370f0b6f 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -323,7 +323,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_14(void) diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c index 3c7dfc832ef..7e34a841a05 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -289,7 +289,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_15(void) diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index a2d2197e4c7..7dd57118174 100644 --- a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_2(void) diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index a61e0daa3a8..740b75ec449 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_3(void) diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index e8ff555483b..8826021e32f 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_4(void) diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index 4721e893012..7c04057a1ad 100644 --- a/fs/nls/nls_iso8859-5.c +++ 
b/fs/nls/nls_iso8859-5.c @@ -254,7 +254,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_5(void) diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index 01a517d6d30..d4a881400d7 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -245,7 +245,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_6(void) diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index 2d27b93ef19..37b75d825a7 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -299,7 +299,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_7(void) diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c index 694bf070c72..557b98250d3 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -254,7 +254,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_9(void) diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 43875310540..811f232fccf 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -305,7 +305,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_r(void) diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index e7bc1d75c78..a80a741a867 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -55,7 +55,6 @@ static struct nls_table table = { .charset = "koi8-ru", .uni2char = uni2char, .char2uni = char2uni, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_ru(void) diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index 8c9f0292b5a..7e029e4c188 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -312,7 +312,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_u(void) diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index 0d60a44acac..afcfbc4a14d 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -46,7 +46,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = identity, /* no conversion */ .charset2upper = identity, - .owner = THIS_MODULE, }; static int __init init_nls_utf8(void) diff --git a/include/linux/nls.h b/include/linux/nls.h index 5dc635f8d79..520681b6820 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -44,11 +44,12 @@ enum utf16_endian { }; /* nls_base.c */ -extern int register_nls(struct nls_table *); +extern int __register_nls(struct nls_table *, struct module *); extern int unregister_nls(struct nls_table *); extern struct nls_table *load_nls(char *); extern void unload_nls(struct nls_table *); extern struct nls_table *load_nls_default(void); +#define register_nls(nls) __register_nls((nls), THIS_MODULE) extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen); -- cgit v1.2.3-70-g09d2 From b42d570c9fbebc8e779fbcbc2b598fa94a0e809f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Nov 2013 01:53:47 -0500 
Subject: afs: get rid of junk in fs/afs/proc.c kill pointless method instances and don't bother with ->owner - it's ignored for procfs files anyway, make use of remove_proc_subtree() for removal, get rid of cell->proc_dir. Signed-off-by: Al Viro --- fs/afs/internal.h | 1 - fs/afs/proc.c | 122 ++++++++++-------------------------------------------- 2 files changed, 22 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a306bb6d88d..6621f800812 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -195,7 +195,6 @@ struct afs_cell { struct list_head link; /* main cell list link */ struct key *anonymous_key; /* anonymous user key for this cell */ struct list_head proc_link; /* /proc cell list link */ - struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 526e4bbbde5..bddc5120ed4 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -41,11 +41,8 @@ static const struct file_operations afs_proc_cells_fops = { .write = afs_proc_cells_write, .llseek = seq_lseek, .release = seq_release, - .owner = THIS_MODULE, }; -static int afs_proc_rootcell_open(struct inode *inode, struct file *file); -static int afs_proc_rootcell_release(struct inode *inode, struct file *file); static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos); static ssize_t afs_proc_rootcell_write(struct file *file, @@ -53,17 +50,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file, size_t size, loff_t *_pos); static const struct file_operations afs_proc_rootcell_fops = { - .open = afs_proc_rootcell_open, .read = afs_proc_rootcell_read, .write = afs_proc_rootcell_write, .llseek = no_llseek, - .release = afs_proc_rootcell_release, - .owner = THIS_MODULE, }; static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); -static int afs_proc_cell_volumes_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *pos); @@ -81,14 +73,11 @@ static const struct file_operations afs_proc_cell_volumes_fops = { .open = afs_proc_cell_volumes_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_volumes_release, - .owner = THIS_MODULE, + .release = seq_release, }; static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *pos); @@ -106,13 +95,10 @@ static const struct file_operations afs_proc_cell_vlservers_fops = { .open = afs_proc_cell_vlservers_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_vlservers_release, - .owner = THIS_MODULE, + .release = seq_release, }; static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, loff_t *pos); @@ -130,8 +116,7 @@ static const struct file_operations afs_proc_cell_servers_fops = { .open = afs_proc_cell_servers_open, .read = seq_read, .llseek = seq_lseek, - 
.release = afs_proc_cell_servers_release, - .owner = THIS_MODULE, + .release = seq_release, }; /* @@ -139,29 +124,21 @@ static const struct file_operations afs_proc_cell_servers_fops = { */ int afs_proc_init(void) { - struct proc_dir_entry *p; - _enter(""); proc_afs = proc_mkdir("fs/afs", NULL); if (!proc_afs) goto error_dir; - p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); - if (!p) - goto error_cells; - - p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); - if (!p) - goto error_rootcell; + if (!proc_create("cells", 0, proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops)) + goto error_tree; _leave(" = 0"); return 0; -error_rootcell: - remove_proc_entry("cells", proc_afs); -error_cells: - remove_proc_entry("fs/afs", NULL); +error_tree: + remove_proc_subtree("fs/afs", NULL); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -172,9 +149,7 @@ error_dir: */ void afs_proc_cleanup(void) { - remove_proc_entry("rootcell", proc_afs); - remove_proc_entry("cells", proc_afs); - remove_proc_entry("fs/afs", NULL); + remove_proc_subtree("fs/afs", NULL); } /* @@ -319,19 +294,6 @@ inval: goto done; } -/* - * Stubs for /proc/fs/afs/rootcell - */ -static int afs_proc_rootcell_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static int afs_proc_rootcell_release(struct inode *inode, struct file *file) -{ - return 0; -} - static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos) { @@ -387,38 +349,27 @@ nomem: */ int afs_proc_cell_setup(struct afs_cell *cell) { - struct proc_dir_entry *p; + struct proc_dir_entry *dir; _enter("%p{%s}", cell, cell->name); - cell->proc_dir = proc_mkdir(cell->name, proc_afs); - if (!cell->proc_dir) + dir = proc_mkdir(cell->name, proc_afs); + if (!dir) goto error_dir; - p = proc_create_data("servers", 0, cell->proc_dir, - &afs_proc_cell_servers_fops, cell); - if (!p) - goto error_servers; - - p = proc_create_data("vlservers", 0, cell->proc_dir, - &afs_proc_cell_vlservers_fops, cell); - if (!p) - goto error_vlservers; - - p = proc_create_data("volumes", 0, cell->proc_dir, - &afs_proc_cell_volumes_fops, cell); - if (!p) - goto error_volumes; + if (!proc_create_data("servers", 0, dir, + &afs_proc_cell_servers_fops, cell) || + !proc_create_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_fops, cell) || + !proc_create_data("volumes", 0, dir, + &afs_proc_cell_volumes_fops, cell)) + goto error_tree; _leave(" = 0"); return 0; -error_volumes: - remove_proc_entry("vlservers", cell->proc_dir); -error_vlservers: - remove_proc_entry("servers", cell->proc_dir); -error_servers: - remove_proc_entry(cell->name, proc_afs); +error_tree: + remove_proc_subtree(cell->name, proc_afs); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -431,10 +382,7 @@ void afs_proc_cell_remove(struct afs_cell *cell) { _enter(""); - remove_proc_entry("volumes", cell->proc_dir); - remove_proc_entry("vlservers", cell->proc_dir); - remove_proc_entry("servers", cell->proc_dir); - remove_proc_entry(cell->name, proc_afs); + remove_proc_subtree(cell->name, proc_afs); _leave(""); } @@ -462,14 +410,6 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) return 0; } -/* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - /* * set up the iterator to start reading from the cells list and return the * first item @@ -568,15 +508,6 @@ static 
int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) return 0; } -/* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file) -{ - return seq_release(inode, file); -} - /* * set up the iterator to start reading from the cells list and return the * first item @@ -672,15 +603,6 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) return 0; } -/* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file) -{ - return seq_release(inode, file); -} - /* * set up the iterator to start reading from the cells list and return the * first item -- cgit v1.2.3-70-g09d2 From 2ccdc413196b43a02bb68b46be5b68850904e9ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Nov 2013 13:25:39 -0500 Subject: kill reiserfs_bdevname() it's never called with NULL argument... Signed-off-by: Al Viro --- fs/reiserfs/procfs.c | 4 ++-- fs/reiserfs/reiserfs.h | 8 -------- fs/reiserfs/super.c | 8 ++++---- 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index a958444a75f..02b0b7d0f7d 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -419,7 +419,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + strlcpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -449,7 +449,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + strlcpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index f8adaee537c..24f068c33b4 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -608,14 +608,6 @@ int reiserfs_resize(struct super_block *, unsigned long); #define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) -/* A safe version of the "bdevname", which returns the "s_id" field of - * a superblock or else "Null superblock" if the super block is NULL. - */ -static inline char *reiserfs_bdevname(struct super_block *s) -{ - return (s == NULL) ? 
"Null superblock" : s->s_id; -} - #define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal *journal) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3ead145dadc..2c803353f8a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1479,7 +1479,7 @@ static int read_super_block(struct super_block *s, int offset) if (!bh) { reiserfs_warning(s, "sh-2006", "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_id, offset / s->s_blocksize, s->s_blocksize); return 1; } @@ -1500,7 +1500,7 @@ static int read_super_block(struct super_block *s, int offset) if (!bh) { reiserfs_warning(s, "sh-2007", "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_id, offset / s->s_blocksize, s->s_blocksize); return 1; } @@ -1509,7 +1509,7 @@ static int read_super_block(struct super_block *s, int offset) if (sb_blocksize(rs) != s->s_blocksize) { reiserfs_warning(s, "sh-2011", "can't find a reiserfs " "filesystem on (dev %s, block %Lu, size %lu)", - reiserfs_bdevname(s), + s->s_id, (unsigned long long)bh->b_blocknr, s->s_blocksize); brelse(bh); @@ -1825,7 +1825,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) /* try new format (64-th 1k block), which can contain reiserfs super block */ else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", - reiserfs_bdevname(s)); + s->s_id); goto error_unlocked; } -- cgit v1.2.3-70-g09d2 From a8d4b8345e0ee48b732126d980efaf0dc373e2b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 11 Jan 2014 19:19:32 +0100 Subject: introduce __fcheck_files() to fix rcu_dereference_check_fdtable(), kill rcu_my_thread_group_empty() rcu_dereference_check_fdtable() looks very wrong, 1. rcu_my_thread_group_empty() was added by 844b9a8707f1 "vfs: fix RCU-lockdep false positive due to /proc" but it doesn't really fix the problem. A CLONE_THREAD (without CLONE_FILES) task can hit the same race with get_files_struct(). And otoh rcu_my_thread_group_empty() can suppress the correct warning if the caller is the CLONE_FILES (without CLONE_THREAD) task. 2. files->count == 1 check is not really right too. Even if this files_struct is not shared it is not safe to access it lockless unless the caller is the owner. Otoh, this check is sub-optimal. files->count == 0 always means it is safe to use it lockless even if files != current->files, but put_files_struct() has to take rcu_read_lock(). See the next patch. This patch removes the buggy checks and turns fcheck_files() into __fcheck_files() which uses rcu_dereference_raw(), the "unshared" callers, fget_light() and fget_raw_light(), can use it to avoid the warning from RCU-lockdep. fcheck_files() is trivially reimplemented as rcu_lockdep_assert() plus __fcheck_files(). 
Signed-off-by: Oleg Nesterov Signed-off-by: Al Viro --- fs/file.c | 4 ++-- include/linux/fdtable.h | 35 +++++++++++++++++++++-------------- include/linux/rcupdate.h | 2 -- kernel/rcu/update.c | 11 ----------- 4 files changed, 23 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 4a78f981557..957cbc09b0d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -707,7 +707,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) *fput_needed = 0; if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); + file = __fcheck_files(files, fd); if (file && (file->f_mode & FMODE_PATH)) file = NULL; } else { @@ -735,7 +735,7 @@ struct file *fget_raw_light(unsigned int fd, int *fput_needed) *fput_needed = 0; if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); + file = __fcheck_files(files, fd); } else { rcu_read_lock(); file = fcheck_files(files, fd); diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 085197bd881..70e8e21c0a3 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -59,29 +59,36 @@ struct files_struct { struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; -#define rcu_dereference_check_fdtable(files, fdtfd) \ - (rcu_dereference_check((fdtfd), \ - lockdep_is_held(&(files)->file_lock) || \ - atomic_read(&(files)->count) == 1 || \ - rcu_my_thread_group_empty())) - -#define files_fdtable(files) \ - (rcu_dereference_check_fdtable((files), (files)->fdt)) - struct file_operations; struct vfsmount; struct dentry; extern void __init files_defer_init(void); -static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) +#define rcu_dereference_check_fdtable(files, fdtfd) \ + rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock)) + +#define files_fdtable(files) \ + rcu_dereference_check_fdtable((files), (files)->fdt) + +/* + * The caller must ensure that fd table isn't shared or hold rcu or file lock + */ +static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd) { - struct file * file = NULL; - struct fdtable *fdt = files_fdtable(files); + struct fdtable *fdt = rcu_dereference_raw(files->fdt); if (fd < fdt->max_fds) - file = rcu_dereference_check_fdtable(files, fdt->fd[fd]); - return file; + return rcu_dereference_raw(fdt->fd[fd]); + return NULL; +} + +static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) +{ + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&files->file_lock), + "suspicious rcu_dereference_check() usage"); + return __fcheck_files(files, fd); } /* diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 39cbb889e20..a2482cf90b6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -448,8 +448,6 @@ static inline int rcu_read_lock_sched_held(void) #ifdef CONFIG_PROVE_RCU -extern int rcu_my_thread_group_empty(void); - /** * rcu_lockdep_assert - emit lockdep splat if specified condition not met * @c: condition to check diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6cb3dff89e2..a3596c8ec9e 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -195,17 +195,6 @@ void wait_rcu_gp(call_rcu_func_t crf) } EXPORT_SYMBOL_GPL(wait_rcu_gp); -#ifdef CONFIG_PROVE_RCU -/* - * wrapper function to avoid #include problems. 
- */ -int rcu_my_thread_group_empty(void) -{ - return thread_group_empty(current); -} -EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); -#endif /* #ifdef CONFIG_PROVE_RCU */ - #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static inline void debug_init_rcu_head(struct rcu_head *head) { -- cgit v1.2.3-70-g09d2 From ce08b62d18b3f97cd4e5a39bd5898872b9201875 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 11 Jan 2014 19:19:53 +0100 Subject: change close_files() to use rcu_dereference_raw(files->fdt) put_files_struct() and close_files() do rcu_read_lock() to make rcu_dereference_check_fdtable() happy. This looks a bit ugly, files_fdtable() just reads the pointer, we can simply use rcu_dereference_raw() to avoid the warning. The patch also changes close_files() to return fdt, this avoids another rcu_read_lock()/files_fdtable() in put_files_struct(). I think close_files() needs more cleanups: - we do not need xchg() exactly because we are the last user of this files_struct - "if (file)" should be turned into WARN_ON(!file) Signed-off-by: Oleg Nesterov Signed-off-by: Al Viro --- fs/file.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 957cbc09b0d..d34e59e5174 100644 --- a/fs/file.c +++ b/fs/file.c @@ -348,21 +348,16 @@ out: return NULL; } -static void close_files(struct files_struct * files) +static struct fdtable *close_files(struct files_struct * files) { - int i, j; - struct fdtable *fdt; - - j = 0; - /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. + * files structure. */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + int i, j = 0; + for (;;) { unsigned long set; i = j * BITS_PER_LONG; @@ -381,6 +376,8 @@ static void close_files(struct files_struct * files) set >>= 1; } } + + return fdt; } struct files_struct *get_files_struct(struct task_struct *task) @@ -398,14 +395,9 @@ struct files_struct *get_files_struct(struct task_struct *task) void put_files_struct(struct files_struct *files) { - struct fdtable *fdt; - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* not really needed, since nobody can see us */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = close_files(files); + /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); -- cgit v1.2.3-70-g09d2 From 1deb46e2562561255c34075825fd00f22a858bb3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 13 Jan 2014 16:48:19 +0100 Subject: fs: factor out common code in fget() and fget_raw() Apart from FMODE_PATH check fget() and fget_raw() are identical, shift the code into the new simple helper, __fget(fd, mask). Saves 160 bytes. 
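Restated from the diff below for readability (the bodies are as in the patch, the comments are editorial): the two exported entry points become one-line wrappers, and the mask argument is the only behavioural difference.

struct file *fget(unsigned int fd)
{
        return __fget(fd, FMODE_PATH);  /* reject O_PATH files (FMODE_PATH set) */
}

struct file *fget_raw(unsigned int fd)
{
        return __fget(fd, 0);           /* mask of 0: accept any file, O_PATH included */
}
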
Signed-off-by: Oleg Nesterov Signed-off-by: Al Viro --- fs/file.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index d34e59e5174..4ed58a32575 100644 --- a/fs/file.c +++ b/fs/file.c @@ -637,16 +637,16 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -struct file *fget(unsigned int fd) +static struct file *__fget(unsigned int fd, fmode_t mask) { - struct file *file; struct files_struct *files = current->files; + struct file *file; rcu_read_lock(); file = fcheck_files(files, fd); if (file) { /* File object ref couldn't be taken */ - if (file->f_mode & FMODE_PATH || + if ((file->f_mode & mask) || !atomic_long_inc_not_zero(&file->f_count)) file = NULL; } @@ -655,25 +655,16 @@ struct file *fget(unsigned int fd) return file; } +struct file *fget(unsigned int fd) +{ + return __fget(fd, FMODE_PATH); +} EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { - struct file *file; - struct files_struct *files = current->files; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken */ - if (!atomic_long_inc_not_zero(&file->f_count)) - file = NULL; - } - rcu_read_unlock(); - - return file; + return __fget(fd, 0); } - EXPORT_SYMBOL(fget_raw); /* -- cgit v1.2.3-70-g09d2 From ad46183445043b562856c60b74db664668fb364b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 13 Jan 2014 16:48:40 +0100 Subject: fs: factor out common code in fget_light() and fget_raw_light() Apart from FMODE_PATH check fget_light() and fget_raw_light() are identical, shift the code into the new helper, __fget_light(fd, mask). Saves 208 bytes. Signed-off-by: Oleg Nesterov Signed-off-by: Al Viro --- fs/file.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 4ed58a32575..50c1208f645 100644 --- a/fs/file.c +++ b/fs/file.c @@ -683,21 +683,21 @@ EXPORT_SYMBOL(fget_raw); * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. 
*/ -struct file *fget_light(unsigned int fd, int *fput_needed) +struct file *__fget_light(unsigned int fd, fmode_t mask, int *fput_needed) { - struct file *file; struct files_struct *files = current->files; + struct file *file; *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = __fcheck_files(files, fd); - if (file && (file->f_mode & FMODE_PATH)) + if (file && (file->f_mode & mask)) file = NULL; } else { rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!(file->f_mode & FMODE_PATH) && + if (!(file->f_mode & mask) && atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; else @@ -709,30 +709,15 @@ struct file *fget_light(unsigned int fd, int *fput_needed) return file; } +struct file *fget_light(unsigned int fd, int *fput_needed) +{ + return __fget_light(fd, FMODE_PATH, fput_needed); +} EXPORT_SYMBOL(fget_light); struct file *fget_raw_light(unsigned int fd, int *fput_needed) { - struct file *file; - struct files_struct *files = current->files; - - *fput_needed = 0; - if (atomic_read(&files->count) == 1) { - file = __fcheck_files(files, fd); - } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); - } - - return file; + return __fget_light(fd, 0, fput_needed); } void set_close_on_exec(unsigned int fd, int flag) -- cgit v1.2.3-70-g09d2 From e6ff9a9fa4e05c1c03dec63cdc6a87d6dea02755 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 13 Jan 2014 16:49:06 +0100 Subject: fs: __fget_light() can use __fget() in slow path The slow path in __fget_light() can use __fget() to avoid the code duplication. Saves 232 bytes. Signed-off-by: Oleg Nesterov Signed-off-by: Al Viro --- fs/file.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 50c1208f645..771578b33fb 100644 --- a/fs/file.c +++ b/fs/file.c @@ -694,17 +694,9 @@ struct file *__fget_light(unsigned int fd, fmode_t mask, int *fput_needed) if (file && (file->f_mode & mask)) file = NULL; } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (!(file->f_mode & mask) && - atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); + file = __fget(fd, mask); + if (file) + *fput_needed = 1; } return file; -- cgit v1.2.3-70-g09d2 From ac34a1b35eca5ef64cb499e25f776bf42a81a660 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Wed, 15 Jan 2014 19:58:28 +0200 Subject: befs: iget_locked() doesn't return an ERR_PTR Also fix befs_iget return value if iget_locked fails. 
Signed-off-by: Rakesh Pandit Signed-off-by: Al Viro --- fs/befs/linuxvfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index daa15d6ba45..845d2d690ce 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -324,8 +324,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); inode = iget_locked(sb, ino); - if (IS_ERR(inode)) - return inode; + if (!inode) + return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; -- cgit v1.2.3-70-g09d2 From 9dad943ae7d4a01da6bb18e1a157ab1bfe6186cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:36 -0800 Subject: reiserfs: prefix ACL symbols with reiserfs_ Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/reiserfs/xattr_acl.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 06c04f73da6..6f721ea9403 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -16,7 +16,7 @@ static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct posix_acl *acl); static int -posix_acl_set(struct dentry *dentry, const char *name, const void *value, +reiserfs_posix_acl_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { struct inode *inode = dentry->d_inode; @@ -65,7 +65,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, } static int -posix_acl_get(struct dentry *dentry, const char *name, void *buffer, +reiserfs_posix_acl_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { struct posix_acl *acl; @@ -88,7 +88,7 @@ posix_acl_get(struct dentry *dentry, const char *name, void *buffer, /* * Convert from filesystem to in-memory representation. */ -static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) +static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; @@ -158,7 +158,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) /* * Convert from in-memory to filesystem representation. 
*/ -static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) +static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size) { reiserfs_acl_header *ext_acl; char *e; @@ -257,7 +257,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) } else if (retval < 0) { acl = ERR_PTR(retval); } else { - acl = posix_acl_from_disk(value, retval); + acl = reiserfs_posix_acl_from_disk(value, retval); } if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); @@ -307,7 +307,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, } if (acl) { - value = posix_acl_to_disk(acl, &size); + value = reiserfs_posix_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } @@ -499,8 +499,8 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list, const struct xattr_handler reiserfs_posix_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, - .get = posix_acl_get, - .set = posix_acl_set, + .get = reiserfs_posix_acl_get, + .set = reiserfs_posix_acl_set, .list = posix_acl_access_list, }; @@ -519,7 +519,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list, const struct xattr_handler reiserfs_posix_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, - .get = posix_acl_get, - .set = posix_acl_set, + .get = reiserfs_posix_acl_get, + .set = reiserfs_posix_acl_set, .list = posix_acl_default_list, }; -- cgit v1.2.3-70-g09d2 From 5c8ebd57b6a51daf53f75b7a16c45090a98a91a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:37 -0800 Subject: fs: merge xattr_acl.c into posix_acl.c Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/posix_acl.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- fs/xattr_acl.c | 180 --------------------------------------------------------- 3 files changed, 174 insertions(+), 188 deletions(-) delete mode 100644 fs/xattr_acl.c (limited to 'fs') diff --git a/fs/Makefile b/fs/Makefile index 4fe6df3ec28..f2c1843820e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -42,7 +42,7 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o -obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bd2135b7f8..359d70b0e94 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1,10 +1,8 @@ /* - * linux/fs/posix_acl.c + * Copyright (C) 2002,2003 by Andreas Gruenbacher * - * Copyright (C) 2002 by Andreas Gruenbacher - * - * Fixes from William Schumacher incorporated on 15 March 2001. - * (Reported by Charles Bertsch, ). + * Fixes from William Schumacher incorporated on 15 March 2001. + * (Reported by Charles Bertsch, ). */ /* @@ -18,9 +16,9 @@ #include #include #include +#include #include - -#include +#include EXPORT_SYMBOL(posix_acl_init); EXPORT_SYMBOL(posix_acl_alloc); @@ -418,3 +416,171 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) return err; } EXPORT_SYMBOL(posix_acl_chmod); + +/* + * Fix up the uids and gids in posix acl extended attributes in place. 
+ */ +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + kuid_t uid; + kgid_t gid; + + if (!value) + return; + if (size < sizeof(posix_acl_xattr_header)) + return; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return; + + count = posix_acl_xattr_count(size); + if (count < 0) + return; + if (count == 0) + return; + + for (end = entry + count; entry != end; entry++) { + switch(le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + case ACL_GROUP: + gid = make_kgid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kgid(to, gid)); + break; + default: + break; + } + } +} + +void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); +} + +void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); +} + +/* + * Convert from extended attribute to in-memory representation. + */ +struct posix_acl * +posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + struct posix_acl *acl; + struct posix_acl_entry *acl_e; + + if (!value) + return NULL; + if (size < sizeof(posix_acl_xattr_header)) + return ERR_PTR(-EINVAL); + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return ERR_PTR(-EOPNOTSUPP); + + count = posix_acl_xattr_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + acl_e = acl->a_entries; + + for (end = entry + count; entry != end; acl_e++, entry++) { + acl_e->e_tag = le16_to_cpu(entry->e_tag); + acl_e->e_perm = le16_to_cpu(entry->e_perm); + + switch(acl_e->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + break; + + case ACL_USER: + acl_e->e_uid = + make_kuid(user_ns, + le32_to_cpu(entry->e_id)); + if (!uid_valid(acl_e->e_uid)) + goto fail; + break; + case ACL_GROUP: + acl_e->e_gid = + make_kgid(user_ns, + le32_to_cpu(entry->e_id)); + if (!gid_valid(acl_e->e_gid)) + goto fail; + break; + + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL (posix_acl_from_xattr); + +/* + * Convert from in-memory to extended attribute representation. 
+ */ +int +posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + void *buffer, size_t size) +{ + posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; + posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; + int real_size, n; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + ext_entry->e_id = + cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); + break; + case ACL_GROUP: + ext_entry->e_id = + cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; +} +EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c deleted file mode 100644 index 9fbea87fdb6..00000000000 --- a/fs/xattr_acl.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * linux/fs/xattr_acl.c - * - * Almost all from linux/fs/ext2/acl.c: - * Copyright (C) 2001 by Andreas Gruenbacher, - */ - -#include -#include -#include -#include -#include - -/* - * Fix up the uids and gids in posix acl extended attributes in place. - */ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - kuid_t uid; - kgid_t gid; - - if (!value) - return; - if (size < sizeof(posix_acl_xattr_header)) - return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return; - - count = posix_acl_xattr_count(size); - if (count < 0) - return; - if (count == 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch(le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); - break; - case ACL_GROUP: - gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kgid(to, gid)); - break; - default: - break; - } - } -} - -void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); -} - -void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); -} - -/* - * Convert from extended attribute to in-memory representation. 
- */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *user_ns, - const void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - struct posix_acl *acl; - struct posix_acl_entry *acl_e; - - if (!value) - return NULL; - if (size < sizeof(posix_acl_xattr_header)) - return ERR_PTR(-EINVAL); - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return ERR_PTR(-EOPNOTSUPP); - - count = posix_acl_xattr_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - acl_e = acl->a_entries; - - for (end = entry + count; entry != end; acl_e++, entry++) { - acl_e->e_tag = le16_to_cpu(entry->e_tag); - acl_e->e_perm = le16_to_cpu(entry->e_perm); - - switch(acl_e->e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - break; - - case ACL_USER: - acl_e->e_uid = - make_kuid(user_ns, - le32_to_cpu(entry->e_id)); - if (!uid_valid(acl_e->e_uid)) - goto fail; - break; - case ACL_GROUP: - acl_e->e_gid = - make_kgid(user_ns, - le32_to_cpu(entry->e_id)); - if (!gid_valid(acl_e->e_gid)) - goto fail; - break; - - default: - goto fail; - } - } - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL (posix_acl_from_xattr); - -/* - * Convert from in-memory to extended attribute representation. - */ -int -posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, - void *buffer, size_t size) -{ - posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; - int real_size, n; - - real_size = posix_acl_xattr_size(acl->a_count); - if (!buffer) - return real_size; - if (real_size > size) - return -ERANGE; - - ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - - for (n=0; n < acl->a_count; n++, ext_entry++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); - ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch(acl_e->e_tag) { - case ACL_USER: - ext_entry->e_id = - cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); - break; - case ACL_GROUP: - ext_entry->e_id = - cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); - break; - default: - ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); - break; - } - } - return real_size; -} -EXPORT_SYMBOL (posix_acl_to_xattr); -- cgit v1.2.3-70-g09d2 From 2982baa2ae31eb23ce29b688ab2f77eb019062f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:38 -0800 Subject: fs: add get_acl helper Factor out the code to get an ACL either from the inode or disk from check_acl, so that it can be used elsewhere later on. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/namei.c | 24 +++--------------------- fs/posix_acl.c | 27 +++++++++++++++++++++++++++ include/linux/posix_acl.h | 2 ++ 3 files changed, 32 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 3531deebad3..bcb838e2e52 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -235,27 +235,9 @@ static int check_acl(struct inode *inode, int mask) return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. 
But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. - * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. - */ - if (acl == ACL_NOT_CACHED) { - if (inode->i_op->get_acl) { - acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } else { - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; - } - } - + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 359d70b0e94..30524de49a6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -26,6 +26,33 @@ EXPORT_SYMBOL(posix_acl_valid); EXPORT_SYMBOL(posix_acl_equiv_mode); EXPORT_SYMBOL(posix_acl_from_mode); +struct posix_acl *get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + /* + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. + * + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. + */ + if (!inode->i_op->get_acl) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return inode->i_op->get_acl(inode, type); +} +EXPORT_SYMBOL(get_acl); + /* * Init a fresh posix_acl */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 7931efe7117..a8d9918c0b2 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -175,4 +175,6 @@ static inline void cache_no_acl(struct inode *inode) #endif } +struct posix_acl *get_acl(struct inode *inode, int type); + #endif /* __LINUX_POSIX_ACL_H */ -- cgit v1.2.3-70-g09d2 From 2aeccbe957d0d2b9fbb2a236e53a955097e2a9ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:40 -0800 Subject: fs: add generic xattr_acl handlers With the ->set_acl inode operation we can implement the Posix ACL xattr handlers in generic code instead of duplicating them all over the tree. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/posix_acl.c | 102 ++++++++++++++++++++++++++++++++++++++++ include/linux/posix_acl_xattr.h | 3 ++ 2 files changed, 105 insertions(+) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 30524de49a6..e699b076cdd 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -611,3 +612,104 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, return real_size; } EXPORT_SYMBOL (posix_acl_to_xattr); + +static int +posix_acl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + acl = get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); + posix_acl_release(acl); + + return error; +} + +static int +posix_acl_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int ret; + + if (!IS_POSIXACL(inode)) + return -EOPNOTSUPP; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; + } + } + + ret = inode->i_op->set_acl(inode, acl, type); +out: + posix_acl_release(acl); + return ret; +} + +static size_t +posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const char *xname; + size_t size; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + xname = POSIX_ACL_XATTR_ACCESS; + else + xname = POSIX_ACL_XATTR_DEFAULT; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +const struct xattr_handler posix_acl_access_xattr_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); + +const struct xattr_handler posix_acl_default_xattr_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index ad93ad0f1db..6f14ee29582 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -69,4 +69,7 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size); +extern const struct xattr_handler posix_acl_access_xattr_handler; +extern const struct xattr_handler posix_acl_default_xattr_handler; + #endif /* _POSIX_ACL_XATTR_H */ -- cgit v1.2.3-70-g09d2 From 
5bf3258fd2acd8515450ab8efcd97c9d3b69f7f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:41 -0800 Subject: fs: make posix_acl_chmod more useful Rename the current posix_acl_chmod to __posix_acl_chmod and add a fully featured ACL chmod helper that uses the ->set_acl inode operation. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/9p/acl.c | 2 +- fs/btrfs/acl.c | 2 +- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- fs/ext4/acl.c | 2 +- fs/f2fs/acl.c | 2 +- fs/generic_acl.c | 2 +- fs/gfs2/acl.c | 2 +- fs/hfsplus/posix_acl.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jfs/acl.c | 2 +- fs/ocfs2/acl.c | 2 +- fs/posix_acl.c | 30 +++++++++++++++++++++++++++--- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/xfs_acl.c | 2 +- include/linux/posix_acl.h | 17 +++++++++++++---- 16 files changed, 54 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 7af425f53be..f5ce5c50c57 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -156,7 +156,7 @@ int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) return -EOPNOTSUPP; acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (retval) return retval; set_cached_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0890c83643e..1af04ff8898 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -256,7 +256,7 @@ int btrfs_acl_chmod(struct inode *inode) if (IS_ERR_OR_NULL(acl)) return PTR_ERR(acl); - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 110b6b371a4..7006ced4532 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -308,7 +308,7 @@ ext2_acl_chmod(struct inode *inode) acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index dbb5ad59a7f..6691a6c6b21 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -314,7 +314,7 @@ ext3_acl_chmod(struct inode *inode) acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; retry: diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 39a54a0e9fe..2eebe02fdf0 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -320,7 +320,7 @@ ext4_acl_chmod(struct inode *inode) acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; retry: diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index d0fc287efef..14c4df0ede3 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -311,7 +311,7 @@ int f2fs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (error) return error; diff --git a/fs/generic_acl.c b/fs/generic_acl.c index b3f3676796d..46a5076e977 100644 --- a/fs/generic_acl.c 
+++ b/fs/generic_acl.c @@ -158,7 +158,7 @@ generic_acl_chmod(struct inode *inode) return -EOPNOTSUPP; acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; set_cached_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index f69ac0af549..3e200c7ca7a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -162,7 +162,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (!acl) return gfs2_setattr_simple(inode, attr); - error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); + error = __posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); if (error) return error; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index b609cc14c72..cab5fd6fdb7 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -167,7 +167,7 @@ int hfsplus_posix_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + err = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (unlikely(err)) return err; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 223283c3011..5853969a51b 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -335,7 +335,7 @@ int jffs2_acl_chmod(struct inode *inode) acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + rc = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (rc) return rc; rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d254d6d3599..9c0fca8073d 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -161,7 +161,7 @@ int jfs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + rc = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (rc) return rc; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index b4f788e0ca3..73ccf0e22ec 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -350,7 +350,7 @@ int ocfs2_acl_chmod(struct inode *inode) acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, diff --git a/fs/posix_acl.c b/fs/posix_acl.c index e699b076cdd..08218550b0d 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -364,7 +364,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) /* * Modify the ACL for the chmod syscall. 
*/ -static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) +static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -428,12 +428,12 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) EXPORT_SYMBOL(posix_acl_create); int -posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) +__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { - err = posix_acl_chmod_masq(clone, mode); + err = __posix_acl_chmod_masq(clone, mode); if (err) { posix_acl_release(clone); clone = NULL; @@ -443,6 +443,30 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) *acl = clone; return err; } +EXPORT_SYMBOL(__posix_acl_chmod); + +int +posix_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(inode)) + return 0; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR(acl); + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + return ret; +} EXPORT_SYMBOL(posix_acl_chmod); /* diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 6f721ea9403..ea4e44351f7 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -463,7 +463,7 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; if (IS_ERR(acl)) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); if (error) return error; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 370eb3e121d..4eac1058b68 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -334,7 +334,7 @@ xfs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index a8d9918c0b2..8b64e789998 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -89,12 +89,14 @@ extern int posix_acl_permission(struct inode *, const struct posix_acl *, int); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int posix_acl_create(struct posix_acl **, gfp_t, umode_t *); -extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); +extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); #ifdef CONFIG_FS_POSIX_ACL +extern int posix_acl_chmod(struct inode *); + static inline struct posix_acl **acl_by_type(struct inode *inode, int type) { switch (type) { @@ -165,15 +167,22 @@ static inline void forget_all_cached_acls(struct inode *inode) if (old_default != ACL_NOT_CACHED) posix_acl_release(old_default); } -#endif static inline void cache_no_acl(struct inode *inode) { -#ifdef CONFIG_FS_POSIX_ACL inode->i_acl = NULL; inode->i_default_acl = NULL; -#endif } +#else +static inline int posix_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline void cache_no_acl(struct inode *inode) +{ +} +#endif /* CONFIG_FS_POSIX_ACL */ 
struct posix_acl *get_acl(struct inode *inode, int type); -- cgit v1.2.3-70-g09d2 From 37bc15392a2363ca822b2c2828e0ccafbea32f75 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:42 -0800 Subject: fs: make posix_acl_create more useful Rename the current posix_acl_created to __posix_acl_create and add a fully featured helper to set up the ACLs on file creation that uses get_acl(). Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/9p/acl.c | 2 +- fs/btrfs/acl.c | 2 +- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- fs/ext4/acl.c | 2 +- fs/f2fs/acl.c | 2 +- fs/generic_acl.c | 2 +- fs/gfs2/acl.c | 2 +- fs/hfsplus/posix_acl.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jfs/acl.c | 2 +- fs/nfs/nfs3acl.c | 2 +- fs/ocfs2/acl.c | 2 +- fs/posix_acl.c | 57 +++++++++++++++++++++++++++++++++++++++++++---- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/xfs_acl.c | 4 ++-- include/linux/posix_acl.h | 15 ++++++++++--- 17 files changed, 81 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index f5ce5c50c57..8482f2d1160 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -200,7 +200,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, if (acl) { if (S_ISDIR(mode)) *dpacl = posix_acl_dup(acl); - retval = posix_acl_create(&acl, GFP_NOFS, &mode); + retval = __posix_acl_create(&acl, GFP_NOFS, &mode); if (retval < 0) return retval; if (retval > 0) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1af04ff8898..b56519d4726 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -222,7 +222,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret) goto failed; } - ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (ret < 0) return ret; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 7006ced4532..6e842a764ee 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -268,7 +268,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir) if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; if (error > 0) { diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 6691a6c6b21..4f3d8fa0c0a 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -271,7 +271,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (error < 0) return error; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 2eebe02fdf0..f827f3bb6d4 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -276,7 +276,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (error < 0) return error; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 14c4df0ede3..45e84303c24 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -285,7 +285,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; if (error > 0) diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 46a5076e977..4357f39c844 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -128,7 +128,7 @@ generic_acl_init(struct 
inode *inode, struct inode *dir) if (acl) { if (S_ISDIR(inode->i_mode)) set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; if (error > 0) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3e200c7ca7a..e82e4ac574a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -131,7 +131,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) goto out; } - error = posix_acl_create(&acl, GFP_NOFS, &mode); + error = __posix_acl_create(&acl, GFP_NOFS, &mode); if (error < 0) return error; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index cab5fd6fdb7..277942f36f8 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -137,7 +137,7 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) goto init_acl_cleanup; } - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (unlikely(err < 0)) return err; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 5853969a51b..4d6e31b1981 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -295,7 +295,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode if (S_ISDIR(*i_mode)) set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - rc = posix_acl_create(&acl, GFP_KERNEL, i_mode); + rc = __posix_acl_create(&acl, GFP_KERNEL, i_mode); if (rc < 0) return rc; if (rc > 0) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 9c0fca8073d..28d529ae9a4 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -132,7 +132,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) if (rc) goto cleanup; } - rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); + rc = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (rc < 0) goto cleanup; /* posix_acl_release(NULL) is no-op */ if (rc > 0) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 4a1aafba6a2..e85967587d7 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -428,7 +428,7 @@ int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, if (!dfacl) return 0; acl = posix_acl_dup(dfacl); - error = posix_acl_create(&acl, GFP_KERNEL, &mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &mode); if (error < 0) goto out_release_dfacl; error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? 
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 73ccf0e22ec..c0f9d2fe134 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -401,7 +401,7 @@ int ocfs2_init_acl(handle_t *handle, goto cleanup; } mode = inode->i_mode; - ret = posix_acl_create(&acl, GFP_NOFS, &mode); + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); if (ret < 0) return ret; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 08218550b0d..8f245ab2014 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -410,7 +410,7 @@ static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) } int -posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) +__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; @@ -425,7 +425,7 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) *acl = clone; return err; } -EXPORT_SYMBOL(posix_acl_create); +EXPORT_SYMBOL(__posix_acl_create); int __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) @@ -446,7 +446,7 @@ __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) EXPORT_SYMBOL(__posix_acl_chmod); int -posix_acl_chmod(struct inode *inode) +posix_acl_chmod(struct inode *inode, umode_t mode) { struct posix_acl *acl; int ret = 0; @@ -460,7 +460,7 @@ posix_acl_chmod(struct inode *inode) if (IS_ERR_OR_NULL(acl)) return PTR_ERR(acl); - ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); @@ -469,6 +469,55 @@ posix_acl_chmod(struct inode *inode) } EXPORT_SYMBOL(posix_acl_chmod); +int +posix_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl) +{ + struct posix_acl *p; + int ret; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + goto no_acl; + + p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return PTR_ERR(p); + + if (!p) { + *mode &= ~current_umask(); + goto no_acl; + } + + *acl = posix_acl_clone(p, GFP_NOFS); + if (!*acl) + return -ENOMEM; + + ret = posix_acl_create_masq(*acl, mode); + if (ret < 0) { + posix_acl_release(*acl); + return -ENOMEM; + } + + if (ret == 0) { + posix_acl_release(*acl); + *acl = NULL; + } + + if (!S_ISDIR(*mode)) { + posix_acl_release(p); + *default_acl = NULL; + } else { + *default_acl = p; + } + return 0; + +no_acl: + *default_acl = NULL; + *acl = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(posix_acl_create); + /* * Fix up the uids and gids in posix acl extended attributes in place. 
*/ diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index ea4e44351f7..d95c9592327 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -378,7 +378,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, /* Now we reconcile the new ACL and the mode, potentially modifying both */ - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (err < 0) return err; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4eac1058b68..057ae2d502d 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -297,12 +297,12 @@ xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) goto out; } - error = posix_acl_create(&acl, GFP_KERNEL, &mode); + error = __posix_acl_create(&acl, GFP_KERNEL, &mode); if (error < 0) return error; /* - * If posix_acl_create returns a positive value we need to + * If __posix_acl_create returns a positive value we need to * inherit a permission that can't be represented using the Unix * mode bits and we actually need to set an ACL. */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 8b64e789998..f7e6f6cb214 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -88,14 +88,16 @@ extern int posix_acl_valid(const struct posix_acl *); extern int posix_acl_permission(struct inode *, const struct posix_acl *, int); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); -extern int posix_acl_create(struct posix_acl **, gfp_t, umode_t *); +extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); #ifdef CONFIG_FS_POSIX_ACL -extern int posix_acl_chmod(struct inode *); +extern int posix_acl_chmod(struct inode *, umode_t); +extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, + struct posix_acl **); static inline struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -174,7 +176,7 @@ static inline void cache_no_acl(struct inode *inode) inode->i_default_acl = NULL; } #else -static inline int posix_acl_chmod(struct inode *inode) +static inline int posix_acl_chmod(struct inode *inode, umode_t mode) { return 0; } @@ -182,6 +184,13 @@ static inline int posix_acl_chmod(struct inode *inode) static inline void cache_no_acl(struct inode *inode) { } + +static inline int posix_acl_create(struct inode *inode, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl) +{ + *default_acl = *acl = NULL; + return 0; +} #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_acl(struct inode *inode, int type); -- cgit v1.2.3-70-g09d2 From 996a710d46418cacb5b4a519ab9341a74066551d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:43 -0800 Subject: btrfs: use generic posix ACL infrastructure Also don't bother to set up a .get_acl method for symlinks as we do not support access control (ACLs or even mode bits) for symlinks in Linux. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/btrfs/acl.c | 142 +++++++------------------------------------------------ fs/btrfs/ctree.h | 7 +-- fs/btrfs/inode.c | 7 ++- fs/btrfs/xattr.c | 5 +- fs/btrfs/xattr.h | 2 - 5 files changed, 28 insertions(+), 135 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index b56519d4726..ff9b3995d45 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -35,13 +35,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; - if (!IS_POSIXACL(inode)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -76,31 +69,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int ret = 0; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - acl = btrfs_get_acl(dentry->d_inode, type); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return ret; -} - /* * Needs to be called with fs_mutex held */ -static int btrfs_set_acl(struct btrfs_trans_handle *trans, +static int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; @@ -158,35 +130,9 @@ out: return ret; } -static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - int ret; - struct posix_acl *acl = NULL; - - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out; - } - } - - ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); -out: - posix_acl_release(acl); - - return ret; + return __btrfs_set_acl(NULL, inode, acl, type); } /* @@ -197,83 +143,31 @@ out: int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int ret = 0; /* this happens with subvols */ if (!dir) return 0; - if (!S_ISLNK(inode->i_mode)) { - if (IS_POSIXACL(dir)) { - acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } + ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (ret) + return ret; - if (!acl) - inode->i_mode &= ~current_umask(); + if (default_acl) { + ret = __btrfs_set_acl(trans, inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - if (IS_POSIXACL(dir) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(trans, inode, acl, - ACL_TYPE_DEFAULT); - if (ret) - goto failed; - } - ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (ret < 0) - return ret; - - if (ret > 0) { - /* we need an acl */ - ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); - } else if (ret < 0) { - cache_no_acl(inode); - } - } else { - cache_no_acl(inode); + if (acl) { + if (!ret) + ret = __btrfs_set_acl(trans, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); } -failed: - 
posix_acl_release(acl); - - return ret; -} -int btrfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int ret = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!IS_POSIXACL(inode)) - return 0; - - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) - return PTR_ERR(acl); - - ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); + if (!default_acl && !acl) + cache_no_acl(inode); return ret; } - -const struct xattr_handler btrfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; - -const struct xattr_handler btrfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54ab86127f7..7506825211a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3899,20 +3899,17 @@ do { \ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_acl_chmod(struct inode *inode); #else #define btrfs_get_acl NULL +#define btrfs_set_acl NULL static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { return 0; } -static inline int btrfs_acl_chmod(struct inode *inode) -{ - return 0; -} #endif /* relocation.c */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d03..b1314300d9f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4464,7 +4464,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = btrfs_acl_chmod(inode); + err = posix_acl_chmod(inode, inode->i_mode); } return err; @@ -8649,12 +8649,14 @@ static const struct inode_operations btrfs_dir_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; @@ -8724,6 +8726,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { @@ -8735,6 +8738,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { @@ -8748,7 +8752,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .get_acl = btrfs_get_acl, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 05740b9789e..3d1c301c926 100644 --- 
a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -313,8 +314,8 @@ err: */ const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL - &btrfs_xattr_acl_access_handler, - &btrfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL, }; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index b3cc8039134..5049608d138 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -21,8 +21,6 @@ #include -extern const struct xattr_handler btrfs_xattr_acl_access_handler; -extern const struct xattr_handler btrfs_xattr_acl_default_handler; extern const struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, -- cgit v1.2.3-70-g09d2 From 64e178a7118b1cf7648391755e44dcc209091003 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:44 -0800 Subject: ext2/3/4: use generic posix ACL infrastructure Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/ext2/acl.c | 188 ++++------------------------------------------- fs/ext2/acl.h | 8 +- fs/ext2/file.c | 1 + fs/ext2/inode.c | 2 +- fs/ext2/namei.c | 2 + fs/ext2/xattr.c | 8 +- fs/ext2/xattr.h | 2 - fs/ext3/acl.c | 223 ++++++++------------------------------------------------ fs/ext3/acl.h | 9 +-- fs/ext3/file.c | 1 + fs/ext3/inode.c | 2 +- fs/ext3/namei.c | 2 + fs/ext3/xattr.c | 8 +- fs/ext3/xattr.h | 2 - fs/ext4/acl.c | 223 +++++++------------------------------------------------- fs/ext4/acl.h | 9 +-- fs/ext4/file.c | 1 + fs/ext4/inode.c | 2 +- fs/ext4/namei.c | 2 + fs/ext4/xattr.c | 8 +- fs/ext4/xattr.h | 2 - 21 files changed, 100 insertions(+), 605 deletions(-) (limited to 'fs') diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 6e842a764ee..1b8001bbe94 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -148,13 +148,6 @@ ext2_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -189,19 +182,14 @@ ext2_get_acl(struct inode *inode, int type) /* * inode->i_mutex: down */ -static int -ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int name_index; void *value = NULL; size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -250,169 +238,21 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) int ext2_init_acl(struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) { - /* This is an extended ACL */ - error = 
ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext2_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error; + struct posix_acl *default_acl, *acl; + int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} -/* - * Extended attribut handlers - */ -static size_t -ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - - error = ext2_set_acl(dentry->d_inode, type, acl); - -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext2_xattr_acl_access_handler = { - .prefix = 
POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext2_xattr_list_acl_access, - .get = ext2_xattr_get_acl, - .set = ext2_xattr_set_acl, -}; - -const struct xattr_handler ext2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext2_xattr_list_acl_default, - .get = ext2_xattr_get_acl, - .set = ext2_xattr_set_acl, -}; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 503bfb0ed79..44937f9fcf3 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); -extern int ext2_acl_chmod (struct inode *); +extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); #else @@ -63,12 +63,6 @@ extern int ext2_init_acl (struct inode *, struct inode *); #define ext2_get_acl NULL #define ext2_set_acl NULL -static inline int -ext2_acl_chmod (struct inode *inode) -{ - return 0; -} - static inline int ext2_init_acl (struct inode *inode, struct inode *dir) { return 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a5b3a5db312..44c36e59076 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -103,5 +103,6 @@ const struct inode_operations ext2_file_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 8a337640a46..94ed36849b7 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1566,7 +1566,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) } setattr_copy(inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = ext2_acl_chmod(inode); + error = posix_acl_chmod(inode, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 256dd5f4c1c..c268d0af1db 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -421,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, }; @@ -433,4 +434,5 @@ const struct inode_operations ext2_special_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, }; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 2d7557db3ae..91426141c33 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -103,8 +103,8 @@ static struct mb_cache *ext2_xattr_cache; static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, - [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_SECURITY @@ -116,8 +116,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - &ext2_xattr_acl_access_handler, - &ext2_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT2_FS_SECURITY &ext2_xattr_security_handler, diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 5e41cccff76..60edf298644 100644 --- a/fs/ext2/xattr.h +++ 
b/fs/ext2/xattr.h @@ -57,8 +57,6 @@ struct ext2_xattr_entry { extern const struct xattr_handler ext2_xattr_user_handler; extern const struct xattr_handler ext2_xattr_trusted_handler; -extern const struct xattr_handler ext2_xattr_acl_access_handler; -extern const struct xattr_handler ext2_xattr_acl_default_handler; extern const struct xattr_handler ext2_xattr_security_handler; extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 4f3d8fa0c0a..8bbaf5bcf98 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -145,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -190,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext3_new_inode */ static int -ext3_set_acl(handle_t *handle, struct inode *inode, int type, +__ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -198,9 +191,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -243,204 +233,49 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext3_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext3_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. 
- * - * inode->i_mutex: down - */ -int -ext3_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; handle_t *handle; - int retries = 0; - int error; + int error, retries = 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; retry: - handle = ext3_journal_start(inode, - EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext3_std_error(inode->i_sb, error); - goto out; - } - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = __ext3_set_acl(handle, inode, type, acl); ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; -out: - posix_acl_release(acl); return error; } /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext3_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ -static size_t -ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext3_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = ext3_set_acl(handle, inode, type, acl); - ext3_journal_stop(handle); - if (error == -ENOSPC && 
ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext3_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; - -const struct xattr_handler ext3_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index dbc921e458c..ea1c69edab9 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -55,18 +55,13 @@ static inline int ext3_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); -extern int ext3_acl_chmod (struct inode *); +extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include #define ext3_get_acl NULL - -static inline int -ext3_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext3_set_acl NULL static inline int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 25cb413277e..aad05311392 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -75,6 +75,7 @@ const struct inode_operations ext3_file_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2bd85486b87..384b6ebb655 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3365,7 +3365,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) mark_inode_dirty(inode); if (ia_valid & ATTR_MODE) - rc = ext3_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext3_std_error(inode->i_sb, error); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index f8cde46de9c..f197736dccf 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2569,6 +2569,7 @@ const struct inode_operations ext3_dir_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2580,4 +2581,5 @@ const struct inode_operations ext3_special_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index b1fc96383e0..c6874be6d58 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -102,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache; static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, - [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = 
&posix_acl_default_xattr_handler, #endif [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_SECURITY @@ -115,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - &ext3_xattr_acl_access_handler, - &ext3_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT3_FS_SECURITY &ext3_xattr_security_handler, diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 2be4f69bfa6..32e93ebf803 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -60,8 +60,6 @@ struct ext3_xattr_entry { extern const struct xattr_handler ext3_xattr_user_handler; extern const struct xattr_handler ext3_xattr_trusted_handler; -extern const struct xattr_handler ext3_xattr_acl_access_handler; -extern const struct xattr_handler ext3_xattr_acl_default_handler; extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index f827f3bb6d4..d40c8dbbb0d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext4_new_inode */ static int -ext4_set_acl(handle_t *handle, struct inode *inode, int type, +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext4_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext4_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. 
- * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext4_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; handle_t *handle; - int retries = 0; - int error; - + int error, retries = 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + error = __ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; -out: - posix_acl_release(acl); return error; } /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ -static size_t -ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext4_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; + error = 
posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -retry: - handle = ext4_journal_start(inode, EXT4_HT_XATTR, - ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto release_and_out; + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); } - error = ext4_set_acl(handle, inode, type, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler ext4_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; - -const struct xattr_handler ext4_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 18cb39ed7c7..da2c79577d7 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type); -extern int ext4_acl_chmod(struct inode *); +int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include #define ext4_get_acl NULL - -static inline int -ext4_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext4_set_acl NULL static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3da21945ff1..43e64f6022e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -617,6 +617,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61d49ff22c8..23983c2cf95 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4663,7 +4663,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) - rc = ext4_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext4_std_error(inode->i_sb, error); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5a0408d7b11..e77c1ba6c8a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3225,6 +3225,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; @@ -3235,4 +3236,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1423c4816a4..e175e94116a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -95,8 +95,8 @@ static struct mb_cache *ext4_xattr_cache; static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef 
CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -108,8 +108,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - &ext4_xattr_acl_access_handler, - &ext4_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index c767dbdd7fc..819d6398833 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find { extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; -extern const struct xattr_handler ext4_xattr_acl_access_handler; -extern const struct xattr_handler ext4_xattr_acl_default_handler; extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); -- cgit v1.2.3-70-g09d2 From a6dda0e63e97122ce9e0ba04367e37cca28315fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:45 -0800 Subject: f2fs: use generic posix ACL infrastructure f2fs has some weird mode bit handling, so still using the old chmod code for now. Signed-off-by: Christoph Hellwig Reviewed-by: Jaegeuk Kim Signed-off-by: Al Viro --- fs/f2fs/acl.c | 174 ++++++-------------------------------------------------- fs/f2fs/acl.h | 7 +-- fs/f2fs/f2fs.h | 4 ++ fs/f2fs/file.c | 3 +- fs/f2fs/namei.c | 2 + fs/f2fs/xattr.c | 9 +-- fs/f2fs/xattr.h | 2 - 7 files changed, 31 insertions(+), 170 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 45e84303c24..fa8da4cb8c4 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -17,9 +17,6 @@ #include "xattr.h" #include "acl.h" -#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? 
\ - (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) - static inline size_t f2fs_acl_size(int count) { if (count <= 4) { @@ -167,19 +164,11 @@ fail: struct posix_acl *f2fs_get_acl(struct inode *inode, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; struct posix_acl *acl; int retval; - if (!test_opt(sbi, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -205,21 +194,15 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return acl; } -static int f2fs_set_acl(struct inode *inode, int type, +static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; int error; - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -261,154 +244,31 @@ static int f2fs_set_acl(struct inode *inode, int type, return error; } -int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) +int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(sbi, POSIX_ACL)) { - acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - - if (!test_opt(sbi, POSIX_ACL) || !acl) - goto cleanup; - - if (S_ISDIR(inode->i_mode)) { - error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl, ipage); - if (error) - goto cleanup; - } - error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage); -cleanup: - posix_acl_release(acl); - return error; + return __f2fs_set_acl(inode, type, acl, NULL); } -int f2fs_acl_chmod(struct inode *inode) +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct posix_acl *acl; - int error; - umode_t mode = get_inode_mode(inode); - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(mode)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + struct posix_acl *default_acl, *acl; + int error = 0; - error = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, NULL); - posix_acl_release(acl); - return error; -} - -static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const char *xname = POSIX_ACL_XATTR_DEFAULT; - size_t size; - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - - if (type == ACL_TYPE_ACCESS) - xname = POSIX_ACL_XATTR_ACCESS; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct 
f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else { - acl = NULL; + if (default_acl) { + error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + ipage); + posix_acl_release(default_acl); + } + if (acl) { + if (error) + error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + ipage); + posix_acl_release(acl); } - error = f2fs_set_acl(inode, type, acl, NULL); - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler f2fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; - -const struct xattr_handler f2fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 49633131e03..e0864651cdc 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,18 +37,13 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_acl_chmod(struct inode *); +extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *); #else #define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL -static inline int f2fs_acl_chmod(struct inode *inode) -{ - return 0; -} - static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *page) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 89dc7508faf..934b59cf819 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -953,6 +953,10 @@ static inline int f2fs_readonly(struct super_block *sb) return sb->s_flags & MS_RDONLY; } +#define get_inode_mode(i) \ + ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? 
\ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + /* * file.c */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7d714f4972d..cf835e05f39 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -390,7 +390,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) __setattr_copy(inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = f2fs_acl_chmod(inode); + err = posix_acl_chmod(inode, get_inode_mode(inode)); if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { inode->i_mode = fi->i_acl_mode; clear_inode_flag(fi, FI_ACL_MODE); @@ -405,6 +405,7 @@ const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 575adac17f8..5846eeb22ce 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -496,6 +496,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -522,6 +523,7 @@ const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aa7a3f139fe..e2b92995408 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "f2fs.h" #include "xattr.h" @@ -216,8 +217,8 @@ const struct xattr_handler f2fs_xattr_security_handler = { static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, - [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -229,8 +230,8 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - &f2fs_xattr_acl_access_handler, - &f2fs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 02a08fb88a1..b21d9ebdeff 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -108,8 +108,6 @@ struct f2fs_xattr_entry { #ifdef CONFIG_F2FS_FS_XATTR extern const struct xattr_handler f2fs_xattr_user_handler; extern const struct xattr_handler f2fs_xattr_trusted_handler; -extern const struct xattr_handler f2fs_xattr_acl_access_handler; -extern const struct xattr_handler f2fs_xattr_acl_default_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; extern const struct xattr_handler f2fs_xattr_security_handler; -- cgit v1.2.3-70-g09d2 From b0a7ab5706647844e7a1b91b0c31cdb3bee1e1cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:46 -0800 Subject: hfsplus: use generic posix ACL infrastructure Signed-off-by: Christoph Hellwig 
Reviewed-by: Vyacheslav Dubeyko Signed-off-by: Al Viro --- fs/hfsplus/acl.h | 9 +-- fs/hfsplus/dir.c | 1 + fs/hfsplus/inode.c | 3 +- fs/hfsplus/posix_acl.c | 168 +++++-------------------------------------------- fs/hfsplus/xattr.c | 5 +- fs/hfsplus/xattr.h | 2 - 6 files changed, 26 insertions(+), 162 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h index 07c0d494752..95c8ed9ec17 100644 --- a/fs/hfsplus/acl.h +++ b/fs/hfsplus/acl.h @@ -12,16 +12,13 @@ /* posix_acl.c */ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); -extern int hfsplus_posix_acl_chmod(struct inode *); +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type); extern int hfsplus_init_posix_acl(struct inode *, struct inode *); #else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ #define hfsplus_get_posix_acl NULL - -static inline int hfsplus_posix_acl_chmod(struct inode *inode) -{ - return 0; -} +#define hfsplus_set_posix_acl NULL static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4a4fea00267..9ee62985e73 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -532,6 +532,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { .removexattr = hfsplus_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, #endif }; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 37213d075f3..2e10993fa96 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -319,7 +319,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) { - error = hfsplus_posix_acl_chmod(inode); + error = posix_acl_chmod(inode, inode->i_mode); if (unlikely(error)) return error; } @@ -393,6 +393,7 @@ static const struct inode_operations hfsplus_file_inode_operations = { .removexattr = hfsplus_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, #endif }; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index 277942f36f8..df0c9af68d0 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -17,9 +17,7 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) char *value = NULL; ssize_t size; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); switch (type) { case ACL_TYPE_ACCESS: @@ -56,17 +54,15 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) return acl; } -static int hfsplus_set_posix_acl(struct inode *inode, - int type, - struct posix_acl *acl) +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type) { int err; char *xattr_name; size_t size = 0; char *value = NULL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); switch (type) { case ACL_TYPE_ACCESS: @@ -115,7 +111,7 @@ end_set_acl: int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { int err = 0; - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; hfs_dbg(ACL_MOD, "[%s]: ino %lu, dir->ino %lu\n", @@ -124,151 +120,21 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) if (S_ISLNK(inode->i_mode)) return 0; - acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - if (S_ISDIR(inode->i_mode)) { - 
err = hfsplus_set_posix_acl(inode, - ACL_TYPE_DEFAULT, - acl); - if (unlikely(err)) - goto init_acl_cleanup; - } - - err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (unlikely(err < 0)) - return err; - - if (err > 0) - err = hfsplus_set_posix_acl(inode, - ACL_TYPE_ACCESS, - acl); - } else - inode->i_mode &= ~current_umask(); - -init_acl_cleanup: - posix_acl_release(acl); - return err; -} - -int hfsplus_posix_acl_chmod(struct inode *inode) -{ - int err; - struct posix_acl *acl; - - hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - err = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (unlikely(err)) + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) return err; - err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return err; -} - -static int hfsplus_xattr_get_posix_acl(struct dentry *dentry, - const char *name, - void *buffer, - size_t size, - int type) -{ - int err = 0; - struct posix_acl *acl; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, buffer %p, size %zu, type %#x\n", - __func__, dentry->d_inode->i_ino, buffer, size, type); - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = hfsplus_get_posix_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return err; -} - -static int hfsplus_xattr_set_posix_acl(struct dentry *dentry, - const char *name, - const void *value, - size_t size, - int flags, - int type) -{ - int err = 0; - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n", - __func__, inode->i_ino, value, size, flags, type); - - if (strcmp(name, "") != 0) - return -EINVAL; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - err = posix_acl_valid(acl); - if (err) - goto end_xattr_set_acl; - } + if (default_acl) { + err = hfsplus_set_posix_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - err = hfsplus_set_posix_acl(inode, type, acl); - -end_xattr_set_acl: - posix_acl_release(acl); + if (acl) { + if (!err) + err = hfsplus_set_posix_acl(inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } return err; } - -static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry, - char *list, - size_t list_size, - const char *name, - size_t name_len, - int type) -{ - /* - * This method is not used. - * It is used hfsplus_listxattr() instead of generic_listxattr(). 
- */ - return -EOPNOTSUPP; -} - -const struct xattr_handler hfsplus_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = hfsplus_xattr_list_posix_acl, - .get = hfsplus_xattr_get_posix_acl, - .set = hfsplus_xattr_set_posix_acl, -}; - -const struct xattr_handler hfsplus_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = hfsplus_xattr_list_posix_acl, - .get = hfsplus_xattr_get_posix_acl, - .set = hfsplus_xattr_set_posix_acl, -}; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 3c6136f98c7..bf88baa9bb6 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -7,6 +7,7 @@ */ #include "hfsplus_fs.h" +#include #include "xattr.h" #include "acl.h" @@ -15,8 +16,8 @@ const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - &hfsplus_xattr_acl_access_handler, - &hfsplus_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &hfsplus_xattr_security_handler, NULL diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 841b5698c0f..9e214490c31 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -14,8 +14,6 @@ extern const struct xattr_handler hfsplus_xattr_osx_handler; extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; -extern const struct xattr_handler hfsplus_xattr_acl_access_handler; -extern const struct xattr_handler hfsplus_xattr_acl_default_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; extern const struct xattr_handler *hfsplus_xattr_handlers[]; -- cgit v1.2.3-70-g09d2 From f2963d4551e7f500025d687586a25a09ea28941e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:47 -0800 Subject: jffs2: use generic posix ACL infrastructure Also don't bother to set up a .get_acl method for symlinks as we do not support access control (ACLs or even mode bits) for symlinks in Linux. 
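The inode-creation side of most of these conversions (ext3, ext4, f2fs, hfsplus, ocfs2, reiserfs) now follows one shape: posix_acl_create() fetches the parent's default ACL and fixes up the new inode's mode, and the filesystem then stores whatever ACLs come back. jffs2 is the odd one out in that jffs2_init_acl_pre() only caches the ACLs and writes them out later. The sketch below shows that common shape only; "myfs" and myfs_do_set_acl() are hypothetical stand-ins for a filesystem's low-level setter (__jffs2_set_acl(), __ext4_set_acl(), ...), not code from any patch in this series.

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Hypothetical low-level setter; a real filesystem writes the ACL xattr. */
static int myfs_do_set_acl(struct inode *inode, int type,
                           struct posix_acl *acl)
{
        return 0;               /* stub */
}

static int myfs_init_acl(struct inode *inode, struct inode *dir)
{
        struct posix_acl *default_acl, *acl;
        int error;

        /*
         * posix_acl_create() reads dir's default ACL and updates
         * inode->i_mode, either from that ACL or by applying the umask.
         */
        error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
        if (error)
                return error;

        if (default_acl) {
                error = myfs_do_set_acl(inode, ACL_TYPE_DEFAULT, default_acl);
                posix_acl_release(default_acl);
        }
        if (acl) {
                if (!error)
                        error = myfs_do_set_acl(inode, ACL_TYPE_ACCESS, acl);
                posix_acl_release(acl);
        }
        return error;
}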
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/jffs2/acl.c | 141 +++++------------------------------------------------ fs/jffs2/acl.h | 7 +-- fs/jffs2/dir.c | 1 + fs/jffs2/file.c | 1 + fs/jffs2/fs.c | 7 +-- fs/jffs2/symlink.c | 1 - fs/jffs2/xattr.c | 9 ++-- 7 files changed, 24 insertions(+), 143 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 4d6e31b1981..009ec0b5993 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -178,10 +178,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type) char *value = NULL; int rc, xprefix; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; @@ -232,13 +228,10 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int rc, xprefix; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; @@ -277,30 +270,21 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int rc; cache_no_acl(inode); - if (S_ISLNK(*i_mode)) - return 0; /* Symlink always has no-ACL */ - - acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) { - *i_mode &= ~current_umask(); - } else { - if (S_ISDIR(*i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - - rc = __posix_acl_create(&acl, GFP_KERNEL, i_mode); - if (rc < 0) - return rc; - if (rc > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl); + if (rc) + return rc; + if (default_acl) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } + if (acl) { + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); } return 0; @@ -324,106 +308,3 @@ int jffs2_init_acl_post(struct inode *inode) return 0; } - -int jffs2_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int rc; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - rc = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (rc) - return rc; - rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return rc; -} - -static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (list && retlen <= list_size) - strcpy(list, POSIX_ACL_XATTR_ACCESS); - return retlen; -} - -static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (list && retlen <= list_size) - strcpy(list, POSIX_ACL_XATTR_DEFAULT); - return retlen; -} - -static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct posix_acl *acl; - int rc; - - if (name[0] != '\0') - return -EINVAL; - - acl = jffs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - 
return -ENODATA; - rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return rc; -} - -static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct posix_acl *acl; - int rc; - - if (name[0] != '\0') - return -EINVAL; - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - rc = posix_acl_valid(acl); - if (rc) - goto out; - } - } else { - acl = NULL; - } - rc = jffs2_set_acl(dentry->d_inode, type, acl); - out: - posix_acl_release(acl); - return rc; -} - -const struct xattr_handler jffs2_acl_access_xattr_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_DEFAULT, - .list = jffs2_acl_access_listxattr, - .get = jffs2_acl_getxattr, - .set = jffs2_acl_setxattr, -}; - -const struct xattr_handler jffs2_acl_default_xattr_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = jffs2_acl_default_listxattr, - .get = jffs2_acl_getxattr, - .set = jffs2_acl_setxattr, -}; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9b477246f2a..2e2b5745c3b 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -27,17 +27,14 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type); -extern int jffs2_acl_chmod(struct inode *); +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); -extern const struct xattr_handler jffs2_acl_access_xattr_handler; -extern const struct xattr_handler jffs2_acl_default_xattr_handler; - #else #define jffs2_get_acl (NULL) -#define jffs2_acl_chmod(inode) (0) +#define jffs2_set_acl (NULL) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index e3aac222472..938556025d6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -59,6 +59,7 @@ const struct inode_operations jffs2_dir_inode_operations = .mknod = jffs2_mknod, .rename = jffs2_rename, .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 1506673c087..256cd19a3b7 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -66,6 +66,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 09b3ed45572..a69e426435d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,15 +190,16 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) { + struct inode *inode = dentry->d_inode; int rc; - rc = inode_change_ok(dentry->d_inode, iattr); + rc = inode_change_ok(inode, iattr); if (rc) return rc; - rc = jffs2_do_setattr(dentry->d_inode, iattr); + rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = jffs2_acl_chmod(dentry->d_inode); + rc = posix_acl_chmod(inode, inode->i_mode); return rc; } diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 6e563332bb2..c7c77b0dfcc 100644 --- 
a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -22,7 +22,6 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 3034e970eb9..ad0f2e2a170 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "nodelist.h" /* -------- xdatum related functions ---------------- @@ -921,8 +922,8 @@ const struct xattr_handler *jffs2_xattr_handlers[] = { &jffs2_security_xattr_handler, #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL - &jffs2_acl_access_xattr_handler, - &jffs2_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &jffs2_trusted_xattr_handler, NULL @@ -942,10 +943,10 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL case JFFS2_XPREFIX_ACL_ACCESS: - ret = &jffs2_acl_access_xattr_handler; + ret = &posix_acl_access_xattr_handler; break; case JFFS2_XPREFIX_ACL_DEFAULT: - ret = &jffs2_acl_default_xattr_handler; + ret = &posix_acl_default_xattr_handler; break; #endif case JFFS2_XPREFIX_TRUSTED: -- cgit v1.2.3-70-g09d2 From 702e5bc68ad2c02f1b12b53ef7093074af9d2441 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:48 -0800 Subject: ocfs2: use generic posix ACL infrastructure This contains some major refactoring for the create path so that inodes are created with the right mode to start with instead of fixing it up later. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ocfs2/acl.c | 234 ++---------------------------------------------- fs/ocfs2/acl.h | 13 +-- fs/ocfs2/file.c | 4 +- fs/ocfs2/namei.c | 25 +++++- fs/ocfs2/refcounttree.c | 19 +++- fs/ocfs2/xattr.c | 21 +++-- fs/ocfs2/xattr.h | 6 +- 7 files changed, 72 insertions(+), 250 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index c0f9d2fe134..555f4cddefe 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -160,36 +160,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, return acl; } - -/* - * Get posix acl. - */ -static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; - struct posix_acl *acl; - int ret; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - acl = ERR_PTR(ret); - return acl; - } - - acl = ocfs2_get_acl_nolock(inode, type, di_bh); - - ocfs2_inode_unlock(inode, 0); - - brelse(di_bh); - - return acl; -} - /* * Helper function to set i_mode in memory and disk. Some call paths * will not have di_bh or a journal handle to pass, in which case it @@ -250,7 +220,7 @@ out: /* * Set the access or default ACL of an inode. 
*/ -static int ocfs2_set_acl(handle_t *handle, +int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, @@ -313,6 +283,11 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); +} + struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) { struct ocfs2_super *osb; @@ -334,200 +309,3 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) return acl; } - -int ocfs2_acl_chmod(struct inode *inode) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, - acl, NULL, NULL); - posix_acl_release(acl); - return ret; -} - -/* - * Initialize the ACLs of a new inode. If parent directory has default ACL, - * then clone to new inode. Called from ocfs2_mknod. - */ -int ocfs2_init_acl(handle_t *handle, - struct inode *inode, - struct inode *dir, - struct buffer_head *di_bh, - struct buffer_head *dir_bh, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl = NULL; - int ret = 0, ret2; - umode_t mode; - - if (!S_ISLNK(inode->i_mode)) { - if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { - acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, - dir_bh); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) { - mode = inode->i_mode & ~current_umask(); - ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret) { - mlog_errno(ret); - goto cleanup; - } - } - } - if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = ocfs2_set_acl(handle, inode, di_bh, - ACL_TYPE_DEFAULT, acl, - meta_ac, data_ac); - if (ret) - goto cleanup; - } - mode = inode->i_mode; - ret = __posix_acl_create(&acl, GFP_NOFS, &mode); - if (ret < 0) - return ret; - - ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret2) { - mlog_errno(ret2); - ret = ret2; - goto cleanup; - } - if (ret > 0) { - ret = ocfs2_set_acl(handle, inode, - di_bh, ACL_TYPE_ACCESS, - acl, meta_ac, data_ac); - } - } -cleanup: - posix_acl_release(acl); - return ret; -} - -static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ 
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - struct posix_acl *acl; - int ret; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ocfs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return ret; -} - -static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto cleanup; - } - } else - acl = NULL; - - ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); - -cleanup: - posix_acl_release(acl); - return ret; -} - -const struct xattr_handler ocfs2_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ocfs2_xattr_list_acl_access, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; - -const struct xattr_handler ocfs2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ocfs2_xattr_list_acl_default, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 071fbd380f2..3fce68d0862 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -27,10 +27,13 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); -extern int ocfs2_acl_chmod(struct inode *); -extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, - struct buffer_head *, struct buffer_head *, - struct ocfs2_alloc_context *, - struct ocfs2_alloc_context *); +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6fff128cad1..014a38e9006 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1236,7 +1236,7 @@ bail: dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = ocfs2_acl_chmod(inode); + status = posix_acl_chmod(inode, inode->i_mode); if (status < 0) mlog_errno(status); } @@ -2661,6 +2661,7 @@ const struct inode_operations ocfs2_file_iops = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; const struct inode_operations ocfs2_special_file_iops = { @@ -2668,6 +2669,7 @@ const struct inode_operations ocfs2_special_file_iops = { .getattr = ocfs2_getattr, .permission = ocfs2_permission, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; /* diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4f791f6d27d..c975eed2e71 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -230,6 +230,7 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int 
did_block_signals = 0; + struct posix_acl *default_acl = NULL, *acl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -331,6 +332,12 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + status = posix_acl_create(dir, &mode, &default_acl, &acl); + if (status) { + mlog_errno(status); + goto leave; + } + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -379,8 +386,17 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, - meta_ac, data_ac); + if (default_acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_DEFAULT, default_acl, + meta_ac, data_ac); + } + if (!status && acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_ACCESS, acl, + meta_ac, data_ac); + } + if (status < 0) { mlog_errno(status); goto leave; @@ -419,6 +435,10 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) @@ -2504,4 +2524,5 @@ const struct inode_operations ocfs2_dir_iops = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 55767e1ba72..6ba4bcbc479 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -46,6 +46,7 @@ #include #include #include +#include struct ocfs2_cow_context { struct inode *inode; @@ -4268,11 +4269,20 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; + struct posix_acl *default_acl, *acl; + umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + mode = inode->i_mode; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_create_inode_in_orphan(dir, mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4303,11 +4313,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. 
*/ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name); + &new_dentry->d_name, + default_acl, acl); if (error) mlog_errno(error); } out: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f0a1326d9bb..185fa3b7f96 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -99,8 +99,8 @@ static struct ocfs2_xattr_def_value_root def_xv = { const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, - &ocfs2_xattr_acl_access_handler, - &ocfs2_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL @@ -109,9 +109,9 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = { static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] - = &ocfs2_xattr_acl_access_handler, + = &posix_acl_access_xattr_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] - = &ocfs2_xattr_acl_default_handler, + = &posix_acl_default_xattr_handler, [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; @@ -7190,10 +7190,12 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr) + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl) { - int ret = 0; struct buffer_head *dir_bh = NULL; + int ret = 0; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7207,9 +7209,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); - if (ret) - mlog_errno(ret); + if (!ret && default_acl) + ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (!ret && acl) + ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 19f134e896a..f10d5b93c36 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,8 +40,6 @@ struct ocfs2_security_xattr_info { extern const struct xattr_handler ocfs2_xattr_user_handler; extern const struct xattr_handler ocfs2_xattr_trusted_handler; extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler ocfs2_xattr_acl_access_handler; -extern const struct xattr_handler ocfs2_xattr_acl_default_handler; extern const struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); @@ -96,5 +94,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr); + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl); #endif /* OCFS2_XATTR_H */ -- cgit v1.2.3-70-g09d2 From 47f70d08facf288a9faad6e6c36ac2e670be8195 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:49 -0800 Subject: reiserfs: use generic posix ACL infrastructure Also don't bother to set up a .get_acl method for symlinks as we do not support access control (ACLs or even mode bits) for symlinks in Linux. 
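The chmod side is uniform as well: the per-filesystem foo_acl_chmod() helpers go away (reiserfs keeps reiserfs_acl_chmod() only as a thin wrapper because of its STAT_DATA_V1 and private-inode checks) and ->setattr calls posix_acl_chmod(), which rebuilds the access ACL for the new mode and stores it through ->set_acl. A minimal sketch of the calling convention, using hypothetical "myfs" names rather than code from any patch here:

#include <linux/fs.h>
#include <linux/posix_acl.h>

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = inode_change_ok(inode, attr);
        if (error)
                return error;

        setattr_copy(inode, attr);
        mark_inode_dirty(inode);

        /* Rebuild the ACL_TYPE_ACCESS ACL to match the new mode. */
        if (attr->ia_valid & ATTR_MODE)
                error = posix_acl_chmod(inode, inode->i_mode);
        return error;
}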
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/reiserfs/acl.h | 4 +- fs/reiserfs/file.c | 1 + fs/reiserfs/namei.c | 4 +- fs/reiserfs/xattr.c | 5 +- fs/reiserfs/xattr_acl.c | 182 +++++++----------------------------------------- 5 files changed, 35 insertions(+), 161 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index f096b80e73d..4a211f5b34b 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -48,18 +48,18 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct inode *inode); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode); int reiserfs_cache_default_acl(struct inode *dir); -extern const struct xattr_handler reiserfs_posix_acl_default_handler; -extern const struct xattr_handler reiserfs_posix_acl_access_handler; #else #define reiserfs_cache_default_acl(inode) 0 #define reiserfs_get_acl NULL +#define reiserfs_set_acl NULL static inline int reiserfs_acl_chmod(struct inode *inode) { diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index dcaafcfc23b..ed58d843d57 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -260,4 +260,5 @@ const struct inode_operations reiserfs_file_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index dc5236f6de1..e825f8b63e6 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1522,6 +1522,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; /* @@ -1538,8 +1539,6 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, - }; /* @@ -1553,4 +1552,5 @@ const struct inode_operations reiserfs_special_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8a9e2dcfe00..5cdfbd638b5 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -50,6 +50,7 @@ #include #include #include +#include #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" @@ -904,8 +905,8 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = { &reiserfs_xattr_security_handler, #endif #ifdef CONFIG_REISERFS_FS_POSIX_ACL - &reiserfs_posix_acl_access_handler, - &reiserfs_posix_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d95c9592327..a6ce532402d 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -11,35 +11,19 @@ #include "acl.h" #include -static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, +static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl); -static int -reiserfs_posix_acl_set(struct dentry *dentry, const char *name, const void *value, - 
size_t size, int flags, int type) + +int +reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; int error, error2; struct reiserfs_transaction_handle th; size_t jcreate_blocks; - if (!reiserfs_posixacl(inode->i_sb)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) { - return PTR_ERR(acl); - } else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; + int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; + /* Pessimism: We can't assume that anything from the xattr root up * has been created. */ @@ -51,7 +35,7 @@ reiserfs_posix_acl_set(struct dentry *dentry, const char *name, const void *valu error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { - error = reiserfs_set_acl(&th, inode, type, acl); + error = __reiserfs_set_acl(&th, inode, type, acl); reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); @@ -59,29 +43,6 @@ reiserfs_posix_acl_set(struct dentry *dentry, const char *name, const void *valu error = error2; } - release_and_out: - posix_acl_release(acl); - return error; -} - -static int -reiserfs_posix_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (!reiserfs_posixacl(dentry->d_sb)) - return -EOPNOTSUPP; - - acl = reiserfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - return error; } @@ -221,10 +182,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) int size; int retval; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -273,7 +230,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) * BKL held [before 2.5.x] */ static int -reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, +__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl) { char *name; @@ -281,9 +238,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -343,7 +297,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int err = 0; /* ACLs only get applied to files and directories */ @@ -363,37 +317,28 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, goto apply_umask; } - acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + if (default_acl) { + err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } if (acl) { - /* Copy the default ACL to the default ACL of a new directory */ - if (S_ISDIR(inode->i_mode)) { - err = reiserfs_set_acl(th, inode, 
ACL_TYPE_DEFAULT, - acl); - if (err) - goto cleanup; - } - - /* Now we reconcile the new ACL and the mode, - potentially modifying both */ - err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (err < 0) - return err; - - /* If we need an ACL.. */ - if (err > 0) - err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); - cleanup: + if (!err) + err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, + acl); posix_acl_release(acl); - } else { - apply_umask: - /* no ACL, apply umask */ - inode->i_mode &= ~current_umask(); } return err; + + apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current_umask(); + return err; } /* This is used to cache the default acl before a new object is created. @@ -442,84 +387,11 @@ int reiserfs_cache_default_acl(struct inode *inode) */ int reiserfs_acl_chmod(struct inode *inode) { - struct reiserfs_transaction_handle th; - struct posix_acl *acl; - size_t size; - int error; - if (IS_PRIVATE(inode)) return 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (get_inode_sd_version(inode) == STAT_DATA_V1 || - !reiserfs_posixacl(inode->i_sb)) { + !reiserfs_posixacl(inode->i_sb)) return 0; - } - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (!acl) - return 0; - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = __posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); - if (error) - return error; - - size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); - reiserfs_write_lock(inode->i_sb); - error = journal_begin(&th, inode->i_sb, size * 2); - reiserfs_write_unlock(inode->i_sb); - if (!error) { - int error2; - error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); - reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th, inode->i_sb, size * 2); - reiserfs_write_unlock(inode->i_sb); - if (error2) - error = error2; - } - posix_acl_release(acl); - return error; -} - -static size_t posix_acl_access_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!reiserfs_posixacl(dentry->d_sb)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; + return posix_acl_chmod(inode, inode->i_mode); } - -const struct xattr_handler reiserfs_posix_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = reiserfs_posix_acl_get, - .set = reiserfs_posix_acl_set, - .list = posix_acl_access_list, -}; - -static size_t posix_acl_default_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!reiserfs_posixacl(dentry->d_sb)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -const struct xattr_handler reiserfs_posix_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = reiserfs_posix_acl_get, - .set = reiserfs_posix_acl_set, - .list = posix_acl_default_list, -}; -- cgit v1.2.3-70-g09d2 From 2401dc2975fc5a33021dc8347ea82984c4707a08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:50 -0800 Subject: xfs: use generic posix ACL infrastructure Also don't bother to set up a .get_acl method for symlinks as we do not support access control (ACLs or even mode bits) for symlinks in Linux, and create inodes with the proper mode instead of fixing it up later. 
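The create-path half of the conversion is what "create inodes with the proper mode instead of fixing it up later" refers to: each converted filesystem now asks posix_acl_create() for the inherited ACLs and the adjusted mode before building the inode, then stores whatever ACLs remain. A simplified sketch of that pattern under hypothetical fs_* names (in the real call sites posix_acl_create() runs before the inode is allocated so the mode is already right at creation time; locking, journaling and failure cleanup are omitted):

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Filesystem-specific ->set_acl implementation, assumed to exist. */
int fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);

static int fs_apply_create_acls(struct inode *dir, struct inode *inode,
				umode_t *mode)
{
	struct posix_acl *default_acl, *acl;
	int error;

	/* Folds the directory's default ACL (or the umask when there is
	 * none) into *mode and returns the ACLs the new inode should
	 * carry; acl comes back NULL when the mode bits say it all. */
	error = posix_acl_create(dir, mode, &default_acl, &acl);
	if (error)
		return error;

	if (default_acl) {
		error = fs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
		posix_acl_release(default_acl);
	}
	if (acl) {
		if (!error)
			error = fs_set_acl(inode, acl, ACL_TYPE_ACCESS);
		posix_acl_release(acl);
	}
	return error;
}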
Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Al Viro --- fs/xfs/xfs_acl.c | 151 +++-------------------------------------------------- fs/xfs/xfs_acl.h | 9 +--- fs/xfs/xfs_iops.c | 41 ++++++++------- fs/xfs/xfs_iops.h | 2 +- fs/xfs/xfs_xattr.c | 4 +- 5 files changed, 36 insertions(+), 171 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 057ae2d502d..0ecec1896f2 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -124,16 +124,12 @@ struct posix_acl * xfs_get_acl(struct inode *inode, int type) { struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl; + struct posix_acl *acl = NULL; struct xfs_acl *xfs_acl; unsigned char *ea_name; int error; int len; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - trace_xfs_get_acl(ip); switch (type) { @@ -164,10 +160,8 @@ xfs_get_acl(struct inode *inode, int type) * cache entry, for any other error assume it is transient and * leave the cache entry as ACL_NOT_CACHED. */ - if (error == -ENOATTR) { - acl = NULL; + if (error == -ENOATTR) goto out_update_cache; - } goto out; } @@ -183,15 +177,12 @@ out: } STATIC int -xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct xfs_inode *ip = XFS_I(inode); unsigned char *ea_name; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: ea_name = SGI_ACL_FILE; @@ -282,131 +273,23 @@ posix_acl_default_exists(struct inode *inode) return xfs_acl_exists(inode, SGI_ACL_DEFAULT); } -/* - * No need for i_mutex because the inode is not yet exposed to the VFS. - */ int -xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) +xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - umode_t mode = inode->i_mode; - int error = 0, inherit = 0; - - if (S_ISDIR(inode->i_mode)) { - error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = __posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - return error; - - /* - * If __posix_acl_create returns a positive value we need to - * inherit a permission that can't be represented using the Unix - * mode bits and we actually need to set an ACL. 
- */ - if (error > 0) - inherit = 1; - - error = xfs_set_mode(inode, mode); - if (error) - goto out; - - if (inherit) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - -int -xfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} - -static int -xfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int error; - - acl = xfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return error; -} - -static int -xfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; int error = 0; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (!value) + if (!acl) goto set_acl; - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - error = -EINVAL; if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) - goto out_release; + return error; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { - posix_acl_release(acl); acl = NULL; if (error < 0) @@ -415,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, error = xfs_set_mode(inode, mode); if (error) - goto out_release; + return error; } set_acl: - error = xfs_set_acl(inode, type, acl); - out_release: - posix_acl_release(acl); - out: - return error; + return __xfs_set_acl(inode, type, acl); } - -const struct xattr_handler xfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; - -const struct xattr_handler xfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 4016a567b83..5dc16374451 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -60,20 +60,15 @@ struct xfs_acl { #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); -extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); -extern int xfs_acl_chmod(struct inode *inode); +extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); - -extern const struct xattr_handler xfs_xattr_acl_access_handler; -extern const struct xattr_handler xfs_xattr_acl_default_handler; 
#else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } -# define xfs_inherit_acl(inode, default_acl) 0 -# define xfs_acl_chmod(inode) 0 +# define xfs_set_acl NULL # define posix_acl_access_exists(inode) 0 # define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a3dad17b135..d47fbee3121 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -123,7 +123,7 @@ xfs_vn_mknod( { struct inode *inode; struct xfs_inode *ip = NULL; - struct posix_acl *default_acl = NULL; + struct posix_acl *default_acl, *acl; struct xfs_name name; int error; @@ -139,14 +139,9 @@ xfs_vn_mknod( rdev = 0; } - if (IS_POSIXACL(dir)) { - default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(default_acl)) - return PTR_ERR(default_acl); - - if (!default_acl) - mode &= ~current_umask(); - } + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; xfs_dentry_to_name(&name, dentry, mode); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); @@ -159,22 +154,30 @@ xfs_vn_mknod( if (unlikely(error)) goto out_cleanup_inode; +#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = -xfs_inherit_acl(inode, default_acl); - default_acl = NULL; - if (unlikely(error)) + error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) goto out_cleanup_inode; } - + if (acl) { + error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_cleanup_inode; + } +#endif d_instantiate(dentry, inode); + out_free_acl: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); return -error; out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); - out_free_acl: - posix_acl_release(default_acl); - return -error; + goto out_free_acl; } STATIC int @@ -672,7 +675,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { - error = -xfs_acl_chmod(inode); + error = -posix_acl_chmod(inode, inode->i_mode); if (error) return XFS_ERROR(error); } @@ -1041,6 +1044,7 @@ xfs_vn_fiemap( static const struct inode_operations xfs_inode_operations = { .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1068,6 +1072,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1094,6 +1099,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1107,7 +1113,6 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = kfree_put_link, - .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index d2c5057b5cc..1c34e433592 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -30,7 +30,7 @@ extern void xfs_setup_inode(struct xfs_inode *); /* * Internal setattr interfaces. 
*/ -#define XFS_ATTR_NOACL 0x01 /* Don't call xfs_acl_chmod */ +#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 9d479073ba4..78ed92a46fd 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -102,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, #ifdef CONFIG_XFS_POSIX_ACL - &xfs_xattr_acl_access_handler, - &xfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; -- cgit v1.2.3-70-g09d2 From 2cc6a5a01cdbeb0e46f3aa144819d5d7cee458a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:51 -0800 Subject: jfs: use generic posix ACL infrastructure Copy the scheme I introduced to btrfs many years ago to only use the xattr handler for ACLs, but pass plain attrs straight through. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Kleikamp Signed-off-by: Al Viro --- fs/jfs/acl.c | 105 +++++++++++++++++++++------------------------------ fs/jfs/file.c | 4 +- fs/jfs/jfs_acl.h | 7 +--- fs/jfs/jfs_xattr.h | 2 + fs/jfs/namei.c | 1 + fs/jfs/super.c | 2 + fs/jfs/xattr.c | 108 +++++++++++++++++++---------------------------------- 7 files changed, 89 insertions(+), 140 deletions(-) (limited to 'fs') diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 28d529ae9a4..e973b85d6af 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -72,7 +72,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) return acl; } -static int jfs_set_acl(tid_t tid, struct inode *inode, int type, +static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, struct posix_acl *acl) { char *ea_name; @@ -80,21 +80,22 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, int size = 0; char *value = NULL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - switch(type) { - case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? 
-EACCES : 0; - break; - default: - return -EINVAL; + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + rc = posix_acl_equiv_mode(acl, &inode->i_mode); + if (rc < 0) + return rc; + if (rc == 0) + acl = NULL; + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; } + if (acl) { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_KERNEL); @@ -114,65 +115,43 @@ out: return rc; } +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int rc; + tid_t tid; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = __jfs_set_acl(tid, inode, type, acl); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int rc = 0; - if (S_ISLNK(inode->i_mode)) - return 0; + rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (rc) + return rc; - acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); + if (default_acl) { + rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } if (acl) { - if (S_ISDIR(inode->i_mode)) { - rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); - if (rc) - goto cleanup; - } - rc = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (rc < 0) - goto cleanup; /* posix_acl_release(NULL) is no-op */ - if (rc > 0) - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); -cleanup: + if (!rc) + rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - } else - inode->i_mode &= ~current_umask(); + } JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; return rc; } - -int jfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int rc; - tid_t tid; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - rc = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (rc) - return rc; - - tid = txBegin(inode->i_sb, 0); - mutex_lock(&JFS_IP(inode)->commit_mutex); - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); - if (!rc) - rc = txCommit(tid, 1, &inode, 0); - txEnd(tid); - mutex_unlock(&JFS_IP(inode)->commit_mutex); - - posix_acl_release(acl); - return rc; -} diff --git a/fs/jfs/file.c b/fs/jfs/file.c index dd7442c5835..794da944d5c 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -19,6 +19,7 @@ #include #include +#include #include #include "jfs_incore.h" #include "jfs_inode.h" @@ -131,7 +132,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = jfs_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); return rc; } @@ -143,6 +144,7 @@ const struct inode_operations jfs_file_inode_operations = { .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index ad84fe50ca9..489f993b7b1 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -21,8 +21,8 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type); +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); -int jfs_acl_chmod(struct inode *inode); #else 
@@ -32,10 +32,5 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode, return 0; } -static inline int jfs_acl_chmod(struct inode *inode) -{ - return 0; -} - #endif #endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index e9e100fd7c0..e8d717dabca 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -61,6 +61,8 @@ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); extern int jfs_removexattr(struct dentry *, const char *); +extern const struct xattr_handler *jfs_xattr_handlers[]; + #ifdef CONFIG_JFS_SECURITY extern int jfs_init_security(tid_t, struct inode *, struct inode *, const struct qstr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index aa8a3370631..d59c7defb1e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1524,6 +1524,7 @@ const struct inode_operations jfs_dir_inode_operations = { .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 6669aa2042c..e2b7483444f 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -44,6 +44,7 @@ #include "jfs_imap.h" #include "jfs_acl.h" #include "jfs_debug.h" +#include "jfs_xattr.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); @@ -522,6 +523,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; + sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &dquot_quotactl_ops; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index d3472f4cd53..5324e4e2b99 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -665,82 +665,13 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, return 0; } -/* - * can_set_system_xattr - * - * This code is specific to the system.* namespace. It contains policy - * which doesn't belong in the main xattr codepath. - */ -static int can_set_system_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ -#ifdef CONFIG_JFS_POSIX_ACL - struct posix_acl *acl; - int rc; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - /* - * POSIX_ACL_XATTR_ACCESS is tied to i_mode - */ - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, value_len); - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - printk(KERN_ERR "posix_acl_from_xattr returned %d\n", - rc); - return rc; - } - if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - posix_acl_release(acl); - if (rc < 0) { - printk(KERN_ERR - "posix_acl_equiv_mode returned %d\n", - rc); - return rc; - } - mark_inode_dirty(inode); - } - /* - * We're changing the ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_ACCESS); - - return 0; - } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, value_len); - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - printk(KERN_ERR "posix_acl_from_xattr returned %d\n", - rc); - return rc; - } - posix_acl_release(acl); - - /* - * We're changing the default ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_DEFAULT); - - return 0; - } -#endif /* CONFIG_JFS_POSIX_ACL */ - return -EOPNOTSUPP; -} - /* * Most of the permission checking is done by xattr_permission in the vfs. 
- * The local file system is responsible for handling the system.* namespace. * We also need to verify that this is a namespace that we recognize. */ static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return can_set_system_xattr(inode, name, value, value_len); - if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { /* * This makes sure that we aren't trying to set an @@ -748,7 +679,7 @@ static int can_set_xattr(struct inode *inode, const char *name, * with "os2." */ if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) - return -EOPNOTSUPP; + return -EOPNOTSUPP; return 0; } @@ -913,6 +844,14 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, if ((rc = can_set_xattr(inode, name, value, value_len))) return rc; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, value_len, flags); + if (value == NULL) { /* empty EA, do not remove */ value = ""; value_len = 0; @@ -986,6 +925,14 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, { int err; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, data, buf_size); + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { /* * skip past "os2." prefix @@ -1077,6 +1024,14 @@ int jfs_removexattr(struct dentry *dentry, const char *name) if ((rc = can_set_xattr(inode, name, NULL, 0))) return rc; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); @@ -1088,6 +1043,19 @@ int jfs_removexattr(struct dentry *dentry, const char *name) return rc; } +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +const struct xattr_handler *jfs_xattr_handlers[] = { +#ifdef JFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + + #ifdef CONFIG_JFS_SECURITY static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) -- cgit v1.2.3-70-g09d2 From e01580bf9e4d0e3bbaead44bd46cdbfe61957732 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:52 -0800 Subject: gfs2: use generic posix ACL infrastructure This contains some major refactoring for the create path so that inodes are created with the right mode to start with instead of fixing it up later. 
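The mode-change half is equally mechanical across the series: the per-filesystem *_acl_chmod() helpers go away and setattr calls the generic posix_acl_chmod(), which rewrites the access ACL for the new mode and stores it through ->set_acl. A sketch of what a converted setattr roughly ends up looking like, with a hypothetical example_setattr that handles simple attribute changes only:

#include <linux/fs.h>
#include <linux/posix_acl.h>

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);

	/* New mode bits can invalidate the access ACL's mask and group
	 * entries; the generic helper adjusts the ACL accordingly. */
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(inode, inode->i_mode);
	return error;
}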
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/gfs2/acl.c | 234 ++++++++------------------------------------------------ fs/gfs2/acl.h | 4 +- fs/gfs2/inode.c | 34 ++++++-- fs/gfs2/xattr.c | 4 +- 4 files changed, 62 insertions(+), 214 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index e82e4ac574a..ba9456685f4 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -49,10 +49,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) if (!ip->i_eattr) return NULL; - acl = get_cached_acl(&ip->i_inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - name = gfs2_acl_name(type); if (name == NULL) return ERR_PTR(-EINVAL); @@ -80,7 +76,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode) return error; } -static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) +int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; int len; @@ -88,219 +84,49 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) const char *name = gfs2_acl_name(type); BUG_ON(name == NULL); - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - if (len == 0) - return 0; - data = kmalloc(len, GFP_NOFS); - if (data == NULL) - return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, data, len); - if (error < 0) - goto out; - error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); - if (!error) - set_cached_acl(inode, type, acl); -out: - kfree(data); - return error; -} - -int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct posix_acl *acl; - umode_t mode = inode->i_mode; - int error = 0; - - if (!sdp->sd_args.ar_posix_acl) - return 0; - if (S_ISLNK(inode->i_mode)) - return 0; - - acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) { - mode &= ~current_umask(); - return gfs2_set_mode(inode, mode); - } - - if (S_ISDIR(inode->i_mode)) { - error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = __posix_acl_create(&acl, GFP_NOFS, &mode); - if (error < 0) - return error; - if (error == 0) - goto munge; - - error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl); - if (error) - goto out; -munge: - error = gfs2_set_mode(inode, mode); -out: - posix_acl_release(acl); - return error; -} - -int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) -{ - struct inode *inode = &ip->i_inode; - struct posix_acl *acl; - char *data; - unsigned int len; - int error; - - acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return gfs2_setattr_simple(inode, attr); - - error = __posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); - if (error) - return error; - - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - data = kmalloc(len, GFP_NOFS); - error = -ENOMEM; - if (data == NULL) - goto out; - posix_acl_to_xattr(&init_user_ns, acl, data, len); - error = gfs2_xattr_acl_chmod(ip, attr, data); - kfree(data); - set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - -static int gfs2_acl_type(const char *name) -{ - if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0) - return ACL_TYPE_ACCESS; - if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0) - return ACL_TYPE_DEFAULT; - return -EINVAL; -} - -static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int xtype) -{ - struct inode *inode = 
dentry->d_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct posix_acl *acl; - int type; - int error; - - if (!sdp->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - - type = gfs2_acl_type(name); - if (type < 0) - return type; - - acl = gfs2_get_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, - int xtype) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct posix_acl *acl = NULL; - int error = 0, type; - - if (!sdp->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - - type = gfs2_acl_type(name); - if (type < 0) - return type; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!value) - goto set_acl; - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - - error = -EINVAL; if (acl->a_count > GFS2_ACL_MAX_ENTRIES) - goto out_release; + return -EINVAL; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; - if (error <= 0) { - posix_acl_release(acl); + if (error == 0) acl = NULL; - if (error < 0) - return error; - } - error = gfs2_set_mode(inode, mode); if (error) - goto out_release; + return error; } -set_acl: - error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); - if (!error) { - if (acl) - set_cached_acl(inode, type, acl); - else - forget_cached_acl(inode, type); + if (acl) { + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); + if (error < 0) + goto out; + } else { + data = NULL; + len = 0; } -out_release: - posix_acl_release(acl); + + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); + if (error) + goto out; + + if (acl) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); out: + kfree(data); return error; } - -const struct xattr_handler gfs2_xattr_system_handler = { - .prefix = XATTR_SYSTEM_PREFIX, - .flags = GFS2_EATYPE_SYS, - .get = gfs2_xattr_system_get, - .set = gfs2_xattr_system_set, -}; - diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 0da38dc7efe..301260c999b 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -17,8 +17,6 @@ #define GFS2_ACL_MAX_ENTRIES 25 extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); -extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); -extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); -extern const struct xattr_handler gfs2_xattr_system_handler; +extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7119504159f..d573125d102 100644 --- a/fs/gfs2/inode.c +++ 
b/fs/gfs2/inode.c @@ -552,6 +552,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unsigned int size, int excl, int *opened) { const struct qstr *name = &dentry->d_name; + struct posix_acl *default_acl, *acl; struct gfs2_holder ghs[2]; struct inode *inode = NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; @@ -611,10 +612,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!inode) goto fail_gunlock; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto fail_free_vfs_inode; + ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) - goto fail_free_inode; + goto fail_free_acls; inode->i_mode = mode; set_nlink(inode, S_ISDIR(mode) ? 2 : 1); @@ -682,7 +687,16 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); - error = gfs2_acl_create(dip, inode); + if (default_acl) { + error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + if (error) goto fail_gunlock3; @@ -716,6 +730,12 @@ fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); gfs2_rs_delete(ip, NULL); +fail_free_acls: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); +fail_free_vfs_inode: free_inode_nonrcu(inode); inode = NULL; fail_gunlock: @@ -1678,10 +1698,11 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) error = gfs2_setattr_size(inode, attr->ia_size); else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) error = setattr_chown(inode, attr); - else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) - error = gfs2_acl_chmod(ip, attr); - else + else { error = gfs2_setattr_simple(inode, attr); + if (!error && attr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(inode, inode->i_mode); + } out: if (!error) @@ -1841,6 +1862,7 @@ const struct inode_operations gfs2_file_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, }; const struct inode_operations gfs2_dir_iops = { @@ -1862,6 +1884,7 @@ const struct inode_operations gfs2_dir_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, .atomic_open = gfs2_atomic_open, }; @@ -1877,6 +1900,5 @@ const struct inode_operations gfs2_symlink_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, }; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 8c6a6f6bdba..0b81f783f78 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "gfs2.h" @@ -1500,7 +1501,8 @@ static const struct xattr_handler gfs2_xattr_security_handler = { const struct xattr_handler *gfs2_xattr_handlers[] = { &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, - &gfs2_xattr_system_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, NULL, }; -- cgit v1.2.3-70-g09d2 From 9e443bc369e04b8db3266d7253ce7c1eee2ec979 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 14 Nov 2013 21:15:13 +0000 Subject: um: hostfs: make functions static The hostfs_*() callback functions are all only used within hostfs_kern.c, so make them static. 
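This one is independent of the ACL work; the rationale is the usual case for internal linkage: functions reached only through a file-local operations table need no external symbols, and making them static lets gcc flag them if they become unused while quieting sparse's missing-declaration warnings. A trivial illustration with hypothetical names (not hostfs code):

#include <linux/fs.h>

/* Only referenced through the ops table below, so keep it file-local. */
static int example_readdir(struct file *file, struct dir_context *ctx)
{
	return 0;	/* stub */
}

static const struct file_operations example_dir_fops = {
	.iterate	= example_readdir,
	.read		= generic_read_dir,
};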
Signed-off-by: James Hogan Cc: Jeff Dike Cc: Richard Weinberger Cc: user-mode-linux-devel@lists.sourceforge.net Signed-off-by: Richard Weinberger --- fs/hostfs/hostfs_kern.c | 53 ++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index db23ce1bd90..fe649d325b1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -186,7 +186,7 @@ static struct inode *hostfs_iget(struct super_block *sb) return inode; } -int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) +static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) { /* * do_statfs uses struct statfs64 internally, but the linux kernel @@ -268,7 +268,7 @@ static const struct super_operations hostfs_sbops = { .show_options = hostfs_show_options, }; -int hostfs_readdir(struct file *file, struct dir_context *ctx) +static int hostfs_readdir(struct file *file, struct dir_context *ctx) { void *dir; char *name; @@ -293,7 +293,7 @@ int hostfs_readdir(struct file *file, struct dir_context *ctx) return 0; } -int hostfs_file_open(struct inode *ino, struct file *file) +static int hostfs_file_open(struct inode *ino, struct file *file) { static DEFINE_MUTEX(open_mutex); char *name; @@ -359,7 +359,8 @@ static int hostfs_file_release(struct inode *inode, struct file *file) return 0; } -int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +static int hostfs_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int ret; @@ -394,7 +395,7 @@ static const struct file_operations hostfs_dir_fops = { .read = generic_read_dir, }; -int hostfs_writepage(struct page *page, struct writeback_control *wbc) +static int hostfs_writepage(struct page *page, struct writeback_control *wbc) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; @@ -430,7 +431,7 @@ int hostfs_writepage(struct page *page, struct writeback_control *wbc) return err; } -int hostfs_readpage(struct file *file, struct page *page) +static int hostfs_readpage(struct file *file, struct page *page) { char *buffer; long long start; @@ -455,9 +456,9 @@ int hostfs_readpage(struct file *file, struct page *page) return err; } -int hostfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int hostfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_CACHE_SHIFT; @@ -467,9 +468,9 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping, return 0; } -int hostfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static int hostfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct inode *inode = mapping->host; void *buffer; @@ -549,8 +550,8 @@ static int read_name(struct inode *ino, char *name) return 0; } -int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { struct inode *inode; char *name; @@ -591,8 +592,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return 
error; } -struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, - unsigned int flags) +static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, + unsigned int flags) { struct inode *inode; char *name; @@ -628,7 +629,8 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, return ERR_PTR(err); } -int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) +static int hostfs_link(struct dentry *to, struct inode *ino, + struct dentry *from) { char *from_name, *to_name; int err; @@ -646,7 +648,7 @@ int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) return err; } -int hostfs_unlink(struct inode *ino, struct dentry *dentry) +static int hostfs_unlink(struct inode *ino, struct dentry *dentry) { char *file; int err; @@ -662,7 +664,8 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) +static int hostfs_symlink(struct inode *ino, struct dentry *dentry, + const char *to) { char *file; int err; @@ -674,7 +677,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) return err; } -int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) +static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; int err; @@ -686,7 +689,7 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) return err; } -int hostfs_rmdir(struct inode *ino, struct dentry *dentry) +static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) { char *file; int err; @@ -738,8 +741,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -int hostfs_rename(struct inode *from_ino, struct dentry *from, - struct inode *to_ino, struct dentry *to) +static int hostfs_rename(struct inode *from_ino, struct dentry *from, + struct inode *to_ino, struct dentry *to) { char *from_name, *to_name; int err; @@ -756,7 +759,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired) +static int hostfs_permission(struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; @@ -782,7 +785,7 @@ int hostfs_permission(struct inode *ino, int desired) return err; } -int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct hostfs_iattr attrs; -- cgit v1.2.3-70-g09d2 From 013cdf1088d7235da9477a2375654921d9b9ba9f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:53 -0800 Subject: nfs: use generic posix ACL infrastructure for v3 Posix ACLs This causes a small behaviour change in that we don't bother to set ACLs on file creation if the mode bit can express the access permissions fully, and thus behaving identical to local filesystems. 
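The "mode bits can express the access permissions fully" case is decided by posix_acl_equiv_mode(), which returns 0 exactly when an access ACL carries nothing beyond the owner/group/other bits; posix_acl_create() makes the equivalent decision on the create path and hands back a NULL access ACL in that case, which is why the converted code then has nothing to store or transmit. A minimal illustration with a hypothetical helper (not the NFS code; a negative error return from posix_acl_equiv_mode() is simply treated as "keep the ACL" here):

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Release and forget an access ACL that the mode bits already express;
 * otherwise hand it back unchanged. */
static struct posix_acl *example_drop_trivial_acl(struct posix_acl *acl,
						  umode_t *mode)
{
	if (acl && posix_acl_equiv_mode(acl, mode) == 0) {
		posix_acl_release(acl);
		return NULL;
	}
	return acl;
}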
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/nfs/inode.c | 4 - fs/nfs/nfs3acl.c | 291 +++++++++------------------------------------- fs/nfs/nfs3proc.c | 76 ++++++++---- fs/nfs/nfs3super.c | 3 + include/linux/nfs_fs.h | 24 ++-- include/linux/posix_acl.h | 4 + 6 files changed, 121 insertions(+), 281 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 00ad1c2b217..ecd11ba7f96 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1641,10 +1641,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; -#ifdef CONFIG_NFS_V3_ACL - nfsi->acl_access = ERR_PTR(-EAGAIN); - nfsi->acl_default = ERR_PTR(-EAGAIN); -#endif #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index e85967587d7..9a5ca03fa53 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -10,179 +10,7 @@ #define NFSDBG_FACILITY NFSDBG_PROC -ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int pos=0, len=0; - -# define output(s) do { \ - if (pos + sizeof(s) <= size) { \ - memcpy(buffer + pos, s, sizeof(s)); \ - pos += sizeof(s); \ - } \ - len += sizeof(s); \ - } while(0) - - acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_access"); - posix_acl_release(acl); - } - - if (S_ISDIR(inode->i_mode)) { - acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_default"); - posix_acl_release(acl); - } - } - -# undef output - - if (!buffer || len <= size) - return len; - return -ERANGE; -} - -ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error = 0; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = nfs3_proc_getacl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - if (type == ACL_TYPE_ACCESS && acl->a_count == 0) - error = -ENODATA; - else - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - } else - error = -ENODATA; - - return error; -} - -int nfs3_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = nfs3_proc_setacl(inode, type, acl); - posix_acl_release(acl); - - return error; -} - -int nfs3_removexattr(struct dentry *dentry, const char *name) -{ - struct inode *inode = dentry->d_inode; - int type; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - return nfs3_proc_setacl(inode, type, NULL); -} - -static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) -{ - if (!IS_ERR(nfsi->acl_access)) { - 
posix_acl_release(nfsi->acl_access); - nfsi->acl_access = ERR_PTR(-EAGAIN); - } - if (!IS_ERR(nfsi->acl_default)) { - posix_acl_release(nfsi->acl_default); - nfsi->acl_default = ERR_PTR(-EAGAIN); - } -} - -void nfs3_forget_cached_acls(struct inode *inode) -{ - dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, - inode->i_ino); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - spin_unlock(&inode->i_lock); -} - -static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) -{ - struct nfs_inode *nfsi = NFS_I(inode); - struct posix_acl *acl = ERR_PTR(-EINVAL); - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - acl = nfsi->acl_access; - break; - - case ACL_TYPE_DEFAULT: - acl = nfsi->acl_default; - break; - - default: - goto out; - } - if (IS_ERR(acl)) - acl = ERR_PTR(-EAGAIN); - else - acl = posix_acl_dup(acl); -out: - spin_unlock(&inode->i_lock); - dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, - inode->i_ino, type, acl); - return acl; -} - -static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, - inode->i_ino, acl, dfacl); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - if (!IS_ERR(acl)) - nfsi->acl_access = posix_acl_dup(acl); - if (!IS_ERR(dfacl)) - nfsi->acl_default = posix_acl_dup(dfacl); - spin_unlock(&inode->i_lock); -} - -struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFSACL_MAXPAGES] = { }; @@ -198,7 +26,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) .rpc_argp = &args, .rpc_resp = &res, }; - struct posix_acl *acl; int status, count; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) status = nfs_revalidate_inode(server, inode); if (status < 0) return ERR_PTR(status); - acl = nfs3_get_cached_acl(inode, type); - if (acl != ERR_PTR(-EAGAIN)) - return acl; - acl = NULL; /* * Only get the access acl when explicitly requested: We don't @@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) } if (res.acl_access != NULL) { - if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + if (posix_acl_equiv_mode(res.acl_access, NULL) || + res.acl_access->a_count == 0) { posix_acl_release(res.acl_access); res.acl_access = NULL; } } - nfs3_cache_acls(inode, - (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), - (res.mask & NFS_DFACL) ? 
res.acl_default : ERR_PTR(-EINVAL)); - switch(type) { - case ACL_TYPE_ACCESS: - acl = res.acl_access; - res.acl_access = NULL; - break; + if (res.mask & NFS_ACL) + set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); + else + forget_cached_acl(inode, ACL_TYPE_ACCESS); - case ACL_TYPE_DEFAULT: - acl = res.acl_default; - res.acl_default = NULL; + if (res.mask & NFS_DFACL) + set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); + else + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + nfs_free_fattr(res.fattr); + if (type == ACL_TYPE_ACCESS) { + posix_acl_release(res.acl_default); + return res.acl_access; + } else { + posix_acl_release(res.acl_access); + return res.acl_default; } getout: posix_acl_release(res.acl_access); posix_acl_release(res.acl_default); nfs_free_fattr(res.fattr); - - if (status != 0) { - posix_acl_release(acl); - acl = ERR_PTR(status); - } - return acl; + return ERR_PTR(status); } -static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr *fattr; @@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, fattr); - nfs3_cache_acls(inode, acl, dfacl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: @@ -373,33 +198,27 @@ out: return status; } -int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct posix_acl *alloc = NULL, *dfacl = NULL; int status; if (S_ISDIR(inode->i_mode)) { switch(type) { - case ACL_TYPE_ACCESS: - alloc = dfacl = nfs3_proc_getacl(inode, - ACL_TYPE_DEFAULT); - if (IS_ERR(alloc)) - goto fail; - break; - - case ACL_TYPE_DEFAULT: - dfacl = acl; - alloc = acl = nfs3_proc_getacl(inode, - ACL_TYPE_ACCESS); - if (IS_ERR(alloc)) - goto fail; - break; - - default: - return -EINVAL; + case ACL_TYPE_ACCESS: + alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; } - } else if (type != ACL_TYPE_ACCESS) - return -EINVAL; + } if (acl == NULL) { alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -417,24 +236,24 @@ fail: int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, umode_t mode) { - struct posix_acl *dfacl, *acl; - int error = 0; + struct posix_acl *default_acl, *acl; + int error; - dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(dfacl)) { - error = PTR_ERR(dfacl); + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) return (error == -EOPNOTSUPP) ? 0 : error; - } - if (!dfacl) - return 0; - acl = posix_acl_dup(dfacl); - error = __posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - goto out_release_dfacl; - error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? 
- dfacl : NULL); - posix_acl_release(acl); -out_release_dfacl: - posix_acl_release(dfacl); + + error = nfs3_proc_setacls(inode, acl, default_acl); + + if (acl) + posix_acl_release(acl); + if (default_acl) + posix_acl_release(default_acl); return error; } + +const struct xattr_handler *nfs3_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + NULL, +}; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 01b6f6a49d1..d2255d70542 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -317,8 +317,8 @@ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -340,7 +340,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = cpu_to_be32(current->pid); } - sattr->ia_mode &= ~current_umask(); + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -366,7 +368,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, } if (status != 0) - goto out; + goto out_release_acls; /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ @@ -385,9 +387,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out; + goto out_release_acls; } - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply create: %d\n", status); @@ -572,18 +579,20 @@ out: static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); - sattr->ia_mode &= ~current_umask(); - data = nfs3_alloc_createdata(); if (data == NULL) goto out; + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; data->arg.mkdir.fh = NFS_FH(dir); data->arg.mkdir.name = dentry->d_name.name; @@ -592,9 +601,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; + goto out_release_acls; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply mkdir: %d\n", status); @@ -691,19 +704,21 @@ static int nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current_umask(); - data = nfs3_alloc_createdata(); if (data == NULL) goto out; + status = posix_acl_create(dir, 
&sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; data->arg.mknod.fh = NFS_FH(dir); data->arg.mknod.name = dentry->d_name.name; @@ -731,8 +746,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + goto out_release_acls; + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply mknod: %d\n", status); @@ -904,20 +924,28 @@ static const struct inode_operations nfs3_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, + .listxattr = generic_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, +#ifdef CONFIG_NFS_V3_ACL + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif }; static const struct inode_operations nfs3_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, + .listxattr = generic_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, +#ifdef CONFIG_NFS_V3_ACL + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif }; const struct nfs_rpc_ops nfs_v3_clientops = { @@ -965,7 +993,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, - .clear_acl_cache = nfs3_forget_cached_acls, + .clear_acl_cache = forget_all_cached_acls, .close_context = nfs_close_context, .have_delegation = nfs3_have_delegation, .return_delegation = nfs3_return_delegation, diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index cc471c72523..d6a98949af1 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -12,6 +12,9 @@ static struct nfs_subversion nfs_v3 = { .rpc_vers = &nfs_version3, .rpc_ops = &nfs_v3_clientops, .sops = &nfs_sops, +#ifdef CONFIG_NFS_V3_ACL + .xattr = nfs3_xattr_handlers, +#endif }; static int __init init_nfs_v3(void) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 48997374eaf..2b00625952a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -154,10 +154,6 @@ struct nfs_inode { struct rb_root access_cache; struct list_head access_cache_entry_lru; struct list_head access_cache_inode_lru; -#ifdef CONFIG_NFS_V3_ACL - struct posix_acl *acl_access; - struct posix_acl *acl_default; -#endif /* * This is the cookie verifier used for NFSv3 readdir @@ -564,23 +560,17 @@ extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, * linux/fs/nfs3proc.c */ #ifdef CONFIG_NFS_V3_ACL -extern struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type); -extern int nfs3_proc_setacl(struct inode *inode, int type, - struct posix_acl *acl); -extern int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - umode_t mode); -extern void nfs3_forget_cached_acls(struct inode *inode); +extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); +extern int nfs3_set_acl(struct inode *inode, 
struct posix_acl *acl, int type); +extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl); +extern const struct xattr_handler *nfs3_xattr_handlers[]; #else -static inline int nfs3_proc_set_default_acl(struct inode *dir, - struct inode *inode, - umode_t mode) +static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) { return 0; } - -static inline void nfs3_forget_cached_acls(struct inode *inode) -{ -} #endif /* CONFIG_NFS_V3_ACL */ /* diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index f7e6f6cb214..3d14be8e14d 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -191,6 +191,10 @@ static inline int posix_acl_create(struct inode *inode, umode_t *mode, *default_acl = *acl = NULL; return 0; } + +static inline void forget_all_cached_acls(struct inode *inode) +{ +} #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_acl(struct inode *inode, int type); -- cgit v1.2.3-70-g09d2 From feda821e76f3bbbba4bd54d30b4d4005a7848aa5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:54 -0800 Subject: fs: remove generic_acl And instead convert tmpfs to use the new generic ACL code, with two stub methods provided for in-memory filesystems. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/Kconfig | 6 +- fs/Makefile | 1 - fs/generic_acl.c | 184 -------------------------------------------- fs/posix_acl.c | 36 +++++++++ include/linux/generic_acl.h | 14 ---- include/linux/posix_acl.h | 9 +++ mm/shmem.c | 57 ++++++-------- 7 files changed, 69 insertions(+), 238 deletions(-) delete mode 100644 fs/generic_acl.c delete mode 100644 include/linux/generic_acl.h (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index c229f828eb0..7385e54be4b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -68,10 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config GENERIC_ACL - bool - select FS_POSIX_ACL - menu "Caches" source "fs/fscache/Kconfig" @@ -119,7 +115,7 @@ config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" depends on TMPFS select TMPFS_XATTR - select GENERIC_ACL + select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support additional access rights for users and groups beyond the standard owner/group/world scheme, diff --git a/fs/Makefile b/fs/Makefile index f2c1843820e..5bebad4b01c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -44,7 +44,6 @@ obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ -obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o diff --git a/fs/generic_acl.c b/fs/generic_acl.c deleted file mode 100644 index 4357f39c844..00000000000 --- a/fs/generic_acl.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * (C) 2005 Andreas Gruenbacher - * - * This file is released under the GPL. - * - * Generic ACL support for in-memory filesystems. 
- */ - -#include -#include -#include -#include -#include -#include - - -static size_t -generic_acl_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - struct posix_acl *acl; - const char *xname; - size_t size; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return 0; - posix_acl_release(acl); - - switch (type) { - case ACL_TYPE_ACCESS: - xname = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - xname = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return 0; - } - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int -generic_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -generic_acl_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto failed; - switch (type) { - case ACL_TYPE_ACCESS: - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - goto failed; - inode->i_ctime = CURRENT_TIME; - if (error == 0) { - posix_acl_release(acl); - acl = NULL; - } - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - error = -EINVAL; - goto failed; - } - break; - } - } - set_cached_acl(inode, type, acl); - error = 0; -failed: - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_init - Take care of acl inheritance at @inode create time - * - * Files created inside a directory with a default ACL inherit the - * directory's default ACL. - */ -int -generic_acl_init(struct inode *inode, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int error; - - if (!S_ISLNK(inode->i_mode)) - acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); - if (acl) { - if (S_ISDIR(inode->i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = __posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - } else { - inode->i_mode &= ~current_umask(); - } - error = 0; - - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_chmod - change the access acl of @inode upon chmod() - * - * A chmod also changes the permissions of the owner, group/mask, and - * other ACL entries. 
- */ -int -generic_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - if (acl) { - error = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - } - return error; -} - -const struct xattr_handler generic_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; - -const struct xattr_handler generic_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8f245ab2014..f40df9b665f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -786,3 +786,39 @@ const struct xattr_handler posix_acl_default_xattr_handler = { .set = posix_acl_xattr_set, }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); + +int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return 0; + if (error == 0) + acl = NULL; + } + + inode->i_ctime = CURRENT_TIME; + set_cached_acl(inode, type, acl); + return 0; +} + +int simple_acl_create(struct inode *dir, struct inode *inode) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return 0; +} diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h deleted file mode 100644 index b6d657544ef..00000000000 --- a/include/linux/generic_acl.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef LINUX_GENERIC_ACL_H -#define LINUX_GENERIC_ACL_H - -#include - -struct inode; - -extern const struct xattr_handler generic_acl_access_handler; -extern const struct xattr_handler generic_acl_default_handler; - -int generic_acl_init(struct inode *, struct inode *); -int generic_acl_chmod(struct inode *); - -#endif /* LINUX_GENERIC_ACL_H */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 3d14be8e14d..6b12b3d57e9 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -99,6 +99,9 @@ extern int posix_acl_chmod(struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); +extern int simple_set_acl(struct inode *, struct posix_acl *, int); +extern int simple_acl_create(struct inode *, struct inode *); + static inline struct posix_acl **acl_by_type(struct inode *inode, int type) { switch (type) { @@ -181,6 +184,12 @@ static inline int posix_acl_chmod(struct inode *inode, umode_t mode) return 0; } +#define simple_set_acl NULL + +static inline int simple_acl_create(struct inode *dir, struct inode *inode) +{ + return 0; +} static inline void cache_no_acl(struct inode *inode) { } diff --git a/mm/shmem.c b/mm/shmem.c index 902a14842b7..b21ca543458 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt; #include #include #include -#include +#include #include #include #include @@ -620,10 +620,8 @@ static int shmem_setattr(struct 
dentry *dentry, struct iattr *attr) } setattr_copy(inode, attr); -#ifdef CONFIG_TMPFS_POSIX_ACL if (attr->ia_valid & ATTR_MODE) - error = generic_acl_chmod(inode); -#endif + error = posix_acl_chmod(inode, inode->i_mode); return error; } @@ -1937,22 +1935,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#endif + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - } + if (error && error != -EOPNOTSUPP) + goto out_iput; error = 0; dir->i_size += BOGO_DIRENT_SIZE; @@ -1961,6 +1951,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) dget(dentry); /* Extra count - pin the dentry in core */ } return error; +out_iput: + iput(inode); + return error; } static int @@ -1974,24 +1967,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - } -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#else - error = 0; -#endif + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; d_tmpfile(dentry, inode); } return error; +out_iput: + iput(inode); + return error; } static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -2223,8 +2209,8 @@ static int shmem_initxattrs(struct inode *inode, static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL - &generic_acl_access_handler, - &generic_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; @@ -2740,6 +2726,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, .removexattr = shmem_removexattr, + .set_acl = simple_set_acl, #endif }; @@ -2764,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; @@ -2776,6 +2764,7 @@ static const struct inode_operations shmem_special_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; -- cgit v1.2.3-70-g09d2 From 4ac7249ea5a0ceef9f8269f63f33cc873c3fac61 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:55 -0800 Subject: nfsd: use get_acl and ->set_acl Remove the boilerplate code to marshall and unmarhall ACL objects into xattrs and operate on the posix_acl objects directly. Also move all the ACL handling code into nfs?acl.c where it belongs. 
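Stripped of the nfsd-specific error mapping, the conversion below reduces to one pattern: read ACLs with get_acl() and write them through the inode's own ->set_acl() method, letting the filesystem do its xattr marshalling, instead of hand-encoding posix_acl_xattr blobs for vfs_getxattr()/vfs_setxattr(). A condensed sketch of that pattern (the sketch_* helpers are illustrative, not the literal nfsd code; translation to nfserr codes and the fh_want_write()/fh_drop_write() bracketing are omitted):

        /* Read the access ACL; when nothing is stored, fall back to an ACL
         * derived from the mode bits, as the GETACL handlers below do for
         * the Solaris-style minimum ACL. The caller releases the result
         * with posix_acl_release(). */
        static struct posix_acl *sketch_get_access_acl(struct inode *inode)
        {
                struct posix_acl *acl = get_acl(inode, ACL_TYPE_ACCESS);

                if (IS_ERR(acl))
                        return acl;
                if (!acl)
                        acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
                return acl;
        }

        /* Write an ACL through the filesystem's own handler; the xattr
         * marshalling that used to live in nfsd now happens behind
         * ->set_acl(). */
        static int sketch_set_access_acl(struct inode *inode, struct posix_acl *acl)
        {
                if (!IS_POSIXACL(inode) || !inode->i_op->set_acl)
                        return -EOPNOTSUPP;
                return inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
        }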
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/nfsd/acl.h | 16 ++-- fs/nfsd/nfs2acl.c | 72 +++++++++------- fs/nfsd/nfs3acl.c | 62 +++++++------- fs/nfsd/nfs4acl.c | 120 +++++++++++++++++++------- fs/nfsd/nfs4proc.c | 1 + fs/nfsd/vfs.c | 241 ----------------------------------------------------- fs/nfsd/vfs.h | 8 -- 7 files changed, 173 insertions(+), 347 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 8b186a4955c..8b68218e2c1 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -35,7 +35,9 @@ #ifndef LINUX_NFS4_ACL_H #define LINUX_NFS4_ACL_H -#include +struct nfs4_acl; +struct svc_fh; +struct svc_rqst; /* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to * fit in a page: */ @@ -45,13 +47,9 @@ struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); int nfs4_acl_write_who(int who, char *p); -#define NFS4_ACL_TYPE_DEFAULT 0x01 -#define NFS4_ACL_DIR 0x02 -#define NFS4_ACL_OWNER 0x04 - -struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, - struct posix_acl *, unsigned int flags); -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, - struct posix_acl **, unsigned int flags); +int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl); +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl); #endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 95d76dc6c5d..11c1fba2931 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -30,8 +30,9 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) { - svc_fh *fh; struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; __be32 nfserr = 0; dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); @@ -41,6 +42,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); + inode = fh->fh_dentry->d_inode; + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; @@ -50,21 +53,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, goto fail; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ - - struct inode *inode = fh->fh_dentry->d_inode; acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } resp->acl_access = acl; @@ -72,17 +67,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! 
*/ - - acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } resp->acl_default = acl; } @@ -103,31 +91,51 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, struct nfsd3_setaclargs *argp, struct nfsd_attrstat *resp) { + struct inode *inode; svc_fh *fh; __be32 nfserr = 0; + int error; dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_ACCESS, argp->acl_access) ); - } - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_DEFAULT, argp->acl_default) ); - } - if (!nfserr) { - nfserr = fh_getattr(fh, &resp->stat); + inode = fh->fh_dentry->d_inode; + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; } + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + if (error) + goto out_drop_write; + + fh_drop_write(fh); + + nfserr = fh_getattr(fh, &resp->stat); + +out: /* argp->acl_{access,default} may have been allocated in nfssvc_decode_setaclargs. */ posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); return nfserr; +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); + goto out; } /* diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9cbc1a841f8..adc5f1b1dc2 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -29,8 +29,9 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) { - svc_fh *fh; struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); @@ -38,26 +39,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); + inode = fh->fh_dentry->d_inode; + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ - - struct inode *inode = fh->fh_dentry->d_inode; acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } resp->acl_access = acl; @@ -65,17 +60,10 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! 
*/ - - acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } resp->acl_default = acl; } @@ -96,21 +84,37 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, struct nfsd3_setaclargs *argp, struct nfsd3_attrstat *resp) { + struct inode *inode; svc_fh *fh; __be32 nfserr = 0; + int error; fh = fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_ACCESS, argp->acl_access) ); - } - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + inode = fh->fh_dentry->d_inode; + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; } + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); +out: /* argp->acl_{access,default} may have been allocated in nfs3svc_decode_setaclargs. */ posix_acl_release(argp->acl_access); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 8a50b3c1809..649ad7cf220 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -37,8 +37,13 @@ #include #include #include +#include "nfsfh.h" #include "acl.h" +#include "vfs.h" +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 /* mode bit translations: */ #define NFS4_READ_MODE (NFS4_ACE_READ_DATA) @@ -130,36 +135,50 @@ static short ace2type(struct nfs4_ace *); static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); -struct nfs4_acl * -nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, - unsigned int flags) +int +nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl) { - struct nfs4_acl *acl; + struct inode *inode = dentry->d_inode; + int error = 0; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; int size = 0; - if (pacl) { - if (posix_acl_valid(pacl) < 0) - return ERR_PTR(-EINVAL); - size += 2*pacl->a_count; + pacl = get_acl(inode, ACL_TYPE_ACCESS); + if (!pacl) { + pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(pacl)) + return PTR_ERR(pacl); + /* allocate for worst case: one (deny, allow) pair each: */ + size += 2 * pacl->a_count; } - if (dpacl) { - if (posix_acl_valid(dpacl) < 0) - return ERR_PTR(-EINVAL); - size += 2*dpacl->a_count; + + if (S_ISDIR(inode->i_mode)) { + flags = NFS4_ACL_DIR; + dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (dpacl) + size += 2 * dpacl->a_count; + } else { + dpacl = NULL; } - /* Allocate for worst case: one (deny, allow) pair each: */ - acl = nfs4_acl_new(size); - if (acl == NULL) - return ERR_PTR(-ENOMEM); + *acl = nfs4_acl_new(size); + if (*acl == NULL) { + error = -ENOMEM; + goto out; + } if (pacl) - _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); if (dpacl) - _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); - return acl; + out: 
+ posix_acl_release(pacl); + posix_acl_release(dpacl); + return error; } struct posix_acl_summary { @@ -719,8 +738,9 @@ static void process_one_v4_ace(struct posix_acl_state *state, } } -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, - struct posix_acl **dpacl, unsigned int flags) +static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, + struct posix_acl **pacl, struct posix_acl **dpacl, + unsigned int flags) { struct posix_acl_state effective_acl_state, default_acl_state; struct nfs4_ace *ace; @@ -780,6 +800,57 @@ out_estate: return ret; } +__be32 +nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl) +{ + __be32 error; + int host_error; + struct dentry *dentry; + struct inode *inode; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + /* Get inode */ + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) + return nfserr_attrnotsupp; + + if (S_ISDIR(inode->i_mode)) + flags = NFS4_ACL_DIR; + + host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + if (host_error == -EINVAL) + return nfserr_attrnotsupp; + if (host_error < 0) + goto out_nfserr; + + host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS); + if (host_error < 0) + goto out_release; + + if (S_ISDIR(inode->i_mode)) { + host_error = inode->i_op->set_acl(inode, dpacl, + ACL_TYPE_DEFAULT); + } + +out_release: + posix_acl_release(pacl); + posix_acl_release(dpacl); +out_nfserr: + if (host_error == -EOPNOTSUPP) + return nfserr_attrnotsupp; + else + return nfserrno(host_error); +} + + static short ace2type(struct nfs4_ace *ace) { @@ -798,9 +869,6 @@ ace2type(struct nfs4_ace *ace) return -1; } -EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); -EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); - struct nfs4_acl * nfs4_acl_new(int n) { @@ -862,7 +930,3 @@ nfs4_acl_write_who(int who, char *p) BUG(); return -1; } - -EXPORT_SYMBOL(nfs4_acl_new); -EXPORT_SYMBOL(nfs4_acl_get_whotype); -EXPORT_SYMBOL(nfs4_acl_write_who); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 419572f33b7..825b8a99b99 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -41,6 +41,7 @@ #include "vfs.h" #include "current_stateid.h" #include "netns.h" +#include "acl.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7eea63cada1..1426eb66c8c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -468,158 +468,7 @@ out: return err; } -#if defined(CONFIG_NFSD_V2_ACL) || \ - defined(CONFIG_NFSD_V3_ACL) || \ - defined(CONFIG_NFSD_V4) -static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) -{ - ssize_t buflen; - ssize_t ret; - - buflen = vfs_getxattr(dentry, key, NULL, 0); - if (buflen <= 0) - return buflen; - - *buf = kmalloc(buflen, GFP_KERNEL); - if (!*buf) - return -ENOMEM; - - ret = vfs_getxattr(dentry, key, *buf, buflen); - if (ret < 0) - kfree(*buf); - return ret; -} -#endif - #if defined(CONFIG_NFSD_V4) -static int -set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) -{ - int len; - size_t buflen; - char *buf = NULL; - int error = 0; - - buflen = posix_acl_xattr_size(pacl->a_count); - buf = kmalloc(buflen, GFP_KERNEL); - error = -ENOMEM; - if (buf == NULL) - goto out; - - len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen); - if (len < 0) { - error = len; - goto out; - } - - error = vfs_setxattr(dentry, key, buf, len, 0); -out: - kfree(buf); 
- return error; -} - -__be32 -nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfs4_acl *acl) -{ - __be32 error; - int host_error; - struct dentry *dentry; - struct inode *inode; - struct posix_acl *pacl = NULL, *dpacl = NULL; - unsigned int flags = 0; - - /* Get inode */ - error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); - if (error) - return error; - - dentry = fhp->fh_dentry; - inode = dentry->d_inode; - if (S_ISDIR(inode->i_mode)) - flags = NFS4_ACL_DIR; - - host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); - if (host_error == -EINVAL) { - return nfserr_attrnotsupp; - } else if (host_error < 0) - goto out_nfserr; - - host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); - if (host_error < 0) - goto out_release; - - if (S_ISDIR(inode->i_mode)) - host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); - -out_release: - posix_acl_release(pacl); - posix_acl_release(dpacl); -out_nfserr: - if (host_error == -EOPNOTSUPP) - return nfserr_attrnotsupp; - else - return nfserrno(host_error); -} - -static struct posix_acl * -_get_posix_acl(struct dentry *dentry, char *key) -{ - void *buf = NULL; - struct posix_acl *pacl = NULL; - int buflen; - - buflen = nfsd_getxattr(dentry, key, &buf); - if (!buflen) - buflen = -ENODATA; - if (buflen <= 0) - return ERR_PTR(buflen); - - pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen); - kfree(buf); - return pacl; -} - -int -nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) -{ - struct inode *inode = dentry->d_inode; - int error = 0; - struct posix_acl *pacl = NULL, *dpacl = NULL; - unsigned int flags = 0; - - pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); - if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) - pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); - if (IS_ERR(pacl)) { - error = PTR_ERR(pacl); - pacl = NULL; - goto out; - } - - if (S_ISDIR(inode->i_mode)) { - dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); - if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) - dpacl = NULL; - else if (IS_ERR(dpacl)) { - error = PTR_ERR(dpacl); - dpacl = NULL; - goto out; - } - flags = NFS4_ACL_DIR; - } - - *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); - if (IS_ERR(*acl)) { - error = PTR_ERR(*acl); - *acl = NULL; - } - out: - posix_acl_release(pacl); - posix_acl_release(dpacl); - return error; -} - /* * NFS junction information is stored in an extended attribute. 
*/ @@ -2284,93 +2133,3 @@ out_nomem: nfsd_racache_shutdown(); return -ENOMEM; } - -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -struct posix_acl * -nfsd_get_posix_acl(struct svc_fh *fhp, int type) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - char *name; - void *value = NULL; - ssize_t size; - struct posix_acl *acl; - - if (!IS_POSIXACL(inode)) - return ERR_PTR(-EOPNOTSUPP); - - switch (type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return ERR_PTR(-EOPNOTSUPP); - } - - size = nfsd_getxattr(fhp->fh_dentry, name, &value); - if (size < 0) - return ERR_PTR(size); - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - kfree(value); - return acl; -} - -int -nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - char *name; - void *value = NULL; - size_t size; - int error; - - if (!IS_POSIXACL(inode) || - !inode->i_op->setxattr || !inode->i_op->removexattr) - return -EOPNOTSUPP; - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return -EOPNOTSUPP; - } - - if (acl && acl->a_count) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); - if (!value) - return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (error < 0) - goto getout; - size = error; - } else - size = 0; - - error = fh_want_write(fhp); - if (error) - goto getout; - if (size) - error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); - else { - if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) - error = 0; - else { - error = vfs_removexattr(fhp->fh_dentry, name); - if (error == -ENODATA) - error = 0; - } - } - fh_drop_write(fhp); - -getout: - kfree(value); - return error; -} -#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a4be2e38967..1bc1d440a1a 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -52,9 +52,6 @@ __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, struct iattr *, int, time_t); int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 -__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, - struct nfs4_acl *); -int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); #endif /* CONFIG_NFSD_V4 */ @@ -101,11 +98,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); -int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); -#endif - static inline int fh_want_write(struct svc_fh *fh) { int ret = mnt_want_write(fh->fh_export->ex_path.mnt); -- cgit v1.2.3-70-g09d2 From 2796e4cec525a2b1cace3b29b2f02735dafea007 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:56 -0800 Subject: hfsplus: remove can_set_xattr When using the per-superblock xattr handlers permission checking is done by the generic code. hfsplus just needs to check for the magic osx attribute not to leak into protected namespaces. Also given that the code was obviously copied from JFS the proper attribution was missing. 
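The only filtering hfsplus still has to do itself is in the "osx." handler: a name that, with the prefix stripped, belongs to a reserved namespace must be rejected, so the un-prefixed namespace cannot be used to smuggle attributes past the generic permission checks. A sketch of the check being relied on (sketch_is_known_namespace() is an assumed reconstruction; the real is_known_namespace() already exists in fs/hfsplus/xattr.c and only its signature is visible in the hunk context below):

        /* XATTR_*_PREFIX / *_PREFIX_LEN come from <linux/xattr.h> */
        static bool sketch_is_known_namespace(const char *name)
        {
                return !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
                       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
                       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
                       !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
        }

With that in place, hfsplus_osx_setxattr() (see the hunk below) simply returns -EOPNOTSUPP when is_known_namespace(name) is true, and everything else is left to the per-superblock xattr handler machinery.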
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/hfsplus/xattr.c | 87 ++---------------------------------------------------- 1 file changed, 3 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index bf88baa9bb6..0b4a5c9b93c 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -52,82 +52,6 @@ static inline int is_known_namespace(const char *name) return true; } -static int can_set_system_xattr(struct inode *inode, const char *name, - const void *value, size_t size) -{ -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - struct posix_acl *acl; - int err; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - /* - * POSIX_ACL_XATTR_ACCESS is tied to i_mode - */ - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - posix_acl_release(acl); - if (err < 0) - return err; - mark_inode_dirty(inode); - } - /* - * We're changing the ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_ACCESS); - - return 0; - } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - posix_acl_release(acl); - - /* - * We're changing the default ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_DEFAULT); - - return 0; - } -#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ - return -EOPNOTSUPP; -} - -static int can_set_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return can_set_system_xattr(inode, name, value, value_len); - - if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { - /* - * This makes sure that we aren't trying to set an - * attribute in a different namespace by prefixing it - * with "osx." - */ - if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN)) - return -EOPNOTSUPP; - - return 0; - } - - /* - * Don't allow setting an attribute in an unknown namespace. 
- */ - if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && - strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && - strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EOPNOTSUPP; - - return 0; -} - static void hfsplus_init_header_node(struct inode *attr_file, u32 clump_size, char *buf, u16 node_size) @@ -350,10 +274,6 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - err = can_set_xattr(inode, name, value, size); - if (err) - return err; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN) == 0) name += XATTR_MAC_OSX_PREFIX_LEN; @@ -841,10 +761,6 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; - err = can_set_xattr(inode, name, NULL, 0); - if (err) - return err; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN) == 0) name += XATTR_MAC_OSX_PREFIX_LEN; @@ -941,6 +857,9 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, if (len > HFSPLUS_ATTR_MAX_STRLEN) return -EOPNOTSUPP; + if (is_known_namespace(name)) + return -EOPNOTSUPP; + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); -- cgit v1.2.3-70-g09d2 From 9fe55eea7e4b444bafc42fa0000cc2d1d2847275 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 24 Jan 2014 14:42:22 +0000 Subject: Fix race when checking i_size on direct i/o read So far I've had one ACK for this, and no other comments. So I think it is probably time to send this via some suitable tree. I'm guessing that the vfs tree would be the most appropriate route, but not sure that there is one at the moment (don't see anything recent at kernel.org) so in that case I think -mm is the "back up plan". Al, please let me know if you will take this? Steve. --------------------- Following on from the "Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes" thread on linux-fsdevel, this patch is my current version of the fix proposed as option (b) in that thread. Removing the i_size test from the direct i/o read path at vfs level means that filesystems now have to deal with requests which are beyond i_size themselves. These I've divided into three sets: a) Those with "no op" ->direct_IO (9p, cifs, ceph) These are obviously not going to be an issue b) Those with "home brew" ->direct_IO (nfs, fuse) I've been told that NFS should not have any problem with the larger i_size, however I've added an extra test to FUSE to duplicate the original behaviour just to be on the safe side. c) Those using __blockdev_direct_IO() These call through to ->get_block() which should deal with the EOF condition correctly. I've verified that with GFS2 and I believe that Zheng has verified it for ext4. I've also run the test on XFS and it passes both before and after this change. The part of the patch in filemap.c looks a lot larger than it really is - there are only two lines of real change. The rest is just indentation of the contained code. There remains a test of i_size though, which was added for btrfs. It doesn't cause the other filesystems a problem as the test is performed after ->direct_IO has been called. It is possible that there is a race that does matter to btrfs, however this patch doesn't change that, so its still an overall improvement. 
Signed-off-by: Steven Whitehouse Reported-by: Zheng Liu Cc: Jan Kara Cc: Dave Chinner Acked-by: Miklos Szeredi Cc: Chris Mason Cc: Josef Bacik Cc: Christoph Hellwig Cc: Alexander Viro Signed-off-by: Al Viro --- fs/fuse/file.c | 3 +++ mm/filemap.c | 42 ++++++++++++++++++++---------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7e70506297b..89fdfd1919a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2710,6 +2710,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, inode = file->f_mapping->host; i_size = i_size_read(inode); + if ((rw == READ) && (offset > i_size)) + return 0; + /* optimization for short read */ if (async_dio && rw != WRITE && offset + count > i_size) { if (offset >= i_size) diff --git a/mm/filemap.c b/mm/filemap.c index b7749a92021..01842867c9d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; /* skip atime */ size = i_size_read(inode); - if (pos < size) { - retval = filemap_write_and_wait_range(mapping, pos, + retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); - if (!retval) { - retval = mapping->a_ops->direct_IO(READ, iocb, - iov, pos, nr_segs); - } - if (retval > 0) { - *ppos = pos + retval; - count -= retval; - } + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); + } + if (retval > 0) { + *ppos = pos + retval; + count -= retval; + } - /* - * Btrfs can have a short DIO read if we encounter - * compressed extents, so if there was an error, or if - * we've already read everything we wanted to, or if - * there was a short read because we hit EOF, go ahead - * and return. Otherwise fallthrough to buffered io for - * the rest of the read. - */ - if (retval < 0 || !count || *ppos >= size) { - file_accessed(filp); - goto out; - } + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. + */ + if (retval < 0 || !count || *ppos >= size) { + file_accessed(filp); + goto out; } } -- cgit v1.2.3-70-g09d2 From 260a459d2e39761fbd39803497205ce1690bc7b1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 20 Jan 2014 15:26:15 -0800 Subject: vfs: Is mounted should be testing mnt_ns for NULL or error. A bug was introduced with the is_mounted helper function in commit f7a99c5b7c8bd3d3f533c8b38274e33f3da9096e Author: Al Viro Date: Sat Jun 9 00:59:08 2012 -0400 get rid of ->mnt_longterm it's enough to set ->mnt_ns of internal vfsmounts to something distinct from all struct mnt_namespace out there; then we can just use the check for ->mnt_ns != NULL in the fast path of mntput_no_expire() Signed-off-by: Al Viro The intent was to test if the real_mount(vfsmount)->mnt_ns was NULL_OR_ERR but the code is actually testing real_mount(vfsmount) and always returning true. The result is d_absolute_path returning paths it should be hiding. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Al Viro --- fs/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index d64c594be6c..a17458ca6f2 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -74,7 +74,7 @@ static inline int mnt_has_parent(struct mount *mnt) static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? */ - return !IS_ERR_OR_NULL(real_mount(mnt)); + return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); -- cgit v1.2.3-70-g09d2 From a8323da0366d3398eda62741d2ac1130c8a172ed Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 20 Jan 2014 15:43:25 -0800 Subject: vfs: Remove second variable named error in __dentry_path In commit 232d2d60aa5469bb097f55728f65146bd49c1d25 Author: Waiman Long Date: Mon Sep 9 12:18:13 2013 -0400 dcache: Translating dentry into pathname without taking rename_lock The __dentry_path locking was changed and the variable error was intended to be moved outside of the loop. Unfortunately the inner declaration of error was not removed. Resulting in a version of __dentry_path that will never return an error. Remove the problematic inner declaration of error and allow __dentry_path to return errors once again. Cc: stable@vger.kernel.org Cc: Waiman Long Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/dcache.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index cb4a1069086..fdbe2302781 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3135,7 +3135,6 @@ restart: read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { struct dentry *parent = dentry->d_parent; - int error; prefetch(parent); error = prepend_name(&end, &len, &dentry->d_name); -- cgit v1.2.3-70-g09d2 From f6500801522c61782d4990fa1ad96154cb397cd4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2014 12:37:55 -0500 Subject: __dentry_path() fixes * we need to save the starting point for restarts * reject pathologically short buffers outright Spotted-by: Denys Vlasenko Spotted-by: Oleg Nesterov Signed-off-by: Al Viro --- fs/dcache.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index fdbe2302781..265e0ce9769 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3116,19 +3116,22 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) /* * Write full pathname from the root of the filesystem into the buffer. 
*/ -static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) +static char *__dentry_path(struct dentry *d, char *buf, int buflen) { + struct dentry *dentry; char *end, *retval; int len, seq = 0; int error = 0; + if (buflen < 2) + goto Elong; + rcu_read_lock(); restart: + dentry = d; end = buf + buflen; len = buflen; prepend(&end, &len, "\0", 1); - if (buflen < 1) - goto Elong; /* Get '/' right */ retval = end-1; *retval = '/'; -- cgit v1.2.3-70-g09d2 From 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 26 Jan 2014 23:53:43 -0600 Subject: [CIFS] Fix SMB2 mounts so they don't try to set or get xattrs via cifs When mounting with smb2 (or smb2.1 or smb3) we need to check to make sure that attempts to query or set extended attributes do not attempt to send the request with the older cifs protocol instead (eventually we also need to add the support in SMB2 to query/set extended attributes but this patch prevents us from using the wrong protocol for extended attribute operations). Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 6 ++++++ fs/cifs/xattr.c | 49 ++++++++++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 61228b7f6b6..a245d1809ed 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -389,6 +389,12 @@ struct smb_version_operations { struct cifsFileInfo *target_file, u64 src_off, u64 len, u64 dest_off); int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); + ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, + const unsigned char *, const unsigned char *, char *, + size_t, const struct nls_table *, int); + int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, + const char *, const void *, const __u16, + const struct nls_table *, int); }; struct smb_version_values { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 09afda4cc58..95c43bb2033 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -82,9 +82,11 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) goto remove_ea_exit; ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, - (__u16)0, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, NULL, (__u16)0, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } remove_ea_exit: kfree(full_path); @@ -149,18 +151,22 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, - (__u16)value_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. 
prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, - (__u16)value_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, strlen(CIFS_XATTR_CIFS_ACL)) == 0) { #ifdef CONFIG_CIFS_ACL @@ -272,17 +278,21 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { #ifdef CONFIG_CIFS_POSIX @@ -400,11 +410,12 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) /* if proc/fs/cifs/streamstoxattr is set then search server for EAs or streams to returns as xattrs */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, NULL, data, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); list_ea_exit: kfree(full_path); free_xid(xid); -- cgit v1.2.3-70-g09d2 From 7dd7d95916fe7c8494aa8708204d5a2b8689d270 Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Thu, 23 Jan 2014 08:54:55 -0600 Subject: nfs: handle servers that support only ALLOW ACE type. Currently we support ACLs if the NFS server file system supports both ALLOW and DENY ACE types. This patch makes the Linux client work with ACLs even if the server supports only 'ALLOW' ACE type. 
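The change is a two-line relaxation of the capability test. In effect (supports_acls stands in for the result of nfs4_server_supports_acls(); see the hunks below for the real code):

        /* before: the server had to advertise both ACE types */
        supports_acls = (server->caps & NFS_CAP_ACLS) &&
                        (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) &&
                        (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);

        /* after: ALLOW alone suffices, and the bitmask test moves into the
         * capability probe so the runtime check collapses to one flag */
        if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
            res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
                server->caps |= NFS_CAP_ACLS;

        supports_acls = server->caps & NFS_CAP_ACLS;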
Signed-off-by: Malahal Naineni Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a1965329a12..ed10d0d4f86 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2744,7 +2744,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| NFS_CAP_CTIME|NFS_CAP_MTIME| NFS_CAP_SECURITY_LABEL); - if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) + if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && + res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) server->caps |= NFS_CAP_HARDLINKS; @@ -4321,9 +4322,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) static inline int nfs4_server_supports_acls(struct nfs_server *server) { - return (server->caps & NFS_CAP_ACLS) - && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) - && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); + return server->caps & NFS_CAP_ACLS; } /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that -- cgit v1.2.3-70-g09d2 From e873088f2939dcd60721183ce6802515afc43ceb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 23 Jan 2012 13:52:01 -0500 Subject: nfsd4: minor nfs4_setlease cleanup As far as I can tell, this list is used only under the state lock, so we may as well do this in the simpler order. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5795d5f58f4..ed3085b2bf1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3043,18 +3043,18 @@ static int nfs4_setlease(struct nfs4_delegation *dp) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); - if (status) { - list_del_init(&dp->dl_perclnt); - locks_free_lock(fl); - return status; - } + if (status) + goto out_free; + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); fp->fi_lease = fl; fp->fi_deleg_file = get_file(fl->fl_file); atomic_set(&fp->fi_delegees, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); return 0; +out_free: + locks_free_lock(fl); + return status; } static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) -- cgit v1.2.3-70-g09d2 From c0e6bee480591a78caad5b13bd377948c025d0cd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 27 Jan 2012 17:26:06 -0500 Subject: nfsd4: delay setting current_fh in open This is basically a no-op, to simplify a following patch. Acked-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index dadff09b0b0..844813a7e12 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -230,17 +230,16 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate } static __be32 -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { struct svc_fh *current_fh = &cstate->current_fh; - struct svc_fh *resfh; int accmode; __be32 status; - resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); - if (!resfh) + *resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + if (!*resfh) return nfserr_jukebox; - fh_init(resfh, NFS4_FHSIZE); + fh_init(*resfh, NFS4_FHSIZE); open->op_truncate = 0; if (open->op_create) { @@ -265,12 +264,12 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru */ status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, - resfh, open->op_createmode, + *resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); if (!status && open->op_label.len) - nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval); + nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -282,29 +281,26 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru FATTR4_WORD1_TIME_MODIFY); } else { status = nfsd_lookup(rqstp, current_fh, - open->op_fname.data, open->op_fname.len, resfh); + open->op_fname.data, open->op_fname.len, *resfh); fh_unlock(current_fh); } if (status) goto out; - status = nfsd_check_obj_isreg(resfh); + status = nfsd_check_obj_isreg(*resfh); if (status) goto out; if (is_create_with_attrs(open) && open->op_acl != NULL) - do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval); + do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval); - nfsd4_set_open_owner_reply_cache(cstate, open, resfh); + nfsd4_set_open_owner_reply_cache(cstate, open, *resfh); accmode = NFSD_MAY_NOP; if (open->op_created || open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) accmode |= NFSD_MAY_OWNER_OVERRIDE; - status = do_open_permission(rqstp, resfh, open, accmode); + status = do_open_permission(rqstp, *resfh, open, accmode); set_change_info(&open->op_cinfo, current_fh); - fh_dup2(current_fh, resfh); out: - fh_put(resfh); - kfree(resfh); return status; } @@ -357,6 +353,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { __be32 status; + struct svc_fh *resfh = NULL; struct nfsd4_compoundres *resp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -423,7 +420,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, cstate, open); + status = do_open_lookup(rqstp, cstate, open, &resfh); if (status) goto out; break; @@ -439,6 +436,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = do_open_fhandle(rqstp, cstate, open); if (status) goto out; + resfh = &cstate->current_fh; break; case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case 
NFS4_OPEN_CLAIM_DELEGATE_PREV: @@ -458,9 +456,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * successful, it (1) truncates the file if open->op_truncate was * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ - status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + status = nfsd4_process_open2(rqstp, resfh, open); WARN_ON(status && open->op_created); out: + if (resfh && resfh != &cstate->current_fh) { + fh_dup2(&cstate->current_fh, resfh); + fh_put(resfh); + kfree(resfh); + } nfsd4_cleanup_open_state(open, status); if (open->op_openowner && !nfsd4_has_session(cstate)) cstate->replay_owner = &open->op_openowner->oo_owner; -- cgit v1.2.3-70-g09d2 From 4335723e8e9fdc6e4bb2555696bc7f1abe75f200 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 24 Jan 2014 18:04:40 -0500 Subject: nfsd4: fix delegation-unlink/rename race If a file is unlinked or renamed between the time when we do the local open and the time when we get the delegation, then we will return to the client indicating that it holds a delegation even though the file no longer exists under the name it was open under. But a client performing an open-by-name, when it is returned a delegation, must be able to assume that the file is still linked at the name it was opened under. So, hold the parent i_mutex for longer to prevent concurrent renames or unlinks. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 10 +++++++--- fs/nfsd/vfs.c | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 844813a7e12..ef76ba63238 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -279,11 +279,15 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); - } else { + } else + /* + * Note this may exit with the parent still locked. + * We will hold the lock until nfsd4_open's final + * lookup, to prevent renames or unlinks until we've had + * a chance to an acquire a delegation if appropriate. + */ status = nfsd_lookup(rqstp, current_fh, open->op_fname.data, open->op_fname.len, *resfh); - fh_unlock(current_fh); - } if (status) goto out; status = nfsd_check_obj_isreg(*resfh); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index e85b463fac4..a41302a0065 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -207,7 +207,12 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; } } else { - fh_lock(fhp); + /* + * In the nfsd4_open() case, this may be held across + * subsequent open and delegation acquisition which may + * need to take the child's i_mutex: + */ + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = lookup_one_len(name, dparent, len); host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) -- cgit v1.2.3-70-g09d2 From d529ef83c355f97027ff85298a9709fe06216a66 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 27 Jan 2014 13:46:15 -0500 Subject: NFS: fix the handling of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping There is a possible race in how the nfs_invalidate_mapping function is handled. Currently, we go and invalidate the pages in the file and then clear NFS_INO_INVALID_DATA. The problem is that it's possible for a stale page to creep into the mapping after the page was invalidated (i.e., via readahead). 
If another writer comes along and sets the flag after that happens, but before invalidate_inode_pages2 returns, then we could clear the flag without the cache having been properly invalidated. So, we must clear the flag first and then invalidate the pages. Doing this, however, opens another race: it's possible to have two concurrent read() calls that end up in nfs_revalidate_mapping at the same time. The first one clears the NFS_INO_INVALID_DATA flag and then goes on to call nfs_invalidate_mapping. Just before calling that, though, the other task races in, checks the flag and finds it cleared. At that point it trusts that the mapping is good and gets the lock on the page, allowing the read() to be satisfied from the cache even though the data is no longer valid. These effects are easily manifested by running diotest3 from the LTP test suite on NFS. That program does a series of DIO writes and buffered reads. The operations are serialized and page-aligned, but the existing code fails the test since it occasionally allows a read to come out of the cache incorrectly. While mixing direct and buffered I/O isn't recommended, I believe it's possible to hit this in other ways that use only buffered I/O, though that situation is much harder to reproduce. The problem is that the checking/clearing of that flag and the invalidation of the mapping really need to be atomic. Fix this by serializing concurrent invalidations with a bitlock. At the same time, we also need to allow other places that check NFS_INO_INVALID_DATA to see whether we might be in the middle of invalidating the file, so fix up a couple of places that do that to look for the new NFS_INO_INVALIDATING flag. Doing this requires us to be careful not to set the bitlock unnecessarily, so this code only does so if it believes it will be doing an invalidation.
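For reference, the bitlock handshake boils down to the following minimal sketch, built from the same primitives the hunks below use (wait_on_bit(), test_and_set_bit_lock(), clear_bit_unlock(), wake_up_bit()); the revalidate_sketch() wrapper is a hypothetical stand-in for nfs_revalidate_mapping() and is not part of the patch:

static int revalidate_sketch(struct inode *inode, struct address_space *mapping)
{
	struct nfs_inode *nfsi = NFS_I(inode);
	unsigned long *bitlock = &nfsi->flags;
	int ret;

	for (;;) {
		/* wait for any invalidation that is already in flight */
		ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
				  nfs_wait_bit_killable, TASK_KILLABLE);
		if (ret)
			return ret;
		/* nothing to do if the cached data is already valid */
		if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
			return 0;
		/* only the task that wins the bit lock does the invalidation */
		if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
			break;
	}

	/* clear the flag first, then shoot down the mapping */
	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
	ret = nfs_invalidate_mapping(inode, mapping);

	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
	smp_mb__after_clear_bit();
	wake_up_bit(bitlock, NFS_INO_INVALIDATING);
	return ret;
}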
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 ++- fs/nfs/inode.c | 42 ++++++++++++++++++++++++++++++++++++++---- fs/nfs/nfstrace.h | 1 + fs/nfs/write.c | 6 +++++- include/linux/nfs_fs.h | 1 + 5 files changed, 47 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b266f734bd5..b39a0468829 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -288,7 +288,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des new_pos = desc->current_index + i; if (ctx->attr_gencount != nfsi->attr_gencount - || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) { ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->ctx->pos) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c63e1522446..0a972ee9ccc 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - spin_lock(&inode->i_lock); - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); + } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); @@ -1008,6 +1008,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; int ret = 0; /* swapfiles are not supposed to be shared. */ @@ -1019,12 +1020,45 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) if (ret < 0) goto out; } + + /* + * We must clear NFS_INO_INVALID_DATA first to ensure that + * invalidations that come in while we're shooting down the mappings + * are respected. But, that leaves a race window where one revalidator + * can clear the flag, and then another checks it before the mapping + * gets invalidated. Fix that by serializing access to this part of + * the function. + * + * At the same time, we need to allow other tasks to see whether we + * might be in the middle of invalidating the pages, so we only set + * the bit lock here if it looks like we're going to be doing that. 
+ */ + for (;;) { + ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + goto out; + if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA)) + goto out; + if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock)) + break; + } + + spin_lock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); + } else { + /* something raced in and cleared the flag */ + spin_unlock(&inode->i_lock); } + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 89fe741e58b..59f838cdc00 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -36,6 +36,7 @@ __print_flags(v, "|", \ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ + { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_COMMIT, "COMMIT" }, \ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a44a87268a6..5511a424719 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -909,9 +909,13 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) */ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) { + struct nfs_inode *nfsi = NFS_I(inode); + if (nfs_have_delegated_attributes(inode)) goto out; - if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) + if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) + return false; + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) return false; out: return PageUptodate(page) != 0; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 48997374eaf..18fb16f4b93 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -215,6 +215,7 @@ struct nfs_inode { #define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ +#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ -- cgit v1.2.3-70-g09d2 From 0ea9de0ea6a4e4a1d343130b2a159b4f986e288e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 27 Jan 2014 14:53:27 -0500 Subject: sunrpc: turn warn_gssd() log message into a dprintk() The original printk() made sense when the GSSAPI codepaths were called only when sec=krb5* was explicitly requested. Now however, in many cases the nfs client will try to acquire GSSAPI credentials by default, even when it's not requested. Since we don't have a great mechanism to distinguish between the two cases, just turn the pr_warn into a dprintk instead. With this change we can also get rid of the ratelimiting. We do need to keep the EXPORT_SYMBOL(gssd_running) in place since auth_gss.ko needs it and sunrpc.ko provides it. We can however, eliminate the gssd_running call in the nfs code since that's a bit of a layering violation. 
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 5 +---- net/sunrpc/auth_gss/auth_gss.c | 8 +------- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 73d4ecda1e3..dbb3e1f30c6 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -372,10 +372,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = -EINVAL; - if (gssd_running(clp->cl_net)) - error = nfs_create_rpc_client(clp, timeparms, - RPC_AUTH_GSS_KRB5I); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 0a2aee060f9..6c0513a7f99 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -532,13 +532,7 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) static void warn_gssd(void) { - static unsigned long ratelimit; - unsigned long now = jiffies; - - if (time_after(now, ratelimit)) { - pr_warn("RPC: AUTH_GSS upcall failed. Please check user daemon is running.\n"); - ratelimit = now + 15*HZ; - } + dprintk("AUTH_GSS upcall failed. Please check user daemon is running.\n"); } static inline int -- cgit v1.2.3-70-g09d2 From 7c13cb64352230deac24d3cb058387a6c0676f83 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: replace ceph_calc_ceph_pg() with ceph_oloc_oid_to_pg() Switch ceph_calc_ceph_pg() to new oloc and oid abstractions and rename it to ceph_oloc_oid_to_pg() to make its purpose more clear. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/ioctl.c | 8 ++++++-- include/linux/ceph/osdmap.h | 7 +++++-- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 29 +++++++++++++++++------------ 4 files changed, 30 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 669622fd1ae..dc66c9e023e 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; @@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index c85f7d43b86..ebb8ec285de 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -163,8 +163,11 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool); +extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out); + extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *acting); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2988d68b24c..10360dedcda 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1269,8 +1269,8 @@ static int __map_request(struct ceph_osd_client *osdc, bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_ceph_pg(&pgid, req->r_oid.name, osdc->osdmap, - req->r_oloc.pool); + err = ceph_oloc_oid_to_pg(osdc->osdmap, &req->r_oloc, &req->r_oid, + &pgid); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 8b1a6b48bb5..768dd04eb9b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1090,25 +1090,30 @@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * calculate an object layout (i.e. pgid) from an oid, - * file_layout, and osdmap + * Calculate mapping of a (oloc, oid) pair to a PG. Should only be + * called with target's (oloc, oid), since tiering isn't taken into + * account. 
*/ -int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool) +int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out) { - struct ceph_pg_pool_info *pool_info; + struct ceph_pg_pool_info *pi; - BUG_ON(!osdmap); - pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); - if (!pool_info) + pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + if (!pi) return -EIO; - pg->pool = pool; - pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); + pg_out->pool = oloc->pool; + pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + + dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + pg_out->pool, pg_out->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_ceph_pg); +EXPORT_SYMBOL(ceph_oloc_oid_to_pg); static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, int *result, int result_max, -- cgit v1.2.3-70-g09d2 From ed47b062ce9546fbe1eebf9da6937df4c5035372 Mon Sep 17 00:00:00 2001 From: Ming Chen Date: Thu, 9 Jan 2014 21:26:10 +0000 Subject: nfsd: consider CLAIM_FH when handing out delegation CLAIM_FH was added by NFSv4.1. It is the same as CLAIM_NULL except that it uses only current FH to identify the file to be opened. The NFS client is using CLAIM_FH if the FH is available when opening a file. Currently, we cannot get any delegation if we stat a file before open it because the server delegation code does not recognize CLAIM_FH. We tested this patch and found delegation can be handed out now when claim is CLAIM_FH. See http://marc.info/?l=linux-nfs&m=136369847801388&w=2 and http://www.linux-nfs.org/wiki/index.php/Server_4.0_and_4.1_issues#New_open_claim_types Signed-off-by: Ming Chen Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ed3085b2bf1..d5d070fbeb3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3134,6 +3134,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, goto out_no_deleg; break; case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... 
-- cgit v1.2.3-70-g09d2 From f5b258550f3c4e74901fcb71daf3aea454d694ae Mon Sep 17 00:00:00 2001 From: "Xiaowei.Hu" Date: Mon, 27 Jan 2014 17:06:56 -0800 Subject: ocfs2: do not log ENOENT in unlink() Suppress log message like this: (open_delete,8328,0):ocfs2_unlink:951 ERROR: status = -2 Orabug:17445485 Signed-off-by: Xiaowei Hu Cc: Joe Jin Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4f791f6d27d..41513a4e98e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -948,7 +948,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status && (status != -ENOTEMPTY)) + if (status && (status != -ENOTEMPTY) && (status != -ENOENT)) mlog_errno(status); return status; -- cgit v1.2.3-70-g09d2 From 592f6b842f64e416c7598a1b97c649b34241e22d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 27 Jan 2014 17:07:19 -0800 Subject: compat: fix sys_fanotify_mark Commit 91c2e0bcae72 ("unify compat fanotify_mark(2), switch to COMPAT_SYSCALL_DEFINE") added a new unified compat fanotify_mark syscall to be used by all architectures. Unfortunately the unified version merges the split mask parameter in a wrong way: the lower and higher word got swapped. This was discovered with glibc's tst-fanotify test case. Signed-off-by: Heiko Carstens Reported-by: Andreas Krebbel Cc: "James E.J. Bottomley" Acked-by: "David S. Miller" Acked-by: Al Viro Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Ralf Baechle Cc: [3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 57d7c083cb4..1fd66abe574 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -886,9 +886,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, { return sys_fanotify_mark(fanotify_fd, flags, #ifdef __BIG_ENDIAN - ((__u64)mask1 << 32) | mask0, -#else ((__u64)mask0 << 32) | mask1, +#else + ((__u64)mask1 << 32) | mask0, #endif dfd, pathname); } -- cgit v1.2.3-70-g09d2 From 17dfeb9113397a6119091a491ef7182649f0c5a9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 28 Jan 2014 09:37:16 -0500 Subject: NFS: Fix races in nfs_revalidate_mapping Commit d529ef83c355f97027ff85298a9709fe06216a66 (NFS: fix the handling of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping) introduces a potential race, since it doesn't test the value of nfsi->cache_validity and set the bitlock in nfsi->flags atomically. 
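Condensed, the fix takes inode->i_lock around both the cache_validity test and the decision to take the bit lock; a sketch of the corrected loop, with declarations omitted, assembled from the hunk below:

	for (;;) {
		ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
				  nfs_wait_bit_killable, TASK_KILLABLE);
		if (ret)
			goto out;
		spin_lock(&inode->i_lock);
		if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
			/* another task owns the invalidation; go wait again */
			spin_unlock(&inode->i_lock);
			continue;
		}
		if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
			break;		/* still holding i_lock: we do the work */
		spin_unlock(&inode->i_lock);
		goto out;		/* nothing left to invalidate */
	}
	set_bit(NFS_INO_INVALIDATING, bitlock);
	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
	spin_unlock(&inode->i_lock);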
Signed-off-by: Trond Myklebust Cc: Jeff Layton --- fs/nfs/inode.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0a972ee9ccc..e5070aa5f17 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1038,24 +1038,24 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) nfs_wait_bit_killable, TASK_KILLABLE); if (ret) goto out; - if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA)) - goto out; - if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock)) + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) break; - } - - spin_lock(&inode->i_lock); - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - spin_unlock(&inode->i_lock); - trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); - trace_nfs_invalidate_mapping_exit(inode, ret); - } else { - /* something raced in and cleared the flag */ spin_unlock(&inode->i_lock); + goto out; } + set_bit(NFS_INO_INVALIDATING, bitlock); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + trace_nfs_invalidate_mapping_enter(inode); + ret = nfs_invalidate_mapping(inode, mapping); + trace_nfs_invalidate_mapping_exit(inode, ret); + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); smp_mb__after_clear_bit(); wake_up_bit(bitlock, NFS_INO_INVALIDATING); -- cgit v1.2.3-70-g09d2 From 37b52fe60838b135913e877b0c849e59fae587c3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 28 Jan 2014 18:29:29 +0200 Subject: ceph: fix dout() compile warnings in ceph_filemap_fault() PAGE_CACHE_SIZE is unsigned long on all architectures, however size_t is either unsigned int or unsigned long. Rather than change format strings, cast PAGE_CACHE_SIZE to size_t to be in line with dout()s in ceph_page_mkwrite(). Cc: Yan, Zheng Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/addr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 791a9a23fc6..b53278c9fd9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1220,7 +1220,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int want, got, ret; dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", - inode, ceph_vinop(inode), off, PAGE_CACHE_SIZE); + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else @@ -1236,12 +1236,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } } dout("filemap_fault %p %llu~%zd got cap refs on %s\n", - inode, off, PAGE_CACHE_SIZE, ceph_cap_string(got)); + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); ret = filemap_fault(vma, vmf); dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", - inode, off, PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); return ret; -- cgit v1.2.3-70-g09d2 From 125d725c923527a85876c031028c7f55c28b74b3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 28 Jan 2014 19:16:18 +0200 Subject: ceph: cast PAGE_SIZE to size_t in ceph_sync_write() Use min_t(size_t, ...) 
instead of plain min(), which does strict type checking, to avoid compile warning on i386. Cc: Jianpeng Ma Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d10510a4733..dfd2ce3419f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -734,7 +734,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, left = len; for (n = 0; n < num_pages; n++) { - size_t plen = min(left, PAGE_SIZE); + size_t plen = min_t(size_t, left, PAGE_SIZE); ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); if (ret != plen) { ret = -EFAULT; -- cgit v1.2.3-70-g09d2 From 4db72b40fdbc706f8957e9773ae73b1574b8c694 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 28 Jan 2014 13:47:46 -0500 Subject: nfs: add memory barriers around NFS_INO_INVALID_DATA and NFS_INO_INVALIDATING If the setting of NFS_INO_INVALIDATING gets reordered to before the clearing of NFS_INO_INVALID_DATA, then another task may hit a race window where both appear to be clear, even though the inode's pages are still in need of invalidation. Fix this by adding the appropriate memory barriers. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 14 +++++++++++--- fs/nfs/inode.c | 1 + fs/nfs/write.c | 1 + 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b39a0468829..be38b573495 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -274,6 +274,15 @@ out_eof: return -EBADCOOKIE; } +static bool +nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) +{ + if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + return false; + smp_rmb(); + return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); +} + static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { @@ -287,9 +296,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des struct nfs_open_dir_context *ctx = desc->file->private_data; new_pos = desc->current_index + i; - if (ctx->attr_gencount != nfsi->attr_gencount - || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) - || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) { + if (ctx->attr_gencount != nfsi->attr_gencount || + !nfs_readdir_inode_mapping_valid(nfsi)) { ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->ctx->pos) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e5070aa5f17..02e18516860 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1050,6 +1050,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) } set_bit(NFS_INO_INVALIDATING, bitlock); + smp_wmb(); nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5511a424719..9a3b6a4cd6b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -915,6 +915,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) goto out; if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) return false; + smp_rmb(); if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) return false; out: -- cgit v1.2.3-70-g09d2 From 16e7549f045d33b0c5b0ebf19d08439e9221d40c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Oct 2013 12:18:51 -0400 Subject: Btrfs: incompatible format change to remove hole extents Btrfs has always had these filler 
extent data items for holes in inodes. This has made somethings very easy, like logging hole punches and sending hole punches. However for large holey files these extent data items are pure overhead. So add an incompatible feature to no longer add hole extents to reduce the amount of metadata used by these sort of files. This has a few changes for logging and send obviously since they will need to detect holes and log/send the holes if there are any. I've tested this thoroughly with xfstests and it doesn't cause any issues with and without the incompat format set. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 3 +- fs/btrfs/ctree.h | 5 +- fs/btrfs/file.c | 13 ++-- fs/btrfs/inode.c | 78 +++++++++++++++--------- fs/btrfs/send.c | 158 ++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/tree-log.c | 172 +++++++++++++++++++++++++++++++++++++++++++++++++--- 6 files changed, 373 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 316136bd6dd..bcd0bd85e3e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -41,7 +41,6 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); struct btrfs_path *btrfs_alloc_path(void) { @@ -4817,7 +4816,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * This may release the path, and so you may lose any locks held at the * time you call it. */ -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; struct btrfs_disk_key found_key; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54ab86127f7..8be78f7d57e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -521,6 +521,7 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -532,7 +533,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES) /* * A leaf is full of items. 
offset and size tell us where to find @@ -3399,6 +3401,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 82d0342763c..c77da440146 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1963,11 +1963,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_key key; int ret; + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + goto out; + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) return ret; @@ -2064,8 +2066,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; + int rsv_count; bool same_page = ((offset >> PAGE_CACHE_SHIFT) == ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2163,9 +2167,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent + * 1 - adding the hole extent if no_holes isn't set */ - trans = btrfs_start_transaction(root, 3); + rsv_count = no_holes ? 2 : 3; + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -2202,7 +2207,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d03..c0c0dc8f07f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4203,6 +4203,49 @@ out: return ret; } +static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, + u64 offset, u64 len) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * Still need to make sure the inode looks like it's been updated so + * that any holes get logged if we fsync. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) { + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = root->log_transid; + BTRFS_I(inode)->last_log_commit = root->last_log_commit; + return 0; + } + + /* + * 1 - for the one we're dropping + * 1 - for the one we're adding + * 1 - for updating the inode. + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + return ret; + } + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, len, 0, len, 0, 0, 0); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + return ret; +} + /* * This function puts in dummy file extents for the area we're creating a hole * for. 
So if we are truncating this file to a larger size we need to insert @@ -4211,7 +4254,6 @@ out: */ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; @@ -4266,31 +4308,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *hole_em; hole_size = last_byte - cur_offset; - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - break; - } - - err = btrfs_drop_extents(trans, root, inode, - cur_offset, - cur_offset + hole_size, 1); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); - break; - } - - err = btrfs_insert_file_extent(trans, root, - btrfs_ino(inode), cur_offset, 0, - 0, hole_size, 0, hole_size, - 0, 0, 0); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); + if (err) break; - } - btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -4309,7 +4330,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->ram_bytes = hole_size; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = trans->transid; + hole_em->generation = root->fs_info->generation; while (1) { write_lock(&em_tree->lock); @@ -4322,17 +4343,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size - 1, 0); } free_extent_map(hole_em); -next: - btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); } +next: free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 945d1db98f2..29803b4129f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -111,6 +111,7 @@ struct send_ctx { int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; + u64 cur_inode_last_extent; u64 send_progress; @@ -145,6 +146,13 @@ struct name_cache_entry { char name[]; }; +static int need_send_hole(struct send_ctx *sctx) +{ + return (sctx->parent_root && !sctx->cur_inode_new && + !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && + S_ISREG(sctx->cur_inode_mode)); +} + static void fs_path_reset(struct fs_path *p) { if (p->reversed) { @@ -3752,6 +3760,39 @@ out: return ret; } +static int send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 offset = sctx->cur_inode_last_extent; + u64 len; + int ret = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); + while (offset < end) { + len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + break; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + break; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = send_cmd(sctx); + if (ret < 0) + break; + offset += len; + } +tlv_put_failure: + fs_path_free(p); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct 
btrfs_key *key, @@ -3979,6 +4020,84 @@ out: return ret; } +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_root *root = sctx->send_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + u8 type; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + sctx->cur_inode_last_extent = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); + if (ret < 0) + goto out; + ret = 0; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], fi); + extent_end = ALIGN(key.offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + sctx->cur_inode_last_extent = extent_end; +out: + btrfs_free_path(path); + return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 type; + int ret = 0; + + if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) + return 0; + + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], fi); + extent_end = ALIGN(key->offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key->offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + if (sctx->cur_inode_last_extent < key->offset) + ret = send_hole(sctx, key->offset); + sctx->cur_inode_last_extent = extent_end; + return ret; +} + static int process_extent(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) @@ -3995,7 +4114,7 @@ static int process_extent(struct send_ctx *sctx, goto out; if (ret) { ret = 0; - goto out; + goto out_hole; } } else { struct btrfs_file_extent_item *ei; @@ -4031,7 +4150,10 @@ static int process_extent(struct send_ctx *sctx, goto out; ret = send_write_or_clone(sctx, path, key, found_clone); - + if (ret) + goto out; +out_hole: + ret = maybe_send_hole(sctx, path, key); out: return ret; } @@ -4157,6 +4279,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } if (S_ISREG(sctx->cur_inode_mode)) { + if (need_send_hole(sctx)) { + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, (u64)-1); + if (ret) + goto out; + } + if (sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret) + goto out; + } + } ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, sctx->cur_inode_size); if (ret < 0) @@ -4200,6 +4335,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; + sctx->cur_inode_last_extent = (u64)-1; /* * Set send_progress to current inode. 
This will tell all get_cur_xxx @@ -4480,14 +4616,18 @@ static int changed_cb(struct btrfs_root *left_root, struct send_ctx *sctx = ctx; if (result == BTRFS_COMPARE_TREE_SAME) { - if (key->type != BTRFS_INODE_REF_KEY && - key->type != BTRFS_INODE_EXTREF_KEY) - return 0; - ret = compare_refs(sctx, left_path, key); - if (!ret) + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) { + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + } else if (key->type == BTRFS_EXTENT_DATA_KEY) { + return maybe_send_hole(sctx, left_path, key); + } else { return 0; - if (ret < 0) - return ret; + } result = BTRFS_COMPARE_TREE_CHANGED; ret = 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9f7fc51ca33..e7d7a837512 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3194,7 +3194,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, - struct extent_buffer *src, + struct btrfs_path *src_path, u64 *last_extent, int start_slot, int nr, int inode_only) { unsigned long src_offset; @@ -3202,6 +3202,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; + struct extent_buffer *src = src_path->nodes[0]; + struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -3209,6 +3211,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int i; struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + bool has_extents = false; + bool need_find_last_extent = (*last_extent == 0); + bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3217,6 +3222,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!ins_data) return -ENOMEM; + first_key.objectid = (u64)-1; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -3237,6 +3244,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); + if ((i == (nr - 1))) + last_key = ins_keys[i]; + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], @@ -3248,6 +3258,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset, ins_sizes[i]); } + /* + * We set need_find_last_extent here in case we know we were + * processing other items and then walk into the first extent in + * the inode. If we don't hit an extent then nothing changes, + * we'll do the last search the next time around. + */ + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { + has_extents = true; + if (need_find_last_extent && + first_key.objectid == (u64)-1) + first_key = ins_keys[i]; + } else { + need_find_last_extent = false; + } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -3312,6 +3337,126 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, list_del(&sums->list); kfree(sums); } + + if (!has_extents) + return ret; + + /* + * Because we use btrfs_search_forward we could skip leaves that were + * not modified and then assume *last_extent is valid when it really + * isn't. 
So back up to the previous leaf and read the end of the last + * extent before we go and fill in holes. + */ + if (need_find_last_extent) { + u64 len; + + ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + if (ret) + goto fill_holes; + if (src_path->slots[0]) + src_path->slots[0]--; + src = src_path->nodes[0]; + btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + goto fill_holes; + extent = btrfs_item_ptr(src, src_path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, extent); + *last_extent = ALIGN(key.offset + len, + log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + *last_extent = key.offset + len; + } + } +fill_holes: + /* So we did prev_leaf, now we need to move to the next leaf, but a few + * things could have happened + * + * 1) A merge could have happened, so we could currently be on a leaf + * that holds what we were copying in the first place. + * 2) A split could have happened, and now not all of the items we want + * are on the same leaf. + * + * So we need to adjust how we search for holes, we need to drop the + * path and re-search for the first extent key we found, and then walk + * forward until we hit the last one we copied. + */ + if (need_find_last_extent) { + /* btrfs_prev_leaf could return 1 without releasing the path */ + btrfs_release_path(src_path); + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, + src_path, 0, 0); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = src_path->slots[0]; + } else { + i = start_slot; + } + + /* + * Ok so here we need to go through and fill in any holes we may have + * to make sure that holes are punched for those areas in case they had + * extents previously. + */ + while (!done) { + u64 offset, len; + u64 extent_end; + + if (i >= btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = 0; + } + + btrfs_item_key_to_cpu(src, &key, i); + if (!btrfs_comp_cpu_keys(&key, &last_key)) + done = true; + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + i++; + continue; + } + extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, extent); + extent_end = ALIGN(key.offset + len, log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + extent_end = key.offset + len; + } + i++; + + if (*last_extent == key.offset) { + *last_extent = extent_end; + continue; + } + offset = *last_extent; + len = key.offset - *last_extent; + ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), + offset, 0, 0, len, 0, len, 0, + 0, 0); + if (ret) + break; + *last_extent = offset + len; + } + /* + * Need to let the callers know we dropped the path so they should + * re-search. 
+ */ + if (!ret && need_find_last_extent) + ret = 1; return ret; } @@ -3630,6 +3775,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + u64 last_extent = 0; int err = 0; int ret; int nritems; @@ -3745,11 +3891,15 @@ again: goto next_slot; } - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; + } if (ret) { + ins_nr = 0; + btrfs_release_path(path); + continue; } ins_nr = 1; ins_start_slot = path->slots[0]; @@ -3763,13 +3913,14 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, ins_nr, inode_only); - if (ret) { + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } btrfs_release_path(path); @@ -3784,12 +3935,13 @@ next_slot: } } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } -- cgit v1.2.3-70-g09d2 From e20d6c5ba38d066c7dc0f7d3b68da14b9ae7fe37 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Nov 2013 21:11:49 -0500 Subject: Btrfs: fix check-integrity to look at the referenced data properly We were looking at file_extent_num_bytes unconditionally when looking at referenced data bytes, but this isn't correct for compression. Fix this by checking the compression of the file extent we are and setting num_bytes to disk_num_bytes in the case of compression so that we are marking the proper bytes as referenced. This fixes check_int_data freaking out when running btrfs/004. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 12 ++++++++---- fs/btrfs/ctree.h | 4 ++++ 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 131d82800b3..160fb509d72 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1456,10 +1456,14 @@ static int btrfsic_handle_extent_data( btrfsic_read_from_block_data(block_ctx, &file_extent_item, file_extent_item_offset, sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + - btrfs_stack_file_extent_offset(&file_extent_item); - generation = btrfs_stack_file_extent_generation(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); + if (btrfs_stack_file_extent_compression(&file_extent_item) == + BTRFS_COMPRESS_NONE) { + next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + } else { + num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); + } generation = btrfs_stack_file_extent_generation(&file_extent_item); if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8be78f7d57e..1aafccda05d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2927,6 +2927,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) -- cgit v1.2.3-70-g09d2 From c46effa601f869f3d20a7386a745d9c002838eb8 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Oct 2013 12:59:45 +0800 Subject: Btrfs: introduce a head ref rbtree The way we process delayed refs is 1) get a bunch of head refs, 2) pick up one head ref, 3) go one node back for any delayed ref updates. The head ref is linked in the same rbtree as the delayed refs, so in stage 1) we have to walk the tree one node at a time, visiting not only head refs but also ordinary delayed refs. When a great number of delayed refs is pending, this costs a lot of time. Here we introduce a head-ref-specific rbtree; it holds only head refs, so that problem goes away.
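The payoff is on the consumer side: walkers that only care about heads no longer skip over every ordinary delayed ref node. A rough before/after sketch distilled from the extent-tree.c hunk further down, with declarations omitted; process_head() is a hypothetical placeholder for the per-head work:

	/* before: one rbtree holds heads and ordinary delayed refs mixed together */
	for (node = rb_first(&delayed_refs->root); node; node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
		if (!btrfs_delayed_ref_is_head(ref))
			continue;	/* most nodes get skipped */
		process_head(btrfs_delayed_node_to_head(ref));
	}

	/* after: href_root holds head refs only, keyed by bytenr */
	for (node = rb_first(&delayed_refs->href_root); node; node = rb_next(node)) {
		head = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
		process_head(head);
	}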
Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.c | 126 +++++++++++++++++++++++++++++-------------------- fs/btrfs/delayed-ref.h | 5 ++ fs/btrfs/disk-io.c | 3 ++ fs/btrfs/extent-tree.c | 21 ++++++--- fs/btrfs/transaction.c | 4 +- 5 files changed, 99 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e4d467be2dd..9bbac6ddf41 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -161,35 +161,61 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, return NULL; } +/* insert a new ref to head ref rbtree */ +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_head *entry; + struct btrfs_delayed_ref_head *ins; + u64 bytenr; + + ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + bytenr = ins->node.bytenr; + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, + href_node); + + if (bytenr < entry->node.bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->node.bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + /* * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact * match is found. */ -static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, - u64 bytenr, - struct btrfs_delayed_ref_node **last, - int return_bigger) +static struct btrfs_delayed_ref_head * +find_ref_head(struct rb_root *root, u64 bytenr, + struct btrfs_delayed_ref_head **last, int return_bigger) { struct rb_node *n; - struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_head *entry; int cmp = 0; again: n = root->rb_node; entry = NULL; while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry->in_tree); + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); if (last) *last = entry; - if (bytenr < entry->bytenr) + if (bytenr < entry->node.bytenr) cmp = -1; - else if (bytenr > entry->bytenr) - cmp = 1; - else if (!btrfs_delayed_ref_is_head(entry)) + else if (bytenr > entry->node.bytenr) cmp = 1; else cmp = 0; @@ -203,12 +229,12 @@ again: } if (entry && return_bigger) { if (cmp > 0) { - n = rb_next(&entry->rb_node); + n = rb_next(&entry->href_node); if (!n) n = rb_first(root); - entry = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - bytenr = entry->bytenr; + entry = rb_entry(n, struct btrfs_delayed_ref_head, + href_node); + bytenr = entry->node.bytenr; return_bigger = 0; goto again; } @@ -246,6 +272,12 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref) { rb_erase(&ref->rb_node, &delayed_refs->root); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + rb_erase(&head->href_node, &delayed_refs->href_root); + } ref->in_tree = 0; btrfs_put_delayed_ref(ref); delayed_refs->num_entries--; @@ -379,42 +411,35 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, int count = 0; struct btrfs_delayed_ref_root *delayed_refs; struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head 
*head; + struct btrfs_delayed_ref_head *head = NULL; delayed_refs = &trans->transaction->delayed_refs; - if (start == 0) { - node = rb_first(&delayed_refs->root); - } else { - ref = NULL; - find_ref_head(&delayed_refs->root, start + 1, &ref, 1); - if (ref) { - node = &ref->rb_node; - } else - node = rb_first(&delayed_refs->root); + node = rb_first(&delayed_refs->href_root); + + if (start) { + find_ref_head(&delayed_refs->href_root, start + 1, &head, 1); + if (head) + node = &head->href_node; } again: while (node && count < 32) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. Stop now if we've hit - * a head that was already in use - */ - break; - } + head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + if (list_empty(&head->cluster)) { + list_add_tail(&head->cluster, cluster); + delayed_refs->run_delayed_start = + head->node.bytenr; + count++; + + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + } else if (count) { + /* the goal of the clustering is to find extents + * that are likely to end up in the same extent + * leaf on disk. So, we don't want them spread + * all over the tree. Stop now if we've hit + * a head that was already in use + */ + break; } node = rb_next(node); } @@ -426,7 +451,7 @@ again: * clusters. 
start from the beginning and try again */ start = 0; - node = rb_first(&delayed_refs->root); + node = rb_first(&delayed_refs->href_root); goto again; } return 1; @@ -612,6 +637,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, */ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); } else { + htree_insert(&delayed_refs->href_root, &head_ref->href_node); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; delayed_refs->num_entries++; @@ -869,14 +895,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) { - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); - if (ref) - return btrfs_delayed_node_to_head(ref); - return NULL; + return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); } void btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 70b962cc177..a54c9d47918 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -83,6 +83,8 @@ struct btrfs_delayed_ref_head { struct list_head cluster; + struct rb_node href_node; + struct btrfs_delayed_extent_op *extent_op; /* * when a new extent is allocated, it is just reserved in memory @@ -118,6 +120,9 @@ struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_root { struct rb_root root; + /* head ref rbtree */ + struct rb_root href_root; + /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8072cfa8a3b..435ef132b80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3842,6 +3842,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); + if (head) + rb_erase(&head->href_node, &delayed_refs->href_root); + delayed_refs->num_entries--; spin_unlock(&delayed_refs->lock); if (head) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9c01509dd8a..d15b4fc0755 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2438,6 +2438,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); + if (btrfs_delayed_ref_is_head(ref)) { + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + } delayed_refs->num_entries--; if (!btrfs_delayed_ref_is_head(ref)) { /* @@ -2640,7 +2644,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; struct list_head cluster; int ret; u64 delayed_start; @@ -2770,18 +2774,18 @@ again: spin_lock(&delayed_refs->lock); } - node = rb_first(&delayed_refs->root); + node = rb_first(&delayed_refs->href_root); if (!node) goto out; count = (unsigned long)-1; while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; - head = btrfs_delayed_node_to_head(ref); + ref = &head->node; atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); @@ -2795,6 +2799,8 @@ again: 
btrfs_put_delayed_ref(ref); cond_resched(); goto again; + } else { + WARN_ON(1); } node = rb_next(node); } @@ -5956,6 +5962,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, */ head->node.in_tree = 0; rb_erase(&head->node.rb_node, &delayed_refs->root); + rb_erase(&head->href_node, &delayed_refs->href_root); delayed_refs->num_entries--; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c6a872a8a46..14516375777 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -62,7 +62,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(transaction->delayed_refs.root.rb_node); + WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.root)); + WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); while (!list_empty(&transaction->pending_chunks)) { struct extent_map *em; @@ -184,6 +185,7 @@ loop: cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.root = RB_ROOT; + cur_trans->delayed_refs.href_root = RB_ROOT; cur_trans->delayed_refs.num_entries = 0; cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.num_heads = 0; -- cgit v1.2.3-70-g09d2 From 9e5ac13acbb9e806a54f131432501bf462248c35 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Oct 2013 12:59:43 +0800 Subject: Btrfs: skip merge part for delayed data refs When we have data deduplication on, we'll hang on the merge part because it needs to verify every queued delayed data refs related to this disk offset but we may have millions refs. And in the case of delayed data refs, we don't usually have too much data refs to merge. So it's safe to shut it down for data refs. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9bbac6ddf41..fab60c1ba06 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -352,6 +352,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; u64 seq = 0; + /* + * We don't have too much refs to merge in the case of delayed data + * refs. + */ + if (head->is_data) + return; + spin_lock(&fs_info->tree_mod_seq_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; -- cgit v1.2.3-70-g09d2 From 2eaa055fab4e3127c9f572fda1b710cbb2acdf1c Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 15 Nov 2013 15:33:55 -0500 Subject: btrfs: add ioctls to query/change feature bits online There are some feature bits that require no offline setup and can be enabled online. I've only reviewed extended irefs, but there will probably be more. We introduce three new ioctls: - BTRFS_IOC_GET_SUPPORTED_FEATURES: query the kernel for supported features. - BTRFS_IOC_GET_FEATURES: query the kernel for enabled features on a per-fs basis, as well as querying for which features are changeable with mounted. - BTRFS_IOC_SET_FEATURES: change features on a per-fs basis. We introduce two new masks per feature set (_SAFE_SET and _SAFE_CLEAR) that allow us to define which features are safe to change at runtime. 
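For illustration, a minimal user-space sketch of driving the new SET ioctl. This is a sketch only, not part of the patch: it assumes the updated include/uapi/linux/btrfs.h is installed as <linux/btrfs.h>, and it redefines the extended-iref incompat bit locally (assumed to be bit 6) because that constant lives in the kernel-internal ctree.h.

/*
 * Sketch only: enable one SAFE_SET feature on a mounted btrfs.
 * flags[0] is the mask of bits to change, flags[1] holds the new values,
 * matching btrfs_ioctl_set_features() below. Requires CAP_SYS_ADMIN.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

#define EXTENDED_IREF_BIT (1ULL << 6) /* assumed to mirror BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF */

static int enable_extended_iref(const char *mntpoint)
{
	struct btrfs_ioctl_feature_flags flags[2];
	int fd = open(mntpoint, O_RDONLY);

	if (fd < 0)
		return -1;

	memset(flags, 0, sizeof(flags));
	flags[0].incompat_flags = EXTENDED_IREF_BIT;	/* change mask */
	flags[1].incompat_flags = EXTENDED_IREF_BIT;	/* desired value */

	if (ioctl(fd, BTRFS_IOC_SET_FEATURES, flags) < 0) {
		perror("BTRFS_IOC_SET_FEATURES");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

Clearing a bit works the same way with flags[1] left zero; anything outside the SAFE masks is refused as described next.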
The failure modes for BTRFS_IOC_SET_FEATURES are as follows: - Enabling a completely unsupported feature: warns and returns -ENOTSUPP - Enabling a feature that can only be done offline: warns and returns -EPERM Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++ fs/btrfs/ioctl.c | 142 +++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs.h | 12 ++++ 3 files changed, 163 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1aafccda05d..498452ebfd3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -524,7 +524,12 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -536,6 +541,10 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES) +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL + /* * A leaf is full of items. offset and size tell us where to find * the item in the leaf (relative to the start of the data area) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 21da5762b0b..d4e105b5409 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4480,6 +4480,142 @@ out_unlock: return ret; } +#define INIT_FEATURE_FLAGS(suffix) \ + { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ + .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ + .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } + +static int btrfs_ioctl_get_supported_features(struct file *file, + void __user *arg) +{ + static struct btrfs_ioctl_feature_flags features[3] = { + INIT_FEATURE_FLAGS(SUPP), + INIT_FEATURE_FLAGS(SAFE_SET), + INIT_FEATURE_FLAGS(SAFE_CLEAR) + }; + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_get_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags features; + + features.compat_flags = btrfs_super_compat_flags(super_block); + features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); + features.incompat_flags = btrfs_super_incompat_flags(super_block); + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int check_feature_bits(struct btrfs_root *root, const char *type, + u64 change_mask, u64 flags, u64 supported_flags, + u64 safe_set, u64 safe_clear) +{ + u64 disallowed, unsupported; + u64 set_mask = flags & change_mask; + u64 clear_mask = ~flags & change_mask; + + unsupported = set_mask & ~supported_flags; + if (unsupported) { + btrfs_warn(root->fs_info, + "this kernel does not support %s bits 0x%llx", + type, unsupported); + return -EOPNOTSUPP; + } + + disallowed = set_mask & ~safe_set; + if (disallowed) { + btrfs_warn(root->fs_info, + "can't set %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + disallowed = clear_mask & ~safe_clear; + if (disallowed) { + btrfs_warn(root->fs_info, + "can't clear %s bits 0x%llx while mounted", + type, 
disallowed); + return -EPERM; + } + + return 0; +} + +#define check_feature(root, change_mask, flags, mask_base) \ +check_feature_bits(root, # mask_base, change_mask, flags, \ + BTRFS_FEATURE_ ## mask_base ## _SUPP, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) + +static int btrfs_ioctl_set_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags flags[2]; + struct btrfs_trans_handle *trans; + u64 newflags; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(flags, arg, sizeof(flags))) + return -EFAULT; + + /* Nothing to do */ + if (!flags[0].compat_flags && !flags[0].compat_ro_flags && + !flags[0].incompat_flags) + return 0; + + ret = check_feature(root, flags[0].compat_flags, + flags[1].compat_flags, COMPAT); + if (ret) + return ret; + + ret = check_feature(root, flags[0].compat_ro_flags, + flags[1].compat_ro_flags, COMPAT_RO); + if (ret) + return ret; + + ret = check_feature(root, flags[0].incompat_flags, + flags[1].incompat_flags, INCOMPAT); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + newflags = btrfs_super_compat_flags(super_block); + newflags |= flags[0].compat_flags & flags[1].compat_flags; + newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); + btrfs_set_super_compat_flags(super_block, newflags); + + newflags = btrfs_super_compat_ro_flags(super_block); + newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; + newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); + btrfs_set_super_compat_ro_flags(super_block, newflags); + + newflags = btrfs_super_incompat_flags(super_block); + newflags |= flags[0].incompat_flags & flags[1].incompat_flags; + newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); + btrfs_set_super_incompat_flags(super_block, newflags); + spin_unlock(&root->fs_info->super_lock); + + return btrfs_end_transaction(trans, root); +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4598,6 +4734,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_fslabel(file, argp); case BTRFS_IOC_FILE_EXTENT_SAME: return btrfs_ioctl_file_extent_same(file, argp); + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + return btrfs_ioctl_get_supported_features(file, argp); + case BTRFS_IOC_GET_FEATURES: + return btrfs_ioctl_get_features(file, argp); + case BTRFS_IOC_SET_FEATURES: + return btrfs_ioctl_set_features(file, argp); } return -ENOTTY; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 45e618921c6..b4d69092fbd 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -184,6 +184,12 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; +struct btrfs_ioctl_feature_flags { + __u64 compat_flags; + __u64 compat_ro_flags; + __u64 incompat_flags; +}; + /* balance control ioctl modes */ #define BTRFS_BALANCE_CTL_PAUSE 1 #define BTRFS_BALANCE_CTL_CANCEL 2 @@ -606,5 +612,11 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) struct btrfs_ioctl_dev_replace_args) #define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \ struct btrfs_ioctl_same_args) +#define BTRFS_IOC_GET_FEATURES _IOR(BTRFS_IOCTL_MAGIC, 57, \ + struct btrfs_ioctl_feature_flags) +#define 
BTRFS_IOC_SET_FEATURES _IOW(BTRFS_IOCTL_MAGIC, 57, \ + struct btrfs_ioctl_feature_flags[2]) +#define BTRFS_IOC_GET_SUPPORTED_FEATURES _IOR(BTRFS_IOCTL_MAGIC, 57, \ + struct btrfs_ioctl_feature_flags[3]) #endif /* _UAPI_LINUX_BTRFS_H */ -- cgit v1.2.3-70-g09d2 From 079b72bca30dbc74c86c7c7825b8c34eb86ce3ee Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:06:57 -0400 Subject: btrfs: publish supported featured in sysfs This patch adds the ability to publish supported features to sysfs under /sys/fs/btrfs/features. The files are module-wide and export which features the kernel supports. The content, for now, is just "0\n". Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/sysfs.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 fs/btrfs/sysfs.h (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5b326cd60a4..9e217b58190 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -26,20 +26,64 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "sysfs.h" + +static ssize_t btrfs_feature_attr_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0\n"); +} + +BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); +BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); +BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); +BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(compress_lzov2, COMPRESS_LZOv2); +BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); +BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); +BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); + +static struct attribute *btrfs_supported_feature_attrs[] = { + BTRFS_FEAT_ATTR_PTR(mixed_backref), + BTRFS_FEAT_ATTR_PTR(default_subvol), + BTRFS_FEAT_ATTR_PTR(mixed_groups), + BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(compress_lzov2), + BTRFS_FEAT_ATTR_PTR(big_metadata), + BTRFS_FEAT_ATTR_PTR(extended_iref), + BTRFS_FEAT_ATTR_PTR(raid56), + BTRFS_FEAT_ATTR_PTR(skinny_metadata), + NULL +}; + +static const struct attribute_group btrfs_feature_attr_group = { + .name = "features", + .attrs = btrfs_supported_feature_attrs, +}; /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; int btrfs_init_sysfs(void) { + int ret; btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; + + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) { + kset_unregister(btrfs_kset); + return ret; + } + return 0; } void btrfs_exit_sysfs(void) { + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h new file mode 100644 index 00000000000..863e031ed1c --- /dev/null +++ b/fs/btrfs/sysfs.h @@ -0,0 +1,43 @@ +#ifndef _BTRFS_SYSFS_H_ +#define _BTRFS_SYSFS_H_ + +enum btrfs_feature_set { + FEAT_COMPAT, + FEAT_COMPAT_RO, + FEAT_INCOMPAT, + FEAT_MAX +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ +static struct 
btrfs_feature_attr btrfs_attr_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, NULL), \ + .feature_set = _feature_set, \ + .feature_bit = _prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) + +/* convert from attribute */ +#define to_btrfs_feature_attr(a) \ + container_of(a, struct btrfs_feature_attr, kobj_attr) +#endif /* _BTRFS_SYSFS_H_ */ -- cgit v1.2.3-70-g09d2 From 5ac1d209f11271fbfad0fa31ba56ec64c142d9ea Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:06:58 -0400 Subject: btrfs: publish per-super attributes in sysfs This patch adds per-super attributes to sysfs. It doesn't publish any attributes yet, but does the proper lifetime handling as well as the basic infrastructure to add new attributes. Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 9 +++++++++ fs/btrfs/sysfs.c | 36 ++++++++++++++++++++++++++++++++++++ fs/btrfs/sysfs.h | 10 ++++++++++ 4 files changed, 57 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 498452ebfd3..c5c888fbf03 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3780,6 +3780,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 435ef132b80..81f3433fe4f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -48,6 +48,7 @@ #include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" +#include "sysfs.h" #ifdef CONFIG_X86 #include @@ -2743,6 +2744,12 @@ retry_root_backup: btrfs_close_extra_devices(fs_info, fs_devices, 1); + ret = btrfs_sysfs_add_one(fs_info); + if (ret) { + pr_err("btrfs: failed to init sysfs interface: %d\n", ret); + goto fail_block_groups; + } + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); @@ -3584,6 +3591,8 @@ int close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } + btrfs_sysfs_remove_one(fs_info); + del_fs_roots(fs_info); btrfs_free_block_groups(fs_info); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9e217b58190..79be4a187af 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -28,6 +28,25 @@ #include "transaction.h" #include "sysfs.h" +static void btrfs_release_super_kobj(struct kobject *kobj); +static struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_super_kobj, +}; + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_info, super_kobj); +} + +static void btrfs_release_super_kobj(struct kobject *kobj) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + complete(&fs_info->kobj_unregister); +} + static ssize_t 
btrfs_feature_attr_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -65,6 +84,23 @@ static const struct attribute_group btrfs_feature_attr_group = { /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + kobject_del(&fs_info->super_kobj); + kobject_put(&fs_info->super_kobj); + wait_for_completion(&fs_info->kobj_unregister); +} + +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +{ + int error; + + init_completion(&fs_info->kobj_unregister); + error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, + "%pU", fs_info->fsid); + return error; +} + int btrfs_init_sysfs(void) { int ret; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 863e031ed1c..d7c61bdf04b 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -15,6 +15,13 @@ enum btrfs_feature_set { .store = _store, \ } +#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ +static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, _mode, _show, _store) +#define BTRFS_ATTR(_name, _mode, _show) \ + BTRFS_ATTR_RW(_name, _mode, _show, NULL) +#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) + struct btrfs_feature_attr { struct kobj_attribute kobj_attr; enum btrfs_feature_set feature_set; @@ -40,4 +47,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ /* convert from attribute */ #define to_btrfs_feature_attr(a) \ container_of(a, struct btrfs_feature_attr, kobj_attr) +#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) +#define attr_to_btrfs_feature_attr(a) \ + to_btrfs_feature_attr(attr_to_btrfs_attr(a)) #endif /* _BTRFS_SYSFS_H_ */ -- cgit v1.2.3-70-g09d2 From 510d73600aafbc64efee8d0e71c219c0e651cb7f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:06:59 -0400 Subject: btrfs: publish per-super features in sysfs This patch publishes information on which features are enabled in the file system on a per-super basis. At this point, it only publishes information on features supported by the file system implementation. 
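As a rough illustration of how this surfaces to user space (the /sys/fs/btrfs/<FSID>/features path follows from the per-super kobject added in the previous patch; <FSID> stands for the filesystem UUID and is a placeholder), the enabled features of a mounted filesystem can be listed by scanning that directory:

/*
 * Sketch only: print the feature attributes visible for one filesystem.
 * With this patch, attributes are hidden unless the corresponding bit is
 * set in the superblock, so the listing reflects the enabled features.
 */
#include <dirent.h>
#include <stdio.h>

int main(void)
{
	const char *dir = "/sys/fs/btrfs/<FSID>/features"; /* placeholder path */
	DIR *d = opendir(dir);
	struct dirent *de;

	if (!d)
		return 1;
	while ((de = readdir(d)) != NULL) {
		if (de->d_name[0] == '.')
			continue;
		printf("%s\n", de->d_name);
	}
	closedir(d);
	return 0;
}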
Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 81 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 79be4a187af..832cf62151f 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -28,29 +28,53 @@ #include "transaction.h" #include "sysfs.h" -static void btrfs_release_super_kobj(struct kobject *kobj); -static struct kobj_type btrfs_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = btrfs_release_super_kobj, -}; +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); -static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +static u64 get_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set) { - if (kobj->ktype != &btrfs_ktype) - return NULL; - return container_of(kobj, struct btrfs_fs_info, super_kobj); + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + return btrfs_super_compat_flags(disk_super); + else if (set == FEAT_COMPAT_RO) + return btrfs_super_compat_ro_flags(disk_super); + else + return btrfs_super_incompat_flags(disk_super); } -static void btrfs_release_super_kobj(struct kobject *kobj) +static ssize_t btrfs_feature_attr_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) { + int val = 0; struct btrfs_fs_info *fs_info = to_fs_info(kobj); - complete(&fs_info->kobj_unregister); + if (fs_info) { + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + u64 features = get_features(fs_info, fa->feature_set); + if (features & fa->feature_bit) + val = 1; + } + + return snprintf(buf, PAGE_SIZE, "%d\n", val); } -static ssize_t btrfs_feature_attr_show(struct kobject *kobj, - struct kobj_attribute *a, char *buf) +static umode_t btrfs_feature_visible(struct kobject *kobj, + struct attribute *attr, int unused) { - return snprintf(buf, PAGE_SIZE, "0\n"); + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + umode_t mode = attr->mode; + + if (fs_info) { + struct btrfs_feature_attr *fa; + u64 features; + + fa = attr_to_btrfs_feature_attr(attr); + features = get_features(fs_info, fa->feature_set); + + if (!(features & fa->feature_bit)) + mode = 0; + } + + return mode; } BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); @@ -78,11 +102,27 @@ static struct attribute *btrfs_supported_feature_attrs[] = { static const struct attribute_group btrfs_feature_attr_group = { .name = "features", + .is_visible = btrfs_feature_visible, .attrs = btrfs_supported_feature_attrs, }; -/* /sys/fs/btrfs/ entry */ -static struct kset *btrfs_kset; +static void btrfs_release_super_kobj(struct kobject *kobj) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + complete(&fs_info->kobj_unregister); +} + +static struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_super_kobj, +}; + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_info, super_kobj); +} void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { @@ -91,13 +131,22 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) wait_for_completion(&fs_info->kobj_unregister); } +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; init_completion(&fs_info->kobj_unregister); + fs_info->super_kobj.kset = btrfs_kset; error = 
kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, "%pU", fs_info->fsid); + + error = sysfs_create_group(&fs_info->super_kobj, + &btrfs_feature_attr_group); + if (error) + btrfs_sysfs_remove_one(fs_info); return error; } -- cgit v1.2.3-70-g09d2 From 79da4fa4d9dcf8c948ef8b5848f747ef08f6e732 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:07:00 -0400 Subject: btrfs: publish unknown feature bits in sysfs With the compat and compat-ro bits, it's possible for file systems to exist that have features that aren't supported by the kernel's file system implementation yet still be mountable. This patch publishes read-only info on those features using a prefix:number format, where the number is the bit number rather than the shifted value. e.g. "compat:12" Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 832cf62151f..4a2f23ee163 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" @@ -131,6 +132,101 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) wait_for_completion(&fs_info->kobj_unregister); } +const char * const btrfs_feature_set_names[3] = { + [FEAT_COMPAT] = "compat", + [FEAT_COMPAT_RO] = "compat_ro", + [FEAT_INCOMPAT] = "incompat", +}; + +#define NUM_FEATURE_BITS 64 +static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; +static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; + +static void init_feature_attrs(void) +{ + struct btrfs_feature_attr *fa; + int set, i; + + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != + ARRAY_SIZE(btrfs_feature_attrs)); + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != + ARRAY_SIZE(btrfs_feature_attrs[0])); + + for (i = 0; btrfs_supported_feature_attrs[i]; i++) { + struct btrfs_feature_attr *sfa; + struct attribute *a = btrfs_supported_feature_attrs[i]; + sfa = attr_to_btrfs_feature_attr(a); + fa = &btrfs_feature_attrs[sfa->feature_set][sfa->feature_bit]; + + fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; + } + + for (set = 0; set < FEAT_MAX; set++) { + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + char *name = btrfs_unknown_feature_names[set][i]; + fa = &btrfs_feature_attrs[set][i]; + + if (fa->kobj_attr.attr.name) + continue; + + snprintf(name, 13, "%s:%u", + btrfs_feature_set_names[set], i); + + fa->kobj_attr.attr.name = name; + fa->kobj_attr.attr.mode = S_IRUGO; + fa->feature_set = set; + fa->feature_bit = 1ULL << i; + } + } +} + +static u64 supported_feature_masks[3] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, + [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, +}; + +static int add_unknown_feature_attrs(struct btrfs_fs_info *fs_info) +{ + int set; + + for (set = 0; set < FEAT_MAX; set++) { + int i, count, ret, index = 0; + struct attribute **attrs; + struct attribute_group agroup = { + .name = "features", + }; + u64 features = get_features(fs_info, set); + features &= ~supported_feature_masks[set]; + + count = hweight64(features); + + if (!count) + continue; + + attrs = kcalloc(count + 1, sizeof(void *), GFP_KERNEL); + + for (i = 0; i < NUM_FEATURE_BITS; i++) { + struct btrfs_feature_attr *fa; + + if (!(features & (1ULL << i))) + continue; + + fa = 
&btrfs_feature_attrs[set][i]; + attrs[index++] = &fa->kobj_attr.attr; + } + + attrs[index] = NULL; + agroup.attrs = attrs; + + ret = sysfs_merge_group(&fs_info->super_kobj, &agroup); + kfree(attrs); + if (ret) + return ret; + } + return 0; +} + /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; @@ -146,7 +242,15 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) error = sysfs_create_group(&fs_info->super_kobj, &btrfs_feature_attr_group); if (error) - btrfs_sysfs_remove_one(fs_info); + goto failure; + + error = add_unknown_feature_attrs(fs_info); + if (error) + goto failure; + + return 0; +failure: + btrfs_sysfs_remove_one(fs_info); return error; } @@ -157,6 +261,8 @@ int btrfs_init_sysfs(void) if (!btrfs_kset) return -ENOMEM; + init_feature_attrs(); + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) { kset_unregister(btrfs_kset); -- cgit v1.2.3-70-g09d2 From ba631941ef09c10e229661219dbd1707e56131d8 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:07:01 -0400 Subject: btrfs: add ability to change features via sysfs This patch adds the ability to change (set/clear) features while the file system is mounted. A bitmask is added for each feature set for the support to set and clear the bits. A message indicating which bit has been set or cleared is issued when it's been changed and also when permission or support for a particular bit has been denied. Since the the attributes can now be writable, we need to introduce another struct attribute to hold the different permissions. If neither set or clear is supported, the file will have 0444 permissions. If either set or clear is supported, the file will have 0644 permissions and the store handler will filter out the write based on the bitmask. 
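A hedged user-space sketch of driving the new store handler (the attribute path layout is assumed from the earlier sysfs patches, with <FSID> a placeholder for the filesystem UUID; root is required since the writable files are 0644):

/*
 * Sketch only: write "1" or "0" to a feature attribute. The store
 * handler parses the value with kstrtoul() and rejects bits that are
 * not in the SAFE_SET/SAFE_CLEAR masks with -EPERM.
 */
#include <fcntl.h>
#include <unistd.h>

static int set_feature(const char *attr_path, int enable)
{
	char c = enable ? '1' : '0';
	int fd = open(attr_path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, &c, 1) != 1) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

For example, set_feature("/sys/fs/btrfs/<FSID>/features/extended_iref", 1) asks the kernel to set the extended iref bit, provided that bit is in the SAFE_SET mask.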
Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/sysfs.h | 3 +- 2 files changed, 117 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4a2f23ee163..7d340f352b7 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -43,21 +43,131 @@ static u64 get_features(struct btrfs_fs_info *fs_info, return btrfs_super_incompat_flags(disk_super); } +static void set_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set, u64 features) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + btrfs_set_super_compat_flags(disk_super, features); + else if (set == FEAT_COMPAT_RO) + btrfs_set_super_compat_ro_flags(disk_super, features); + else + btrfs_set_super_incompat_flags(disk_super, features); +} + +static int can_modify_feature(struct btrfs_feature_attr *fa) +{ + int val = 0; + u64 set, clear; + switch (fa->feature_set) { + case FEAT_COMPAT: + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + break; + case FEAT_COMPAT_RO: + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + break; + case FEAT_INCOMPAT: + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + break; + default: + BUG(); + } + + if (set & fa->feature_bit) + val |= 1; + if (clear & fa->feature_bit) + val |= 2; + + return val; +} + static ssize_t btrfs_feature_attr_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val = 0; struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); if (fs_info) { - struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); u64 features = get_features(fs_info, fa->feature_set); if (features & fa->feature_bit) val = 1; - } + } else + val = can_modify_feature(fa); return snprintf(buf, PAGE_SIZE, "%d\n", val); } +static ssize_t btrfs_feature_attr_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t count) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + struct btrfs_trans_handle *trans; + u64 features, set, clear; + unsigned long val; + int ret; + + fs_info = to_fs_info(kobj); + if (!fs_info) + return -EPERM; + + ret = kstrtoul(skip_spaces(buf), 0, &val); + if (ret) + return ret; + + if (fa->feature_set == FEAT_COMPAT) { + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + } else if (fa->feature_set == FEAT_COMPAT_RO) { + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + } else { + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + } + + features = get_features(fs_info, fa->feature_set); + + /* Nothing to do */ + if ((val && (features & fa->feature_bit)) || + (!val && !(features & fa->feature_bit))) + return count; + + if ((val && !(set & fa->feature_bit)) || + (!val && !(clear & fa->feature_bit))) { + btrfs_info(fs_info, + "%sabling feature %s on mounted fs is not supported.", + val ? "En" : "Dis", fa->kobj_attr.attr.name); + return -EPERM; + } + + btrfs_info(fs_info, "%s %s feature flag", + val ? 
"Setting" : "Clearing", fa->kobj_attr.attr.name); + + trans = btrfs_start_transaction(fs_info->fs_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&fs_info->super_lock); + features = get_features(fs_info, fa->feature_set); + if (val) + features |= fa->feature_bit; + else + features &= ~fa->feature_bit; + set_features(fs_info, fa->feature_set, features); + spin_unlock(&fs_info->super_lock); + + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) + return ret; + + return count; +} + static umode_t btrfs_feature_visible(struct kobject *kobj, struct attribute *attr, int unused) { @@ -71,7 +181,9 @@ static umode_t btrfs_feature_visible(struct kobject *kobj, fa = attr_to_btrfs_feature_attr(attr); features = get_features(fs_info, fa->feature_set); - if (!(features & fa->feature_bit)) + if (can_modify_feature(fa)) + mode |= S_IWUSR; + else if (!(features & fa->feature_bit)) mode = 0; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index d7c61bdf04b..58c4b1f689d 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -31,7 +31,8 @@ struct btrfs_feature_attr { #define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ static struct btrfs_feature_attr btrfs_attr_##_name = { \ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ - btrfs_feature_attr_show, NULL), \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ .feature_set = _feature_set, \ .feature_bit = _prefix ##_## _feature_bit, \ } -- cgit v1.2.3-70-g09d2 From 3b02a68a636400590dd6831a5fc046f0a7909a77 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:07:02 -0400 Subject: btrfs: use feature attribute names to print better error messages Now that we have the feature name strings available in the kernel via the sysfs attributes, we can use them for printing better failure messages from the ioctl path. Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 35 ++++++++++++++++++++++++++++++----- fs/btrfs/sysfs.c | 33 ++++++++++++++++++++++++++++++++- fs/btrfs/sysfs.h | 2 ++ 3 files changed, 64 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4e105b5409..93b01fe6c73 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -56,6 +56,7 @@ #include "rcu-string.h" #include "send.h" #include "dev-replace.h" +#include "sysfs.h" static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff); @@ -4516,17 +4517,27 @@ static int btrfs_ioctl_get_features(struct file *file, void __user *arg) return 0; } -static int check_feature_bits(struct btrfs_root *root, const char *type, +static int check_feature_bits(struct btrfs_root *root, + enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) { + const char *type = btrfs_feature_set_names[set]; + char *names; u64 disallowed, unsupported; u64 set_mask = flags & change_mask; u64 clear_mask = ~flags & change_mask; unsupported = set_mask & ~supported_flags; if (unsupported) { - btrfs_warn(root->fs_info, + names = btrfs_printable_features(set, unsupported); + if (names) { + btrfs_warn(root->fs_info, + "this kernel does not support the %s feature bit%s", + names, strchr(names, ',') ? 
"s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, "this kernel does not support %s bits 0x%llx", type, unsupported); return -EOPNOTSUPP; @@ -4534,7 +4545,14 @@ static int check_feature_bits(struct btrfs_root *root, const char *type, disallowed = set_mask & ~safe_set; if (disallowed) { - btrfs_warn(root->fs_info, + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't set the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, "can't set %s bits 0x%llx while mounted", type, disallowed); return -EPERM; @@ -4542,7 +4560,14 @@ static int check_feature_bits(struct btrfs_root *root, const char *type, disallowed = clear_mask & ~safe_clear; if (disallowed) { - btrfs_warn(root->fs_info, + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't clear the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, "can't clear %s bits 0x%llx while mounted", type, disallowed); return -EPERM; @@ -4552,7 +4577,7 @@ static int check_feature_bits(struct btrfs_root *root, const char *type, } #define check_feature(root, change_mask, flags, mask_base) \ -check_feature_bits(root, # mask_base, change_mask, flags, \ +check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \ BTRFS_FEATURE_ ## mask_base ## _SUPP, \ BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7d340f352b7..562e346994d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -254,6 +254,31 @@ const char * const btrfs_feature_set_names[3] = { static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) +{ + size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ + int len = 0; + int i; + char *str; + + str = kmalloc(bufsize, GFP_KERNEL); + if (!str) + return str; + + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + const char *name; + + if (!(flags & (1ULL << i))) + continue; + + name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; + len += snprintf(str + len, bufsize - len, "%s%s", + len ? 
"," : "", name); + } + + return str; +} + static void init_feature_attrs(void) { struct btrfs_feature_attr *fa; @@ -264,11 +289,17 @@ static void init_feature_attrs(void) BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != ARRAY_SIZE(btrfs_feature_attrs[0])); + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); + for (i = 0; btrfs_supported_feature_attrs[i]; i++) { struct btrfs_feature_attr *sfa; struct attribute *a = btrfs_supported_feature_attrs[i]; + int bit; sfa = attr_to_btrfs_feature_attr(a); - fa = &btrfs_feature_attrs[sfa->feature_set][sfa->feature_bit]; + bit = ilog2(sfa->feature_bit); + fa = &btrfs_feature_attrs[sfa->feature_set][bit]; fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 58c4b1f689d..c49fd25c911 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -51,4 +51,6 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ #define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) #define attr_to_btrfs_feature_attr(a) \ to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); +extern const char * const btrfs_feature_set_names[3]; #endif /* _BTRFS_SYSFS_H_ */ -- cgit v1.2.3-70-g09d2 From 01e219e8069516cdb98594d417b8bb8d906ed30d Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:07:03 -0400 Subject: btrfs: add ioctl to export size of global metadata reservation btrfs filesystem df output will show the size of the metadata space and how much of it is used, and the user assumes that the difference is all usable space. Since that's not actually the case due to the global metadata reservation, we should provide the full picture to the user. This patch adds an ioctl that exports the size of the global metadata reservation so that btrfs filesystem df can report it. Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 16 ++++++++++++++++ include/uapi/linux/btrfs.h | 1 + 2 files changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 93b01fe6c73..60d3d37ef9a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3498,6 +3498,20 @@ out: return ret; } +static long btrfs_ioctl_global_rsv(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + u64 reserved; + + spin_lock(&block_rsv->lock); + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (arg && copy_to_user(arg, &reserved, sizeof(reserved))) + return -EFAULT; + return 0; +} + /* * there are many ways the trans_start and trans_end ioctls can lead * to deadlocks. 
They should only be used by applications that @@ -4706,6 +4720,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); + case BTRFS_IOC_GLOBAL_RSV: + return btrfs_ioctl_global_rsv(root, argp); case BTRFS_IOC_SYNC: { int ret; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b4d69092fbd..1b8a0f4c959 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -558,6 +558,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, __u64) #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ struct btrfs_ioctl_space_args) +#define BTRFS_IOC_GLOBAL_RSV _IOR(BTRFS_IOCTL_MAGIC, 20, __u64) #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ -- cgit v1.2.3-70-g09d2 From 6ab0a2029ceaedb78af807871820708b7353e3be Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:07:04 -0400 Subject: btrfs: publish allocation data in sysfs While trying to debug ENOSPC issues, it's helpful to understand what the kernel's view of the available space is. We export this information via ioctl, but sysfs files are more easily used. Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 5 ++ fs/btrfs/extent-tree.c | 82 +++++++++++++++++++++++++-- fs/btrfs/sysfs.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/sysfs.h | 8 +++ 4 files changed, 238 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5c888fbf03..f608306b5d8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1151,6 +1151,9 @@ struct btrfs_space_info { spinlock_t lock; struct rw_semaphore groups_sem; wait_queue_head_t wait; + + struct kobject kobj; + struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; #define BTRFS_BLOCK_RSV_GLOBAL 1 @@ -1526,6 +1529,7 @@ struct btrfs_fs_info { int thread_pool_size; struct kobject super_kobj; + struct kobject *space_info_kobj; struct completion kobj_unregister; int do_barriers; int closing; @@ -3178,6 +3182,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d15b4fc0755..e094d02ea76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,6 +35,7 @@ #include "locking.h" #include "free-space-cache.h" #include "math.h" +#include "sysfs.h" #undef SCRAMBLE_DELAYED_REFS @@ -3408,6 +3409,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 
bytes_used, struct btrfs_space_info **space_info) @@ -3463,11 +3481,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + *space_info = found; list_add_rcu(&found->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = found; - return 0; + + return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -6152,11 +6180,29 @@ int __get_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int get_block_group_index(struct btrfs_block_group_cache *cache) { return __get_raid_index(cache->flags); } +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; +} + enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, @@ -8340,6 +8386,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) release_global_block_rsv(info); while (!list_empty(&info->space_info)) { + int i; + space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); @@ -8350,9 +8398,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) dump_space_info(space_info, 0, 0); } } - percpu_counter_destroy(&space_info->total_bytes_pinned); list_del(&space_info->list); - kfree(space_info); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = &space_info->block_group_kobjs[i]; + if (kobj->parent) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); } return 0; } @@ -8363,6 +8419,19 @@ static void __link_block_group(struct btrfs_space_info *space_info, int index = get_block_group_index(cache); down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) { + struct kobject *kobj = &space_info->block_group_kobjs[index]; + int ret; + + kobject_get(&space_info->kobj); /* put in release */ + ret = kobject_init_and_add(kobj, &btrfs_raid_ktype, + &space_info->kobj, + get_raid_name(index)); + if (ret) { + pr_warn("btrfs: failed to add kobject for block cache. 
ignoring.\n"); + kobject_put(&space_info->kobj); + } + } list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); } @@ -8803,8 +8872,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) + if (list_empty(&block_group->space_info->block_groups[index])) { + kobject_del(&block_group->space_info->block_group_kobjs[index]); + kobject_put(&block_group->space_info->block_group_kobjs[index]); clear_avail_alloc_bits(root->fs_info, block_group->flags); + } up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 562e346994d..e060958a638 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -219,6 +219,140 @@ static const struct attribute_group btrfs_feature_attr_group = { .attrs = btrfs_supported_feature_attrs, }; +static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) +{ + u64 val; + if (lock) + spin_lock(lock); + val = *value_ptr; + if (lock) + spin_unlock(lock); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static ssize_t global_rsv_size_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); + +static ssize_t global_rsv_reserved_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); + +#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); +BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + +{ + struct btrfs_space_info *sinfo = to_space_info(kobj->parent); + struct btrfs_block_group_cache *block_group; + int index = kobj - sinfo->block_group_kobjs; + u64 val = 0; + + down_read(&sinfo->groups_sem); + list_for_each_entry(block_group, &sinfo->block_groups[index], list) { + if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + val += block_group->key.offset; + else + val += btrfs_block_group_used(&block_group->item); + } + up_read(&sinfo->groups_sem); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static struct attribute *raid_attributes[] = { + BTRFS_RAID_ATTR_PTR(total_bytes), + BTRFS_RAID_ATTR_PTR(used_bytes), + NULL +}; + +static void release_raid_kobj(struct kobject *kobj) +{ + kobject_put(kobj->parent); +} + +struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_attrs = raid_attributes, +}; + +#define SPACE_INFO_ATTR(field) \ +static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_space_info *sinfo = to_space_info(kobj); \ + return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ +} \ +BTRFS_ATTR(field, 0444, 
btrfs_space_info_show_##field) + +static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); + return snprintf(buf, PAGE_SIZE, "%lld\n", val); +} + +SPACE_INFO_ATTR(flags); +SPACE_INFO_ATTR(total_bytes); +SPACE_INFO_ATTR(bytes_used); +SPACE_INFO_ATTR(bytes_pinned); +SPACE_INFO_ATTR(bytes_reserved); +SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(disk_used); +SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); + +static struct attribute *space_info_attrs[] = { + BTRFS_ATTR_PTR(flags), + BTRFS_ATTR_PTR(total_bytes), + BTRFS_ATTR_PTR(bytes_used), + BTRFS_ATTR_PTR(bytes_pinned), + BTRFS_ATTR_PTR(bytes_reserved), + BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(disk_used), + BTRFS_ATTR_PTR(disk_total), + BTRFS_ATTR_PTR(total_bytes_pinned), + NULL, +}; + +static void space_info_release(struct kobject *kobj) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + percpu_counter_destroy(&sinfo->total_bytes_pinned); + kfree(sinfo); +} + +struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_attrs = space_info_attrs, +}; + +static const struct attribute *allocation_attrs[] = { + BTRFS_ATTR_PTR(global_rsv_reserved), + BTRFS_ATTR_PTR(global_rsv_size), + NULL, +}; + static void btrfs_release_super_kobj(struct kobject *kobj) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); @@ -239,6 +373,9 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { + sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->space_info_kobj); + kobject_put(fs_info->space_info_kobj); kobject_del(&fs_info->super_kobj); kobject_put(&fs_info->super_kobj); wait_for_completion(&fs_info->kobj_unregister); @@ -391,6 +528,17 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; + fs_info->space_info_kobj = kobject_create_and_add("allocation", + &fs_info->super_kobj); + if (!fs_info->space_info_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (error) + goto failure; + return 0; failure: btrfs_sysfs_remove_one(fs_info); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c49fd25c911..f3cea3710d4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -22,6 +22,12 @@ static struct kobj_attribute btrfs_attr_##_name = \ BTRFS_ATTR_RW(_name, _mode, _show, NULL) #define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) +#define BTRFS_RAID_ATTR(_name, _show) \ +static struct kobj_attribute btrfs_raid_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) +#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) + + struct btrfs_feature_attr { struct kobj_attribute kobj_attr; enum btrfs_feature_set feature_set; @@ -53,4 +59,6 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ to_btrfs_feature_attr(attr_to_btrfs_attr(a)) char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; +extern struct kobj_type space_info_ktype; +extern struct kobj_type btrfs_raid_ktype; #endif /* _BTRFS_SYSFS_H_ */ -- cgit v1.2.3-70-g09d2 From 29e5be240a3caf175364fdeecb0441dff500d5d9 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:07:05 
-0400 Subject: btrfs: publish device membership in sysfs Now that we have the infrastructure for per-super attributes, we can publish device membership in /sys/fs/btrfs//devices. The information is published as symlinks to the block devices. Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/sysfs.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f608306b5d8..dbe9b313cdf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1530,6 +1530,7 @@ struct btrfs_fs_info { struct kobject super_kobj; struct kobject *space_info_kobj; + struct kobject *device_dir_kobj; struct completion kobj_unregister; int do_barriers; int closing; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e060958a638..ec631539111 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -23,11 +23,13 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "sysfs.h" +#include "volumes.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); @@ -374,6 +376,8 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->device_dir_kobj); + kobject_put(fs_info->device_dir_kobj); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); kobject_del(&fs_info->super_kobj); @@ -507,6 +511,30 @@ static int add_unknown_feature_attrs(struct btrfs_fs_info *fs_info) return 0; } +static int add_device_membership(struct btrfs_fs_info *fs_info) +{ + int error = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *dev; + + fs_info->device_dir_kobj = kobject_create_and_add("devices", + &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) + return -ENOMEM; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + struct hd_struct *disk = dev->bdev->bd_part; + struct kobject *disk_kobj = &part_to_dev(disk)->kobj; + + error = sysfs_create_link(fs_info->device_dir_kobj, + disk_kobj, disk_kobj->name); + if (error) + break; + } + + return error; +} + /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; @@ -528,6 +556,10 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; + error = add_device_membership(fs_info); + if (error) + goto failure; + fs_info->space_info_kobj = kobject_create_and_add("allocation", &fs_info->super_kobj); if (!fs_info->space_info_kobj) { -- cgit v1.2.3-70-g09d2 From f8ba9c11f8320be0d553d4b18000e35f7ad672ac Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:07:06 -0400 Subject: btrfs: publish fs label in sysfs This adds a writeable attribute which describes the label. 
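A minimal userspace sketch of driving the new attribute, assuming the "label" file under the filesystem's directory in /sys/fs/btrfs is passed in as a path; the helper below is illustrative only and is not part of the patch. As the diff below shows, btrfs_label_store() rejects writes of BTRFS_LABEL_SIZE bytes or more with -EINVAL.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Illustrative only: write a new label through the sysfs attribute. */
static int set_btrfs_label(const char *label_path, const char *new_label)
{
	int fd = open(label_path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	/* The kernel side enforces the BTRFS_LABEL_SIZE limit. */
	n = write(fd, new_label, strlen(new_label));
	close(fd);
	return n < 0 ? -1 : 0;
}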
Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ec631539111..669fdf777b6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -355,6 +355,49 @@ static const struct attribute *allocation_attrs[] = { NULL, }; +static ssize_t btrfs_label_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); +} + +static ssize_t btrfs_label_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->fs_root; + int ret; + + if (len >= BTRFS_LABEL_SIZE) { + pr_err("btrfs: unable to set label with more than %d bytes\n", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + strcpy(fs_info->super_copy->label, buf); + spin_unlock(&root->fs_info->super_lock); + ret = btrfs_commit_transaction(trans, root); + + if (!ret) + return len; + + return ret; +} +BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); + +static struct attribute *btrfs_attrs[] = { + BTRFS_ATTR_PTR(label), + NULL, +}; + static void btrfs_release_super_kobj(struct kobject *kobj) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); @@ -364,6 +407,7 @@ static void btrfs_release_super_kobj(struct kobject *kobj) static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_super_kobj, + .default_attrs = btrfs_attrs, }; static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) -- cgit v1.2.3-70-g09d2 From 99e22f783bbcd048819975b8a1463f39a9966bcf Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:22 +0100 Subject: btrfs: remove unused variable from btrfs_new_inode Variable owner in btrfs_new_inode is unused since commit d82a6f1d7e8b61ed5996334d0db66651bb43641d (Btrfs: kill BTRFS_I(inode)->block_group) Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0c0dc8f07f..41079c6ed96 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5372,7 +5372,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; unsigned long ptr; int ret; - int owner; path = btrfs_alloc_path(); if (!path) @@ -5418,11 +5417,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - if (S_ISDIR(mode)) - owner = 0; - else - owner = 1; - key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; -- cgit v1.2.3-70-g09d2 From 71db2a7751b6c4befad06e66a26d7f2b8dd3c1b9 Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:23 +0100 Subject: btrfs: remove unused variables from disk-io.c Remove unused variables: * tree from csum_dirty_buffer, * tree from btree_readpage_end_io_hook, * tree from btree_writepages, * bytenr from btrfs_create_tree, * fs_info from end_workqueue_fn. 
Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 81f3433fe4f..d84667379f0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -465,13 +465,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { - struct extent_io_tree *tree; u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; - tree = &BTRFS_I(page->mapping->host)->io_tree; - eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) return 0; @@ -570,7 +567,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) { - struct extent_io_tree *tree; u64 found_start; int found_level; struct extent_buffer *eb; @@ -581,7 +577,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (!page->private) goto out; - tree = &BTRFS_I(page->mapping->host)->io_tree; eb = (struct extent_buffer *)page->private; /* the pending IO might have been the only thing that kept this buffer @@ -968,11 +963,9 @@ static int btree_migratepage(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree; struct btrfs_fs_info *fs_info; int ret; - tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { if (wbc->for_kupdate) @@ -1274,7 +1267,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct btrfs_key key; int ret = 0; - u64 bytenr; uuid_le uuid; root = btrfs_alloc_root(fs_info); @@ -1296,7 +1288,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, goto fail; } - bytenr = leaf->start; memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); @@ -1685,12 +1676,10 @@ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct end_io_wq *end_io_wq; - struct btrfs_fs_info *fs_info; int error; end_io_wq = container_of(work, struct end_io_wq, work); bio = end_io_wq->bio; - fs_info = end_io_wq->info; error = end_io_wq->error; bio->bi_private = end_io_wq->private; -- cgit v1.2.3-70-g09d2 From 4b447bfac6c6b367a594a22cb220152f835fc0ed Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:24 +0100 Subject: btrfs: remove unused variable from find_free_extent The variable found_uncached_bg in find_free_extent is not used since commit 285ff5af6ce358e73f53b55c9efadd4335f4c2ff (Btrfs: remove the ideal caching code) Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e094d02ea76..fe651f4d1b9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6239,7 +6239,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, int index = __get_raid_index(flags); int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; - bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -6357,7 +6356,6 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - found_uncached_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; -- cgit v1.2.3-70-g09d2 From 50892bac3b93afa5cc8de6541a8013a21bef3f74 Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:25 +0100 Subject: btrfs: remove unused variables from extent_io.c Remove unused variables: * tree from end_bio_extent_writepage, * item from extent_fiemap. Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ff43802a7c8..d97250a4e8e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2333,13 +2333,11 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) static void end_bio_extent_writepage(struct bio *bio, int err) { struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_io_tree *tree; u64 start; u64 end; do { struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will @@ -4082,12 +4080,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0; u64 em_len = 0; u64 em_end = 0; - unsigned long emflags; if (len == 0) return -EINVAL; @@ -4112,8 +4108,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } WARN_ON(!ret); path->slots[0]--; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); @@ -4181,7 +4175,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; - emflags = em->flags; disko = 0; flags = 0; -- cgit v1.2.3-70-g09d2 From f0265bb4099887b1ffb45779026d29c109bfa5bf Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:26 +0100 Subject: btrfs: remove unused variable from setup_cluster_no_bitmap The variable window_start in setup_cluster_no_bitmap is not used since commit 1bb91902dc90e25449893e693ad45605cb08fbe5 (Btrfs: revamp clustered allocation logic) Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 057be95b1e1..332aa33d02f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2421,7 +2421,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry = NULL; struct btrfs_free_space *last; struct rb_node *node; - u64 window_start; u64 window_free; u64 max_extent; u64 total_size = 0; @@ -2443,7 +2442,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); } - window_start 
= entry->offset; window_free = entry->bytes; max_extent = entry->bytes; first = entry; -- cgit v1.2.3-70-g09d2 From ce3e7f1073ee4958a30e5677e868f79292bc53a6 Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:27 +0100 Subject: btrfs: remove unused variable from scrub_fixup_nodatasum Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1fd3f33c330..e5481ae1275 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -704,13 +704,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) struct scrub_fixup_nodatasum *fixup; struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); sctx = fixup->sctx; - fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { -- cgit v1.2.3-70-g09d2 From e94acd86d48d61a5d919d807ed1efa0d8c1cd5ae Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:28 +0100 Subject: btrfs: replace path->slots[0] with otherwise unused variable 'slot' Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3775947429b..826b98c211a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1683,8 +1683,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, btrfs_release_path(path); leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); cur_offset = 0; while (cur_offset < item_size) { -- cgit v1.2.3-70-g09d2 From a3df41ee377b2766a7525f9ca05207698efe4551 Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:29 +0100 Subject: btrfs: fix unused variables in qgroup.c Use otherwise unused local variables slot in update_qgroup_limit_item and in update_qgroup_info_item, and remove unused variable ins from btrfs_qgroup_account_ref. 
Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4e6ef490619..bd0b058f2a2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -644,8 +644,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - qgroup_limit = btrfs_item_ptr(l, path->slots[0], - struct btrfs_qgroup_limit_item); + qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); @@ -687,8 +686,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - qgroup_info = btrfs_item_ptr(l, path->slots[0], - struct btrfs_qgroup_info_item); + qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); @@ -1349,7 +1347,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_key ins; struct btrfs_root *quota_root; u64 ref_root; struct btrfs_qgroup *qgroup; @@ -1363,10 +1360,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, BUG_ON(!fs_info->quota_root); - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) { struct btrfs_delayed_tree_ref *ref; -- cgit v1.2.3-70-g09d2 From e33d5c3d6d61518c7f115af6d11d3dffa230d31f Mon Sep 17 00:00:00 2001 From: Kelley Nielsen Date: Mon, 4 Nov 2013 19:33:33 -0800 Subject: btrfs: bootstrap generic btrfs_find_item interface There are many btrfs functions that manually search the tree for an item. They all reimplement the same mechanism and differ in the conditions that they use to find the item. __inode_info() is one such example. Zach Brown proposed creating a new interface to take the place of these functions. This patch is the first step to creating the interface. A new function, btrfs_find_item, has been added to ctree.c and prototyped in ctree.h. It is identical to __inode_info, except that the order of the parameters has been rearranged to more closely match those of similar functions elsewhere in the code (now, root and path come first, then the objectid, offset and type, and the key to be filled in last). __inode_info's callers have been set to call this new function instead, and __inode_info itself has been removed.
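As a reference for the new calling convention, a minimal hypothetical caller (not part of the patch) that looks up an inode item with the helper could look like this, following the signature added in the diff below:

/* Hypothetical example: find the INODE_ITEM for inode number 'inum'.
 * Parameter order follows the new interface: root and path first, then
 * objectid, offset, type, and the key to be filled in last. */
static int example_find_inode_item(struct btrfs_root *fs_root,
				   struct btrfs_path *path, u64 inum,
				   struct btrfs_key *found_key)
{
	return btrfs_find_item(fs_root, path, inum, 0,
			       BTRFS_INODE_ITEM_KEY, found_key);
}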
Signed-off-by: Kelley Nielsen Suggested-by: Zach Brown Reviewed-by: Josh Triplett Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 40 ++++------------------------------------ fs/btrfs/ctree.c | 37 +++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 2 ++ 3 files changed, 43 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 826b98c211a..6a3f7f50ad3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1107,38 +1107,6 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return 0; } - -static int __inode_info(u64 inum, u64 ioff, u8 key_type, - struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_key *found_key) -{ - int ret; - struct btrfs_key key; - struct extent_buffer *eb; - - key.type = key_type; - key.objectid = inum; - key.offset = ioff; - - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if (ret < 0) - return ret; - - eb = path->nodes[0]; - if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_root, path); - if (ret) - return ret; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); - if (found_key->type != key.type || found_key->objectid != key.objectid) - return 1; - - return 0; -} - /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1146,16 +1114,16 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path) { struct btrfs_key key; - return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, - &key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_ITEM_KEY, &key); } static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) { - return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, - found_key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_REF_KEY, found_key); } int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index bcd0bd85e3e..141c15ca294 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2461,6 +2461,43 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } +/* Proposed generic search function, meant to take the place of the +* various small search helper functions throughout the code and standardize +* the search interface. Right now, it only replaces the former __inode_info +* in backref.c. +*/ +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + + return 0; +} + /* * look for key in the tree. 
path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dbe9b313cdf..a58611fbf00 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3371,6 +3371,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); -- cgit v1.2.3-70-g09d2 From 75ac2dd907013b44edbdec16f8969d14811149c9 Mon Sep 17 00:00:00 2001 From: Kelley Nielsen Date: Mon, 4 Nov 2013 19:35:58 -0800 Subject: btrfs: expand btrfs_find_item() to include find_root_ref functionality This patch is the second step in bootstrapping the btrfs_find_item interface. The btrfs_find_root_ref() is similar to the former __inode_info(); it accepts four of its parameters, and duplicates the first half of its functionality. Replace the one former call to btrfs_find_root_ref() with a call to btrfs_find_item(), along with the defined key type that was used internally by btrfs_find_root ref, and a null found key. In btrfs_find_item(), add a test for the null key at the place where the functionality of btrfs_find_root_ref() ends; btrfs_find_item() then returns if the test passes. Finally, remove btrfs_find_root_ref(). Signed-off-by: Kelley Nielsen Suggested-by: Zach Brown Reviewed-by: Josh Triplett Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 ++++++++-- fs/btrfs/inode.c | 6 +++--- fs/btrfs/root-tree.c | 15 --------------- 3 files changed, 11 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 141c15ca294..64b87894460 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2464,7 +2464,13 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, /* Proposed generic search function, meant to take the place of the * various small search helper functions throughout the code and standardize * the search interface. Right now, it only replaces the former __inode_info -* in backref.c. +* in backref.c, and the former btrfs_find_root_ref in root-tree.c. +* +* If a null key is passed, it returns immediately after running +* btrfs_search_slot, leaving the path filled as it is and passing its +* return value upward. If a real key is passed, it will set the caller's +* path to point to the first item in the tree after its specified +* objectid, type, and offset for which objectid and type match the input. 
*/ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 iobjectid, u64 ioff, u8 key_type, @@ -2479,7 +2485,7 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, key.offset = ioff; ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if (ret < 0) + if ((ret < 0) || (found_key == NULL)) return ret; eb = path->nodes[0]; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41079c6ed96..5a5de36d39f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4673,9 +4673,9 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_root_ref(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid); + ret = btrfs_find_item(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid, BTRFS_ROOT_REF_KEY, NULL); if (ret) { if (ret < 0) err = ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ec71ea44d2b..fcc10ebb71a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -400,21 +400,6 @@ out: return err; } -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id) -{ - struct btrfs_key key; - int ret; - - key.objectid = root_id; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = ref_id; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - return ret; -} - /* * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY * or BTRFS_ROOT_BACKREF_KEY. -- cgit v1.2.3-70-g09d2 From 3f870c28990015a1fd6c67807efcdb02a75b35e1 Mon Sep 17 00:00:00 2001 From: Kelley Nielsen Date: Mon, 4 Nov 2013 19:37:39 -0800 Subject: btrfs: expand btrfs_find_item() to include find_orphan_item functionality This is the third step in bootstrapping the btrfs_find_item interface. The function find_orphan_item(), in orphan.c, is similar to the two functions already replaced by the new interface. It uses two parameters, which are already present in the interface, and is nearly identical to the function brought in in the previous patch. Replace the two calls to find_orphan_item() with calls to btrfs_find_item(), with the defined objectid and type that was used internally by find_orphan_item(), a null path, and a null key. Add a test for a null path to btrfs_find_item, and if it passes, allocate and free the path. Finally, remove find_orphan_item(). Signed-off-by: Kelley Nielsen Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 26 +++++++++++++------------- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/orphan.c | 20 -------------------- fs/btrfs/tree-log.c | 3 ++- 4 files changed, 17 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 64b87894460..22629809c9c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2461,32 +2461,32 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } -/* Proposed generic search function, meant to take the place of the -* various small search helper functions throughout the code and standardize -* the search interface. Right now, it only replaces the former __inode_info -* in backref.c, and the former btrfs_find_root_ref in root-tree.c. -* -* If a null key is passed, it returns immediately after running -* btrfs_search_slot, leaving the path filled as it is and passing its -* return value upward. 
If a real key is passed, it will set the caller's -* path to point to the first item in the tree after its specified -* objectid, type, and offset for which objectid and type match the input. -*/ -int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, u64 iobjectid, u64 ioff, u8 key_type, struct btrfs_key *found_key) { int ret; struct btrfs_key key; struct extent_buffer *eb; + struct btrfs_path *path; key.type = key_type; key.objectid = iobjectid; key.offset = ioff; + if (found_path == NULL) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } else + path = found_path; + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if ((ret < 0) || (found_key == NULL)) + if ((ret < 0) || (found_key == NULL)) { + if (path != found_path) + btrfs_free_path(path); return ret; + } eb = path->nodes[0]; if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d84667379f0..4ecee0a009c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1608,7 +1608,8 @@ again: if (ret) goto fail; - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, + location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret < 0) goto fail; if (ret == 0) diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 24cad1695af..65793edb38c 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -69,23 +69,3 @@ out: btrfs_free_path(path); return ret; } - -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - key.type = BTRFS_ORPHAN_ITEM_KEY; - key.offset = offset; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - btrfs_free_path(path); - return ret; -} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e7d7a837512..ba2f15109da 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1238,7 +1238,8 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; - ret = btrfs_find_orphan_item(root, offset); + ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, + offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); return ret; -- cgit v1.2.3-70-g09d2 From 33b98f22711142b7f92ad8e10f21fc3f921ffde2 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Wed, 13 Nov 2013 17:46:48 +0800 Subject: btrfs: cleanup: removed unused 'btrfs_get_inode_ref_index' Found by uselex.rb: > btrfs_get_inode_ref_index: [R]: exported from: fs/btrfs/inode-item.o fs/btrfs/btrfs.o fs/btrfs/built-in.o Signed-off-by: Sergei Trofimovich Reviewed-by: David Stebra Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 6 ----- fs/btrfs/inode-item.c | 65 --------------------------------------------------- 2 files changed, 71 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a58611fbf00..47835f5850d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3587,12 +3587,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); -int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, - struct 
btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod, - u64 *ret_index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index ec82fae0709..2be38df703c 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -91,32 +91,6 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, return 0; } -static struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow) -{ - int ret; - struct btrfs_key key; - struct btrfs_inode_ref *ref; - - key.objectid = inode_objectid; - key.type = BTRFS_INODE_REF_KEY; - key.offset = ref_objectid; - - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return NULL; - if (!find_name_in_backref(path, name, name_len, &ref)) - return NULL; - return ref; -} - /* Returns NULL if no extref found */ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, @@ -144,45 +118,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return extref; } -int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod, - u64 *ret_index) -{ - struct btrfs_inode_ref *ref; - struct btrfs_inode_extref *extref; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - - ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len, - inode_objectid, ref_objectid, ins_len, - cow); - if (IS_ERR(ref)) - return PTR_ERR(ref); - - if (ref != NULL) { - *ret_index = btrfs_inode_ref_index(path->nodes[0], ref); - return 0; - } - - btrfs_release_path(path); - - extref = btrfs_lookup_inode_extref(trans, root, path, name, - name_len, inode_objectid, - ref_objectid, ins_len, cow); - if (IS_ERR(extref)) - return PTR_ERR(extref); - - if (extref) { - *ret_index = btrfs_inode_extref_index(path->nodes[0], extref); - return 0; - } - - return -ENOENT; -} - static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, -- cgit v1.2.3-70-g09d2 From 27a0dd61a5cf742ebcd7a82c47be2502b1113eff Mon Sep 17 00:00:00 2001 From: Frank Holton Date: Tue, 12 Nov 2013 19:22:53 -0500 Subject: Btrfs: make btrfs_debug match pr_debug handling related to DEBUG The kernel macro pr_debug is defined as a empty statement when DEBUG is not defined. Make btrfs_debug match pr_debug to avoid spamming the kernel log with debug messages Signed-off-by: Frank Holton Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 47835f5850d..7158c97fdc5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3816,8 +3816,14 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +#ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) 
\ + no_printk(KERN_DEBUG fmt, ##args) +#endif #ifdef CONFIG_BTRFS_ASSERT -- cgit v1.2.3-70-g09d2 From 43d87fa23154d135a2a1006bc6656ae73ae84190 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 18 Nov 2013 14:24:20 +0100 Subject: btrfs: reserve no transaction units in btrfs_feature_attr_store Added in patch "btrfs: add ability to change features via sysfs", modifications to superblock don't need to reserve metadata blocks when starting a transaction. Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 669fdf777b6..8fdc0520e29 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -150,7 +150,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, btrfs_info(fs_info, "%s %s feature flag", val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); - trans = btrfs_start_transaction(fs_info->fs_root, 1); + trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); -- cgit v1.2.3-70-g09d2 From cc37bb04201217b7acb11213e16cb5530c30da8f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 19 Nov 2013 13:36:21 +0100 Subject: btrfs: replace BUG in can_modify_feature We don't need to crash hard here, it's just reading a sysfs file. The values considered in switch are from a fixed set, the default case should not happen at all. Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8fdc0520e29..b535285642d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -75,7 +75,9 @@ static int can_modify_feature(struct btrfs_feature_attr *fa) clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; break; default: - BUG(); + printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n", + fa->feature_set); + return 0; } if (set & fa->feature_bit) -- cgit v1.2.3-70-g09d2 From b37392ea86761e97d46b567667ff158e8bb67b72 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 10 Dec 2013 19:25:03 +0800 Subject: Btrfs: cleanup unnecessary parameter and variant of prepare_pages() - the caller has gotten the inode object, needn't pass the file object. And if so, we needn't define a inode pointer variant. - the position should be aligned by the page size not sector size, so we also needn't pass the root object into prepare_pages(). Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c77da440146..5591c67b1b4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1239,22 +1239,20 @@ static int prepare_uptodate_page(struct page *page, u64 pos, * waits for data=ordered extents to finish before allowing the pages to be * modified. 
*/ -static noinline int prepare_pages(struct btrfs_root *root, struct file *file, - struct page **pages, size_t num_pages, - loff_t pos, unsigned long first_index, - size_t write_bytes, bool force_uptodate) +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate) { struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = file_inode(file); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili = 0; u64 start_pos; u64 last_pos; - start_pos = pos & ~((u64)root->sectorsize - 1); + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; again: @@ -1462,8 +1460,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, write_bytes, + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, force_page_uptodate); if (ret) break; -- cgit v1.2.3-70-g09d2 From 376cc685cb3b43a6509de0b6a343c4079f23c239 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 10 Dec 2013 19:25:04 +0800 Subject: Btrfs: fix the reserved space leak caused by the race between nonlock dio and buffered io When we ran sysbench on the fs with compression, the following WARN_ONs were triggered: fs/btrfs/inode.c:7829 WARN_ON(BTRFS_I(inode)->outstanding_extents); fs/btrfs/inode.c:7830 WARN_ON(BTRFS_I(inode)->reserved_extents); fs/btrfs/inode.c:7832 WARN_ON(BTRFS_I(inode)->csum_bytes); Steps to reproduce: # mkfs.btrfs -f # mount -o compress # cd # sysbench --test=fileio --num-threads=8 --file-total-size=8G \ > --file-block-size=32K --file-io-mode=rndwr --file-fsync-freq=0 \ > --file-fsync-end=no --max-requests=300000 --file-extra-flags=direct \ > --file-test-mode=sync prepare # cd - # umount # mount -o compress # cd # sysbench --test=fileio --num-threads=8 --file-total-size=8G \ > --file-block-size=32K --file-io-mode=rndwr --file-fsync-freq=0 \ > --file-fsync-end=no --max-requests=300000 --file-extra-flags=direct \ > --file-test-mode=sync run # cd - # umount The reason of this problem is: Task0 Task1 btrfs_direct_IO unlock(&inode->i_mutex) lock(&inode->i_mutex) reserve_space() prepare_pages() lock_extent() clear_extent() unlock_extent() lock_extent() test_extent(uptodate) return false copy_data() set_delalloc_extent() extent need compress go back to buffered write clear_extent(DELALLOC | DIRTY) unlock_extent() Task 0 and 1 wrote the same place, and task0 cleared the delalloc flag which was set by task1, it made the dirty pages in that extents couldn't be flushed into the disk, so the reserved space for that extent was not released at the end. This patch fixes the above bug by unlocking the extent after the delalloc. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 131 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 84 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5591c67b1b4..6cd003c3f05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1235,27 +1235,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos, } /* - * this gets pages into the page cache and locks them down, it also properly - * waits for data=ordered extents to finish before allowing the pages to be - * modified. 
+ * this just gets pages into the page cache and locks them down. */ static noinline int prepare_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, bool force_uptodate) { - struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int err = 0; - int faili = 0; - u64 start_pos; - u64 last_pos; - - start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); - last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + int err; + int faili; -again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, mask | __GFP_WRITE); @@ -1278,57 +1269,85 @@ again: } wait_on_page_writeback(pages[i]); } - faili = num_pages - 1; - err = 0; + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. + * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + u64 *lockstart, u64 *lockend, + struct extent_state **cached_state) +{ + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - last_pos - 1); + start_pos, last_pos, 0, cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); if (ordered && ordered->file_offset + ordered->len > start_pos && - ordered->file_offset < last_pos) { + ordered->file_offset <= last_pos) { btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, - &cached_state, GFP_NOFS); + start_pos, last_pos, + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); page_cache_release(pages[i]); } - err = btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); - if (err) - goto fail; - goto again; + ret = btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos + 1); + if (ret) + return ret; + else + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, &cached_state, - GFP_NOFS); + 0, 0, cached_state, GFP_NOFS); + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; } + for (i = 0; i < num_pages; i++) { if (clear_page_dirty_for_io(pages[i])) account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } - return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - page_cache_release(pages[faili]); - faili--; - } - return err; + return ret; } static noinline int check_can_nocow(struct inode *inode, loff_t pos, @@ -1379,13 
+1398,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + struct extent_state *cached_state = NULL; u64 release_bytes = 0; + u64 lockstart; + u64 lockend; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; + bool need_unlock; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / @@ -1454,7 +1477,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = reserve_bytes; - + need_unlock = false; +again: /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the @@ -1466,6 +1490,18 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (ret) break; + ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, + pos, &lockstart, &lockend, + &cached_state); + if (ret < 0) { + if (ret == -EAGAIN) + goto again; + break; + } else if (ret > 0) { + need_unlock = true; + ret = 0; + } + copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1510,19 +1546,20 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; - if (copied > 0) { + + if (copied > 0) ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); - if (ret) { - btrfs_drop_pages(pages, num_pages); - break; - } - } - - release_bytes = 0; + if (need_unlock) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state, + GFP_NOFS); btrfs_drop_pages(pages, num_pages); + if (ret) + break; + release_bytes = 0; if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); u64 lockend = lockstart + -- cgit v1.2.3-70-g09d2 From 6126e3caf7468a07cc1a8239d9e95090acedd3ca Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 19 Nov 2013 16:19:24 +0000 Subject: Btrfs: fix ordered extent check in btrfs_punch_hole If the ordered extent's last byte was 1 less than our region's start byte, we would unnecessarily wait for the completion of that ordered extent, because it doesn't intersect our target range. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6cd003c3f05..740ae8c7170 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2164,7 +2164,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * we need to try again. 
*/ if ((!ordered || - (ordered->file_offset + ordered->len < lockstart || + (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_UPTODATE, 0, -- cgit v1.2.3-70-g09d2 From 0647bf564f1e35975e84f152dcba1a1ad54fbe7e Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 20 Nov 2013 09:01:52 +0800 Subject: Btrfs: improve forever loop when doing balance relocation We hit a forever loop when doing balance relocation: we first reserve 4M (node size is 16k), and within the transaction we try to add extra reservation for snapshot roots; this will return -EAGAIN if there is already a thread flushing space to reserve space. We will do this again and again as the filesystem becomes nearly full. If the above '-EAGAIN' case happens, we try to refill a larger reservation outside of the transaction, and this will return earlier in the enospc case; however, this doesn't really hurt because it makes no sense to do balance relocation with the filesystem nearly full. Miao Xie helped a lot to track this issue, thanks. Signed-off-by: Wang Shilong Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 74 +++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 429c73c374b..63708f77c8e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -94,6 +94,7 @@ struct backref_edge { #define LOWER 0 #define UPPER 1 +#define RELOCATION_RESERVED_NODES 256 struct backref_cache { /* red black tree of all backref nodes in the cache */ @@ -176,6 +177,8 @@ struct reloc_control { u64 merging_rsv_size; /* size of relocated tree nodes */ u64 nodes_relocated; + /* reserved size for block group relocation*/ + u64 reserved_bytes; u64 search_start; u64 extents_found; @@ -184,7 +187,6 @@ struct reloc_control { unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; unsigned int found_file_extent:1; - unsigned int commit_transaction:1; }; /* stages of data relocation */ @@ -2590,28 +2592,36 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, struct btrfs_root *root = rc->extent_root; u64 num_bytes; int ret; + u64 tmp; num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); + rc->reserved_bytes += num_bytes; + ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { - if (ret == -EAGAIN) - rc->commit_transaction = 1; + if (ret == -EAGAIN) { + tmp = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + } return ret; } return 0; } -static void release_metadata_space(struct reloc_control *rc, - struct backref_node *node) -{ - u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); -} - /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location.
@@ -2898,7 +2908,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_root *root; - int release = 0; int ret = 0; if (!node) @@ -2915,7 +2924,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = reserve_metadata_space(trans, rc, node); if (ret) goto out; - release = 1; } if (root) { @@ -2940,11 +2948,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) { - if (release) - release_metadata_space(rc, node); + if (ret || node->level == 0 || node->cowonly) remove_backref_node(&rc->backref_cache, node); - } return ret; } @@ -3867,29 +3872,20 @@ static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { struct btrfs_trans_handle *trans; - int ret; rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, BTRFS_BLOCK_RSV_TEMP); if (!rc->block_rsv) return -ENOMEM; - /* - * reserve some space for creating reloc trees. - * btrfs_init_reloc_root will use them when there - * is no reservation in transaction handle. - */ - ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - BTRFS_RESERVE_FLUSH_ALL); - if (ret) - return ret; - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; + rc->reserved_bytes = 0; + rc->block_rsv->size = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -3933,6 +3929,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + rc->reserved_bytes = 0; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + err = ret; + break; + } progress++; trans = btrfs_start_transaction(rc->extent_root, 0); if (IS_ERR(trans)) { @@ -4020,14 +4024,8 @@ restart: } } - if (rc->commit_transaction) { - rc->commit_transaction = 0; - ret = btrfs_commit_transaction(trans, rc->extent_root); - BUG_ON(ret); - } else { - btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root); - } + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root); trans = NULL; if (rc->stage == MOVE_DATA_EXTENTS && -- cgit v1.2.3-70-g09d2 From 131e404a2a54d30f894425ef723f9867a43bff4c Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 19 Nov 2013 22:29:35 +0000 Subject: Btrfs: fix very slow inode eviction and fs unmount The inode eviction can be very slow, because during eviction we tell the VFS to truncate all of the inode's pages. This results in calls to btrfs_invalidatepage() which in turn does calls to lock_extent_bits() and clear_extent_bit(). These calls result in too many merges and splits of extent_state structures, which consume a lot of time and cpu when the inode has many pages. In some scenarios I have experienced umount times higher than 15 minutes, even when there's no pending IO (after a btrfs fs sync). A quick way to reproduce this issue: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ cd /mnt/btrfs $ sysbench --test=fileio --file-num=128 --file-total-size=16G \ --file-test-mode=seqwr --num-threads=128 \ --file-block-size=16384 --max-time=60 --max-requests=0 run $ time btrfs fi sync . FSSync '.' real 0m25.457s user 0m0.000s sys 0m0.092s $ cd .. 
$ time umount /mnt/btrfs real 1m38.234s user 0m0.000s sys 1m25.760s The same test on ext4 runs much faster: $ mkfs.ext4 /dev/sdb3 $ mount /dev/sdb3 /mnt/ext4 $ cd /mnt/ext4 $ sysbench --test=fileio --file-num=128 --file-total-size=16G \ --file-test-mode=seqwr --num-threads=128 \ --file-block-size=16384 --max-time=60 --max-requests=0 run $ sync $ cd .. $ time umount /mnt/ext4 real 0m3.626s user 0m0.004s sys 0m3.012s After this patch, the unmount (inode evictions) is much faster: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ cd /mnt/btrfs $ sysbench --test=fileio --file-num=128 --file-total-size=16G \ --file-test-mode=seqwr --num-threads=128 \ --file-block-size=16384 --max-time=60 --max-requests=0 run $ time btrfs fi sync . FSSync '.' real 0m26.774s user 0m0.000s sys 0m0.084s $ cd .. $ time umount /mnt/btrfs real 0m1.811s user 0m0.000s sys 0m1.564s Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5a5de36d39f..e889779c9b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; } +/* + * While truncating the inode pages during eviction, we get the VFS calling + * btrfs_invalidatepage() against each page of the inode. This is slow because + * the calls to btrfs_invalidatepage() result in a huge amount of calls to + * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting + * extent_state structures over and over, wasting lots of time. + * + * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all + * those expensive operations on a per page basis and do only the ordered io + * finishing, while we release here the extent_map and extent_state structures, + * without the excessive merging and splitting. 
+ */ +static void evict_inode_truncate_pages(struct inode *inode) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; + struct rb_node *node; + + ASSERT(inode->i_state & I_FREEING); + truncate_inode_pages(&inode->i_data, 0); + + write_lock(&map_tree->lock); + while (!RB_EMPTY_ROOT(&map_tree->map)) { + struct extent_map *em; + + node = rb_first(&map_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + remove_extent_mapping(map_tree, em); + free_extent_map(em); + } + write_unlock(&map_tree->lock); + + spin_lock(&io_tree->lock); + while (!RB_EMPTY_ROOT(&io_tree->state)) { + struct extent_state *state; + struct extent_state *cached_state = NULL; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + atomic_inc(&state->refs); + spin_unlock(&io_tree->lock); + + lock_extent_bits(io_tree, state->start, state->end, + 0, &cached_state); + clear_extent_bit(io_tree, state->start, state->end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + free_extent_state(state); + + spin_lock(&io_tree->lock); + } + spin_unlock(&io_tree->lock); +} + void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; @@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); - truncate_inode_pages(&inode->i_data, 0); + evict_inode_truncate_pages(inode); + if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || @@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + int inode_evicting = inode->i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -7394,17 +7452,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + + if (!inode_evicting) + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* * IO on this page will never be started, so we need * to account for any ordered extents now */ - clear_extent_bit(tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); + if (!inode_evicting) + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, + GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -7428,14 +7490,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); - cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + if (!inode_evicting) { + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, + &cached_state); + } + } + + if (!inode_evicting) { + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + + 
__btrfs_releasepage(page, GFP_NOFS); } - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); - __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); if (PagePrivate(page)) { -- cgit v1.2.3-70-g09d2 From 1b8e5df6d9b676f6d31fb098ffdc7d18732729d7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 20 Nov 2013 16:50:23 -0500 Subject: btrfs: fix static checker warnings This patch fixes the following warnings: fs/btrfs/extent-tree.c:6201:12: sparse: symbol 'get_raid_name' was not declared. Should it be static? fs/btrfs/extent-tree.c:8430:9: error: format not a string literal and no format arguments [-Werror=format-security] get_raid_name(index)); Signed-off-by: Jeff Mahoney Reviewed-by: Kees Cook Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fe651f4d1b9..f08f6dda949 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6195,7 +6195,7 @@ static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID6] = "raid6", }; -const char *get_raid_name(enum btrfs_raid_types type) +static const char *get_raid_name(enum btrfs_raid_types type) { if (type >= BTRFS_NR_RAID_TYPES) return NULL; @@ -8423,7 +8423,7 @@ static void __link_block_group(struct btrfs_space_info *space_info, kobject_get(&space_info->kobj); /* put in release */ ret = kobject_init_and_add(kobj, &btrfs_raid_ktype, - &space_info->kobj, + &space_info->kobj, "%s", get_raid_name(index)); if (ret) { pr_warn("btrfs: failed to add kobject for block cache. ignoring.\n"); -- cgit v1.2.3-70-g09d2 From e453d989e0bb33defaaa5be4e9f577cea946e2a6 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 21 Nov 2013 10:37:16 -0500 Subject: btrfs: fix leaks during sysfs teardown Filipe noticed that we were leaking the features attribute group after umount. His fix of just calling sysfs_remove_group() wasn't enough since that removes just the supported features and not the unsupported features. This patch changes the unknown feature handling to add them individually so we can skip the kmalloc and uses the same iteration to tear them down later. We also fix the error handling during mount so that we catch the failing creation of the per-super kobject, and handle proper teardown of a half-setup sysfs context. Tested properly with kmemleak enabled this time. 
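To make the add/remove symmetry concrete, here is a small userspace sketch of the same shape: one helper walks the unknown feature bits and either registers or unregisters each one, driven by a bool, so setup and teardown can never drift apart. The mask value and the register/unregister functions are made up for illustration and stand in for the sysfs merge/unmerge calls; this is a sketch, not the btrfs code.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define SUPPORTED_MASK 0x0fULL          /* made up: bits 0-3 are "known" features */

static void register_attr(int bit)   { printf("add feature bit %d\n", bit); }
static void unregister_attr(int bit) { printf("del feature bit %d\n", bit); }

/* One walk over the unknown bits serves both setup and teardown. */
static void addrm_unknown_attrs(uint64_t features, bool add)
{
        uint64_t unknown = features & ~SUPPORTED_MASK;
        int i;

        for (i = 0; i < 64; i++) {
                if (!(unknown & (1ULL << i)))
                        continue;
                if (add)
                        register_attr(i);
                else
                        unregister_attr(i);
        }
}

int main(void)
{
        uint64_t features = 0x13;       /* bit 4 is unknown to this "kernel" */

        addrm_unknown_attrs(features, true);    /* mount path */
        addrm_unknown_attrs(features, false);   /* unmount path, same walk */
        return 0;
}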
Reported-by: Filipe David Borba Manana Signed-off-by: Jeff Mahoney Tested-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 133 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 73 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b535285642d..f25deb9c132 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -419,28 +419,84 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) return container_of(kobj, struct btrfs_fs_info, super_kobj); } -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +#define NUM_FEATURE_BITS 64 +static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; +static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; + +static u64 supported_feature_masks[3] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, + [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, +}; + +static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) +{ + int set; + + for (set = 0; set < FEAT_MAX; set++) { + int i; + struct attribute *attrs[2]; + struct attribute_group agroup = { + .name = "features", + .attrs = attrs, + }; + u64 features = get_features(fs_info, set); + features &= ~supported_feature_masks[set]; + + if (!features) + continue; + + attrs[1] = NULL; + for (i = 0; i < NUM_FEATURE_BITS; i++) { + struct btrfs_feature_attr *fa; + + if (!(features & (1ULL << i))) + continue; + + fa = &btrfs_feature_attrs[set][i]; + attrs[0] = &fa->kobj_attr.attr; + if (add) { + int ret; + ret = sysfs_merge_group(&fs_info->super_kobj, + &agroup); + if (ret) + return ret; + } else + sysfs_unmerge_group(&fs_info->super_kobj, + &agroup); + } + + } + return 0; +} + +static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { - sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); - kobject_del(fs_info->device_dir_kobj); - kobject_put(fs_info->device_dir_kobj); - kobject_del(fs_info->space_info_kobj); - kobject_put(fs_info->space_info_kobj); kobject_del(&fs_info->super_kobj); kobject_put(&fs_info->super_kobj); wait_for_completion(&fs_info->kobj_unregister); } +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + if (fs_info->space_info_kobj) { + sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->space_info_kobj); + kobject_put(fs_info->space_info_kobj); + } + kobject_del(fs_info->device_dir_kobj); + kobject_put(fs_info->device_dir_kobj); + addrm_unknown_feature_attrs(fs_info, false); + sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); + __btrfs_sysfs_remove_one(fs_info); +} + const char * const btrfs_feature_set_names[3] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", }; -#define NUM_FEATURE_BITS 64 -static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; -static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; - char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) { size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ @@ -510,53 +566,6 @@ static void init_feature_attrs(void) } } -static u64 supported_feature_masks[3] = { - [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, - [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, - [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, -}; - -static int add_unknown_feature_attrs(struct btrfs_fs_info *fs_info) -{ - int set; - - for (set = 0; set < FEAT_MAX; 
set++) { - int i, count, ret, index = 0; - struct attribute **attrs; - struct attribute_group agroup = { - .name = "features", - }; - u64 features = get_features(fs_info, set); - features &= ~supported_feature_masks[set]; - - count = hweight64(features); - - if (!count) - continue; - - attrs = kcalloc(count + 1, sizeof(void *), GFP_KERNEL); - - for (i = 0; i < NUM_FEATURE_BITS; i++) { - struct btrfs_feature_attr *fa; - - if (!(features & (1ULL << i))) - continue; - - fa = &btrfs_feature_attrs[set][i]; - attrs[index++] = &fa->kobj_attr.attr; - } - - attrs[index] = NULL; - agroup.attrs = attrs; - - ret = sysfs_merge_group(&fs_info->super_kobj, &agroup); - kfree(attrs); - if (ret) - return ret; - } - return 0; -} - static int add_device_membership(struct btrfs_fs_info *fs_info) { int error = 0; @@ -592,13 +601,17 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) fs_info->super_kobj.kset = btrfs_kset; error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, "%pU", fs_info->fsid); + if (error) + return error; error = sysfs_create_group(&fs_info->super_kobj, &btrfs_feature_attr_group); - if (error) - goto failure; + if (error) { + __btrfs_sysfs_remove_one(fs_info); + return error; + } - error = add_unknown_feature_attrs(fs_info); + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; -- cgit v1.2.3-70-g09d2 From 1b8e7e45e57cc98bf6c8ab9b3087d634636ba2e6 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Fri, 22 Nov 2013 18:54:58 +0000 Subject: Btrfs: avoid unnecessary ordered extent cache resets After an ordered extent completes, don't blindly reset the inode's ordered tree last accessed ordered extent pointer. While running the xfstests I noticed that about 29% of the time the ordered extent to which tree->last pointed was not the same as our just completed ordered extent. After that I ran the following sysbench test (after a prepare phase) and noticed that about 68% of the time tree->last pointed to a different ordered extent too. sysbench --test=fileio --file-num=32 --file-total-size=4G \ --file-test-mode=rndwr --num-threads=512 \ --file-block-size=32768 --max-time=60 --max-requests=0 run Therefore reset tree->last on ordered extent removal only if it pointed to the ordered extent we're removing from the tree. Results from 4 runs of the following test before and after applying this patch: $ sysbench --test=fileio --file-num=32 --file-total-size=4G \ --file-test-mode=seqwr --num-threads=512 \ --file-block-size=32768 --max-time=60 --file-io-mode=sync prepare $ sysbench --test=fileio --file-num=32 --file-total-size=4G \ --file-test-mode=seqwr --num-threads=512 \ --file-block-size=32768 --max-time=60 --file-io-mode=sync run Before this path: run 1 - 64.049Mb/sec run 2 - 63.455Mb/sec run 3 - 64.656Mb/sec run 4 - 63.833Mb/sec After this patch: run 1 - 66.149Mb/sec run 2 - 68.459Mb/sec run 3 - 66.338Mb/sec run 4 - 66.176Mb/sec With random writes (--file-test-mode=rndwr) I had huge fluctuations on the results (+- 35% easily). 
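The fix boils down to a common caching rule: only invalidate a cached "last accessed" pointer when it actually refers to the element being removed. A minimal userspace sketch of that rule, with made-up structures standing in for the ordered extent tree:

#include <assert.h>
#include <stddef.h>

struct item {
        struct item *next;
        int key;
};

struct tree {
        struct item *head;
        struct item *last;      /* most recently accessed item, may be NULL */
};

static void remove_item(struct tree *t, struct item *victim)
{
        struct item **pp = &t->head;

        while (*pp && *pp != victim)
                pp = &(*pp)->next;
        if (*pp)
                *pp = victim->next;

        /* Keep the cache warm unless it is the item going away. */
        if (t->last == victim)
                t->last = NULL;
}

int main(void)
{
        struct item b = { NULL, 2 };
        struct item a = { &b, 1 };
        struct tree t = { &a, &b };     /* "b" was the last item touched */

        remove_item(&t, &a);
        assert(t.last == &b);           /* cache survives unrelated removals */
        remove_item(&t, &b);
        assert(t.last == NULL);         /* cleared only when it pointed at the victim */
        return 0;
}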
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 69582d5b69d..b8c2ded75fe 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -520,7 +520,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); - tree->last = NULL; + if (tree->last == node) + tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); spin_unlock_irq(&tree->lock); -- cgit v1.2.3-70-g09d2 From 5a4267ca20d4c452a97dace4612f1dfc04147fbd Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Mon, 25 Nov 2013 03:20:46 +0000 Subject: Btrfs: try harder to avoid btree node splits When attempting to move items from our target leaf to its neighbor leaves (right and left), we only need to free data_size - free_space bytes from our leaf in order to add the new item (which has size of data_size bytes). Therefore attempt to move items to the right and left leaves if they have at least data_size - free_space bytes free, instead of data_size bytes free. After 5 runs of the following test, I got a smaller number of btree node splits overall: sysbench --test=fileio --file-num=512 --file-total-size=5G \ --file-test-mode=seqwr --num-threads=512 \ --file-block-size=8192 --max-requests=100000 --file-io-mode=sync Before this change: * 6171 splits (average of 5 test runs) * 61.508Mb/sec of throughput (average of 5 test runs) After this change: * 6036 splits (average of 5 test runs) * 63.533Mb/sec of throughput (average of 5 test runs) An ideal test would not just have multiple threads/processes writing to a file (insertion of file extent items) but also do other operations that result in insertion of items with varied sizes, like file/directory creations, creation of links, symlinks, xattrs, etc. 
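The arithmetic is easy to check with made-up numbers; the sketch below only illustrates the data_size - free_space accounting, not the real leaf-pushing code:

#include <stdio.h>

int main(void)
{
        unsigned int data_size = 300;   /* size of the new item, made up */
        unsigned int free_space = 120;  /* free bytes left in our leaf, made up */
        unsigned int space_needed = data_size;

        if (free_space < data_size)
                space_needed -= free_space;     /* only 180 bytes must move to a sibling */
        else
                space_needed = 0;               /* the item already fits */

        printf("must free up %u bytes in the leaf\n", space_needed);
        return 0;
}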
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 22629809c9c..11f9a1848c7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3929,14 +3929,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, int progress = 0; int slot; u32 nritems; + int space_needed = data_size; slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(root, path->nodes[0]); /* * try to push all the items after our slot into the * right leaf */ - ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3956,7 +3959,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; - ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -4000,13 +4003,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, /* first try to make some room by pushing left and right */ if (data_size && path->nodes[1]) { - wret = push_leaf_right(trans, root, path, data_size, - data_size, 0, 0); + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(root, l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, - data_size, 0, (u32)-1); + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); if (wret < 0) return wret; } -- cgit v1.2.3-70-g09d2 From 68ba990f7d161c31e9eddd98727ba8393089047f Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Mon, 25 Nov 2013 03:22:07 +0000 Subject: Btrfs: fix extent boundary check in bio_readpage_error Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d97250a4e8e..3721820687d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2156,7 +2156,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, return -EIO; } - if (em->start > start || em->start + em->len < start) { + if (em->start > start || em->start + em->len <= start) { free_extent_map(em); em = NULL; } -- cgit v1.2.3-70-g09d2 From 32193c147f451652c6c089b5fa1c9852d53d65ee Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Mon, 25 Nov 2013 03:23:51 +0000 Subject: Btrfs: faster and more efficient extent map insertion Before this change, adding an extent map to the extent map tree of an inode required 2 tree nevigations: 1) doing a tree navigation to search for an existing extent map starting at the same offset or an extent map that overlaps the extent map we want to insert; 2) Another tree navigation to add the extent map to the tree (if the former tree search didn't found anything). This change just merges these 2 steps into a single one. 
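The single-pass idea can be sketched with a sorted linked list standing in for the rb-tree: the insertion walk itself notices an overlapping range and fails with EEXIST, so no separate lookup pass is needed. The names and the list representation are simplifications for illustration, not the btrfs extent map code:

#include <errno.h>
#include <stdio.h>

struct range {
        unsigned long start, len;
        struct range *next;
};

/* same wrap handling as the range_end() helper in the patch */
static unsigned long range_end(unsigned long start, unsigned long len)
{
        return start + len < start ? (unsigned long)-1 : start + len;
}

/* One walk: find the insertion point and reject overlaps along the way. */
static int insert_range(struct range **pp, struct range *nr)
{
        unsigned long end = range_end(nr->start, nr->len);

        while (*pp) {
                struct range *cur = *pp;

                if (end <= cur->start)
                        break;                          /* new range fits before cur */
                if (nr->start < range_end(cur->start, cur->len))
                        return -EEXIST;                 /* overlap found during the walk */
                pp = &cur->next;
        }
        nr->next = *pp;
        *pp = nr;
        return 0;
}

int main(void)
{
        struct range a = { 0, 10, NULL }, b = { 20, 5, NULL }, c = { 8, 4, NULL };
        struct range *head = NULL;

        printf("%d\n", insert_range(&head, &a));        /* 0 */
        printf("%d\n", insert_range(&head, &b));        /* 0 */
        printf("%d\n", insert_range(&head, &c));        /* fails, [8,12) overlaps [0,10) */
        return 0;
}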
While running first few btrfs xfstests I had noticed these trees easily had a few hundred elements, and then with the following sysbench test it reached over 1100 elements very often. Test: sysbench --test=fileio --file-num=32 --file-total-size=10G \ --file-test-mode=seqwr --num-threads=512 --file-block-size=8192 \ --max-requests=1000000 --file-io-mode=sync [prepare|run] (fs created with mkfs.btrfs -l 4096 -f /dev/sdb3 before each sysbench prepare phase) Before this patch: run 1 - 41.894Mb/sec run 2 - 40.527Mb/sec run 3 - 40.922Mb/sec run 4 - 49.433Mb/sec run 5 - 40.959Mb/sec average - 42.75Mb/sec After this patch: run 1 - 48.036Mb/sec run 2 - 50.21Mb/sec run 3 - 50.929Mb/sec run 4 - 46.881Mb/sec run 5 - 53.192Mb/sec average - 49.85Mb/sec Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 72 +++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a4a7a1a8da9..b60955d4f9b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -79,12 +79,21 @@ void free_extent_map(struct extent_map *em) } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +static int tree_insert(struct rb_root *root, struct extent_map *em) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - struct extent_map *entry; + struct extent_map *entry = NULL; + struct rb_node *orig_parent = NULL; + u64 end = range_end(em->start, em->len); while (*p) { parent = *p; @@ -92,19 +101,37 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, WARN_ON(!entry->in_tree); - if (offset < entry->start) + if (em->start < entry->start) p = &(*p)->rb_left; - else if (offset >= extent_map_end(entry)) + else if (em->start >= extent_map_end(entry)) p = &(*p)->rb_right; else - return parent; + return -EEXIST; } - entry = rb_entry(node, struct extent_map, rb_node); - entry->in_tree = 1; - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; + orig_parent = parent; + while (parent && em->start >= extent_map_end(entry)) { + parent = rb_next(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + parent = orig_parent; + entry = rb_entry(parent, struct extent_map, rb_node); + while (parent && em->start < entry->start) { + parent = rb_prev(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + em->in_tree = 1; + rb_link_node(&em->rb_node, orig_parent, p); + rb_insert_color(&em->rb_node, root); + return 0; } /* @@ -310,20 +337,11 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { int ret = 0; - struct rb_node *rb; - struct extent_map *exist; - exist = lookup_extent_mapping(tree, em->start, em->len); - if (exist) { - free_extent_map(exist); - ret = -EEXIST; - goto out; - } - rb = tree_insert(&tree->map, em->start, &em->rb_node); - if (rb) { - ret = -EEXIST; + ret = tree_insert(&tree->map, em); + if (ret) goto out; - } + atomic_inc(&em->refs); em->mod_start = em->start; @@ -337,14 +355,6 @@ out: 
return ret; } -/* simple helper to do math around the end of an extent, handling wrap */ -static u64 range_end(u64 start, u64 len) -{ - if (start + len < start) - return (u64)-1; - return start + len; -} - static struct extent_map * __lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len, int strict) -- cgit v1.2.3-70-g09d2 From c42ac0bc9530d51029b938e09b60b5ee86e5ee70 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 26 Nov 2013 15:01:34 +0000 Subject: Btrfs: add missing extent state caching calls When we didn't find a matching extent state, we inserted a new one but didn't cache it in the **cached_state parameter, which makes a subsequent call do a tree lookup to get it. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3721820687d..01a14124586 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -817,6 +817,7 @@ again: if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; goto out; } @@ -1040,9 +1041,10 @@ again: goto out; } err = insert_state(tree, prealloc, start, end, &bits); - prealloc = NULL; if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; goto out; } state = rb_entry(node, struct extent_state, rb_node); -- cgit v1.2.3-70-g09d2 From 12cfbad90e02793b7a71b7591ebd5c3f9228dc5d Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 26 Nov 2013 15:41:47 +0000 Subject: Btrfs: more efficient extent state insertions Currently we do 2 traversals of an inode's extent_io_tree before inserting an extent state structure: 1 to see if a matching extent state already exists and 1 to do the insertion if the fist traversal didn't found such extent state. This change just combines those tree traversals into a single one. While running sysbench tests (random writes) I captured the number of elements in extent_io_tree trees for a while (into a procfs file backed by a seq_list from seq_file module) and got this histogram: Count: 9310 Range: 51.000 - 21386.000; Mean: 11785.243; Median: 18743.500; Stddev: 8923.688 Percentiles: 90th: 20985.000; 95th: 21155.000; 99th: 21369.000 51.000 - 93.933: 693 ######## 93.933 - 172.314: 938 ########## 172.314 - 315.408: 856 ######### 315.408 - 576.646: 95 # 576.646 - 6415.830: 888 ########## 6415.830 - 11713.809: 1024 ########### 11713.809 - 21386.000: 4816 ##################################################### So traversing such trees can take some significant time that can easily be avoided. 
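The same trick can be sketched on a plain binary search tree: a failed lookup hands back the link pointer where the search ended, so the following insert links the new node without walking the tree again (the real change records the rb-tree link and parent as p_ret/parent_ret). This is only an illustration, not the extent_io code:

#include <stdio.h>
#include <stdlib.h>

struct node {
        int key;
        struct node *left, *right;
};

/* Returns the matching node, or NULL after recording where an insert must go. */
static struct node *search_for_insert(struct node **root, int key,
                                      struct node ***link_ret)
{
        struct node **link = root;

        while (*link) {
                if (key < (*link)->key)
                        link = &(*link)->left;
                else if (key > (*link)->key)
                        link = &(*link)->right;
                else
                        return *link;           /* already present */
        }
        *link_ret = link;                       /* remembered insertion point */
        return NULL;
}

static void insert_at(struct node **link, int key)
{
        struct node *n = calloc(1, sizeof(*n));

        if (!n)
                abort();
        n->key = key;
        *link = n;                              /* no second traversal needed */
}

int main(void)
{
        struct node *root = NULL, **link;

        if (!search_for_insert(&root, 42, &link))
                insert_at(link, 42);
        if (!search_for_insert(&root, 7, &link))
                insert_at(link, 7);
        printf("root=%d left=%d\n", root->key, root->left->key);        /* 42 and 7 */
        return 0;
}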
Ran the following sysbench tests, 5 times each, for sequential and random writes, and got the following results: sysbench --test=fileio --file-num=1 --file-total-size=2G \ --file-test-mode=seqwr --num-threads=16 --file-block-size=65536 \ --max-requests=0 --max-time=60 --file-io-mode=sync sysbench --test=fileio --file-num=1 --file-total-size=2G \ --file-test-mode=rndwr --num-threads=16 --file-block-size=65536 \ --max-requests=0 --max-time=60 --file-io-mode=sync Before this change: sequential writes: 69.28Mb/sec (average of 5 runs) random writes: 4.14Mb/sec (average of 5 runs) After this change: sequential writes: 69.91Mb/sec (average of 5 runs) random writes: 5.69Mb/sec (average of 5 runs) Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 76 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 01a14124586..5fc9481515f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -224,12 +224,20 @@ void free_extent_state(struct extent_state *state) } static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) + struct rb_node *node, + struct rb_node ***p_in, + struct rb_node **parent_in) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct tree_entry *entry; + if (p_in && parent_in) { + p = *p_in; + parent = *parent_in; + goto do_insert; + } + while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -242,35 +250,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } +do_insert: rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; } static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_ret, + struct rb_node **next_ret, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node *n = root->rb_node; + struct rb_node **n = &root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; struct tree_entry *entry; struct tree_entry *prev_entry = NULL; - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - prev = n; + while (*n) { + prev = *n; + entry = rb_entry(prev, struct tree_entry, rb_node); prev_entry = entry; if (offset < entry->start) - n = n->rb_left; + n = &(*n)->rb_left; else if (offset > entry->end) - n = n->rb_right; + n = &(*n)->rb_right; else - return n; + return *n; } + if (p_ret) + *p_ret = n; + if (parent_ret) + *parent_ret = prev; + if (prev_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { @@ -292,18 +308,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, return NULL; } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +static inline struct rb_node * +tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_node *prev = NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL); + ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); if (!ret) return prev; return ret; } +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + static void merge_cb(struct extent_io_tree 
*tree, struct extent_state *new, struct extent_state *other) { @@ -385,6 +410,8 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, + struct rb_node ***p, + struct rb_node **parent, unsigned long *bits) { struct rb_node *node; @@ -397,7 +424,7 @@ static int insert_state(struct extent_io_tree *tree, set_state_bits(tree, state, bits); - node = tree_insert(&tree->state, end, &state->rb_node); + node = tree_insert(&tree->state, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); @@ -444,7 +471,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, + NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; @@ -783,6 +811,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; @@ -809,11 +839,12 @@ again: * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, &bits); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); if (err) extent_io_tree_panic(tree, err); @@ -920,7 +951,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - &bits); + NULL, NULL, &bits); if (err) extent_io_tree_panic(tree, err); @@ -1006,6 +1037,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; @@ -1033,14 +1066,15 @@ again: * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, &bits); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1137,7 +1171,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - &bits); + NULL, NULL, &bits); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); -- cgit v1.2.3-70-g09d2 From 878f2d2cb355da2dabbffb2ae51b7541a91ce4e3 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Wed, 27 Nov 2013 16:06:16 +0000 Subject: Btrfs: fix max dir item size calculation We were accounting for sizeof(struct btrfs_item) twice, once in the data_size variable and another time in the if statement below. 
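With made-up sizes the double count is easy to see: the per-item header must be added exactly once, either in data_size or in the later size check, never in both. A tiny sketch of the arithmetic only:

#include <stdio.h>

int main(void)
{
        unsigned int item_hdr = 25;     /* stands in for sizeof(struct btrfs_item) */
        unsigned int di_size = 30;      /* stands in for sizeof(*di) */
        unsigned int name_len = 10;

        unsigned int buggy = di_size + name_len + item_hdr;    /* header counted here... */
        unsigned int fixed = di_size + name_len;

        /* ...and the size check below adds the header again */
        printf("buggy check uses %u, fixed check uses %u\n",
               buggy + item_hdr, fixed + item_hdr);
        return 0;
}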
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/dir-item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c031ea3fd70..9a89ceb25e8 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -261,7 +261,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, * see if there is room in the item to insert this * name */ - data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); + data_size = sizeof(*di) + name_len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size_nr(leaf, slot) + -- cgit v1.2.3-70-g09d2 From 11850392ee1c600c7d40d93119daa72715bc959c Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Fri, 29 Nov 2013 18:01:15 +0100 Subject: btrfs: remove dead code [commit 8185554d: fix incorrect inode acl reset] introduced a dead code by adding a condition which can never be true to an else branch. The condition can never be true because it is already checked by a previous if statement which causes function to return. Signed-off-by: Michal Nazarewicz Reviewed-By: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0890c83643e..460f36bae67 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -225,13 +225,8 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (ret < 0) return ret; - - if (ret > 0) { - /* we need an acl */ + if (ret > 0) /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); - } else if (ret < 0) { - cache_no_acl(inode); - } } else { cache_no_acl(inode); } -- cgit v1.2.3-70-g09d2 From d527afe1e5865d25a336f6c4157f1086cfbddc81 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sat, 30 Nov 2013 11:28:35 +0000 Subject: Btrfs: fix extent_map block_len after merging When merging an extent_map with its right neighbor, increment its block_len with the neighbor's block_len. 
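With made-up extent sizes the difference is clear: the logical length grows by the neighbor's logical length and the on-disk length by the neighbor's on-disk length, which matters whenever the two differ (compressed extents, for example). A tiny sketch, not the btrfs structures:

#include <stdio.h>

struct extent {
        unsigned long len;              /* logical bytes */
        unsigned long block_len;        /* bytes on disk, may be smaller */
};

int main(void)
{
        struct extent em    = { .len = 8192, .block_len = 4096 };
        struct extent merge = { .len = 8192, .block_len = 2048 };

        em.len       += merge.len;              /* 16384 */
        em.block_len += merge.block_len;        /* 6144; the old code would give 12288 */

        printf("len=%lu block_len=%lu\n", em.len, em.block_len);
        return 0;
}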
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b60955d4f9b..996ad56b57d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -255,7 +255,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(em, merge)) { em->len += merge->len; - em->block_len += merge->len; + em->block_len += merge->block_len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; -- cgit v1.2.3-70-g09d2 From 5a0f4e2c2b47a755e37dbbb6f691e6504e3147b3 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 3 Dec 2013 15:55:48 +0000 Subject: Btrfs: fix pass of transid with wrong endianness in send.c fs/btrfs/send.c:2190:9: warning: incorrect type in argument 3 (different base types) fs/btrfs/send.c:2190:9: expected unsigned long long [unsigned] [usertype] value fs/btrfs/send.c:2190:9: got restricted __le64 [usertype] ctransid fs/btrfs/send.c:2195:17: warning: incorrect type in argument 3 (different base types) fs/btrfs/send.c:2195:17: expected unsigned long long [unsigned] [usertype] value fs/btrfs/send.c:2195:17: got restricted __le64 [usertype] ctransid fs/btrfs/send.c:3716:9: warning: incorrect type in argument 3 (different base types) fs/btrfs/send.c:3716:9: expected unsigned long long [unsigned] [usertype] value fs/btrfs/send.c:3716:9: got restricted __le64 [usertype] ctransid Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 29803b4129f..1896e394d59 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2188,12 +2188,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, sctx->send_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, - sctx->send_root->root_item.ctransid); + le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, sctx->parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - sctx->parent_root->root_item.ctransid); + le64_to_cpu(sctx->parent_root->root_item.ctransid)); } ret = send_cmd(sctx); @@ -3714,7 +3714,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - clone_root->root->root_item.ctransid); + le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); -- cgit v1.2.3-70-g09d2 From 3cb0929ad24c95c5fd8f08eb41a702a65954b4c6 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 4 Dec 2013 21:15:19 +0800 Subject: Btrfs: fix wrong super generation mismatch when scrubbing supers We hit a race condition when scrubbing superblocks. The story is: in a committing transaction we update @last_trans_committed after writing the superblocks, so if a scrubber starts after the superblocks are written but before @last_trans_committed is updated, a generation mismatch happens!
We fix this by checking @scrub_pause_req, and we won't start a srubber until commiting transaction is finished.(after btrfs_scrub_continue() finished.) Reported-by: Sebastian Ochmann Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e5481ae1275..6acb573e7d6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -256,6 +256,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); static void copy_nocow_pages_worker(struct btrfs_work *work); +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -269,6 +270,16 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) wake_up(&sctx->list_wait); } +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -2310,14 +2321,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, btrfs_reada_wait(reada2); mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } + scrub_blocked_if_needed(fs_info); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); /* @@ -2357,15 +2364,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, atomic_set(&sctx->wr_ctx.flush_all_writes, 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); + mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } + scrub_blocked_if_needed(fs_info); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); } @@ -2687,14 +2691,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, atomic_read(&sctx->workers_pending) == 0); mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } + scrub_blocked_if_needed(fs_info); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); btrfs_put_block_group(cache); @@ -2906,7 +2906,13 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } sctx->readonly = readonly; dev->scrub_device = sctx; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. 
+ */ + scrub_blocked_if_needed(fs_info); atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); @@ -2915,9 +2921,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * by holding device list mutex, we can * kick off writing super in log tree sync. */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end, -- cgit v1.2.3-70-g09d2 From cb7ab02156e4ba999df90e9fa8e96107683586fd Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 4 Dec 2013 21:16:53 +0800 Subject: Btrfs: wrap repeated code into scrub_blocked_if_needed() Just wrap same code into one function scrub_blocked_if_needed(). This make a change that we will move waiting (@workers_pending = 0) before we can wake up commiting transaction(atomic_inc(@scrub_paused)), we must take carefully to not deadlock here. Thread 1 Thread 2 |->btrfs_commit_transaction() |->set trans type(COMMIT_DOING) |->btrfs_scrub_paused()(blocked) |->join_transaction(blocked) Move btrfs_scrub_paused() before setting trans type which means we can still join a transaction when commiting_transaction is blocked. Signed-off-by: Wang Shilong Suggested-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 43 +++++++++++++++++-------------------------- fs/btrfs/transaction.c | 3 ++- 2 files changed, 19 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6acb573e7d6..adebe12e497 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -256,6 +256,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); static void copy_nocow_pages_worker(struct btrfs_work *work); +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); @@ -270,7 +271,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) wake_up(&sctx->list_wait); } -static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) { while (atomic_read(&fs_info->scrub_pause_req)) { mutex_unlock(&fs_info->scrub_lock); @@ -280,6 +281,19 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) } } +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + wake_up(&fs_info->scrub_pause_wait); +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -2295,8 +2309,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ key_start.objectid = logical; @@ -2320,12 +2333,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!IS_ERR(reada2)) btrfs_reada_wait(reada2); - mutex_lock(&fs_info->scrub_lock); - 
scrub_blocked_if_needed(fs_info); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - - wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during @@ -2362,15 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); - - mutex_lock(&fs_info->scrub_lock); scrub_blocked_if_needed(fs_info); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - - wake_up(&fs_info->scrub_pause_wait); } key.objectid = logical; @@ -2685,17 +2684,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - - mutex_lock(&fs_info->scrub_lock); scrub_blocked_if_needed(fs_info); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - - wake_up(&fs_info->scrub_pause_wait); btrfs_put_block_group(cache); if (ret) @@ -2912,7 +2903,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * checking @scrub_pause_req here, we can avoid * race between committing transaction and scrubbing. */ - scrub_blocked_if_needed(fs_info); + __scrub_blocked_if_needed(fs_info); atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 14516375777..026f1fea963 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1748,6 +1748,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; btrfs_wait_delalloc_flush(root->fs_info); + + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting @@ -1812,7 +1814,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, WARN_ON(cur_trans != trans->transaction); - btrfs_scrub_pause(root); /* btrfs_commit_tree_roots is responsible for getting the * various roots consistent with each other. Every pointer * in the tree of tree roots has to point to the most up to date -- cgit v1.2.3-70-g09d2 From 2ef1fed285d58e77ce777d9a525fed4788b5a6d0 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Wed, 4 Dec 2013 22:17:39 +0000 Subject: Btrfs: more efficient push_leaf_right Currently when finding the leaf to insert a key into a btree, if the leaf doesn't have enough space to store the item we attempt to move off some items from our leaf to its right neighbor leaf, and if this fails to create enough free space in our leaf, we try to move off more items to the left neighbor leaf as well. When trying to move off items to the right neighbor leaf, if it has enough room to store the new key but not not enough room to move off at least one item from our target leaf, __push_leaf_right returns 1 and we have to attempt to move items to the left neighbor (push_leaf_left function) without touching the right neighbor leaf. For the case where the right leaf has enough room to store at least 1 item from our leaf, we end up modifying (and dirtying) both our leaf and the right leaf. 
This is non-optimal for the case where the new key is greater than any key in our target leaf because it can be inserted at slot 0 of the right neighbor leaf and we don't need to touch our leaf at all nor to attempt to move off items to the left neighbor leaf. Therefore this change just selects the right neighbor leaf as our new target leaf if it has enough room for the new key without modifying our initial target leaf - we do this only if the new key is higher than any key in the initial target leaf. While running the following test, push_leaf_right was called by split_leaf 4802 times. Out of those 4802 calls, for 2571 calls (53.5%) we hit this special case (right leaf has enough room and new key is higher than any key in the initial target leaf). Test: sysbench --test=fileio --file-num=512 --file-total-size=5G \ --file-test-mode=[seqwr|rndwr] --num-threads=512 --file-block-size=8192 \ --max-requests=100000 --file-io-mode=sync [prepare|run] Results: sequential writes Throughput before this change: 65.71Mb/sec (average of 10 runs) Throughput after this change: 66.58Mb/sec (average of 10 runs) random writes Throughput before this change: 10.75Mb/sec (average of 10 runs) Throughput after this change: 11.56Mb/sec (average of 10 runs) Signed-off-by: Filipe David Borba Manana Reviewed-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 11f9a1848c7..a57507aa670 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3613,6 +3613,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaft. */ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + return __push_leaf_right(trans, root, path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: -- cgit v1.2.3-70-g09d2 From a5dee37d390f3713f06e286a33b262f0fdb2b0ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Dec 2013 10:02:44 -0500 Subject: Btrfs: deal with io_tree->mapping being NULL I need to add infrastructure to allocate dummy extent buffers for running sanity tests, and to do this I need to not have to worry about having an address_mapping for an io_tree, so just fix up the places where we assume that all io_tree's have a non-NULL ->mapping. 
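The guard is the usual "optional back-pointer" pattern: debug-only helpers bail out early when the pointer is absent instead of assuming every tree has one. A minimal userspace sketch with stand-in structures, not the btrfs types:

#include <stdio.h>
#include <stddef.h>

struct mapping { long size; };

struct io_tree {
        struct mapping *mapping;        /* may be NULL for test-only trees */
};

static void debug_check_range(struct io_tree *tree, long start, long end)
{
        if (!tree->mapping)
                return;                 /* dummy tree: nothing to validate against */
        if (end >= tree->mapping->size)
                fprintf(stderr, "odd range [%ld,%ld]\n", start, end);
}

int main(void)
{
        struct mapping m = { .size = 4096 };
        struct io_tree real  = { .mapping = &m };
        struct io_tree dummy = { .mapping = NULL };

        debug_check_range(&real, 0, 8191);      /* warns */
        debug_check_range(&dummy, 0, 8191);     /* silently ignored */
        return 0;
}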
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5fc9481515f..1d8244d39b6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -77,13 +77,19 @@ void btrfs_leak_debug_check(void) } } -#define btrfs_debug_check_extent_io_range(inode, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) static inline void __btrfs_debug_check_extent_io_range(const char *caller, - struct inode *inode, u64 start, u64 end) + struct extent_io_tree *tree, u64 start, u64 end) { - u64 isize = i_size_read(inode); + struct inode *inode; + u64 isize; + if (!tree->mapping) + return; + + inode = tree->mapping->host; + isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { printk_ratelimited(KERN_DEBUG "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", @@ -124,6 +130,8 @@ static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { + if (!tree->mapping) + return NULL; return btrfs_sb(tree->mapping->host->i_sb); } @@ -570,7 +578,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err; int clear = 0; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; @@ -730,7 +738,7 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct rb_node *node; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); spin_lock(&tree->lock); again: @@ -817,7 +825,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); bits |= EXTENT_FIRST_DELALLOC; again: @@ -1043,7 +1051,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); again: if (!prealloc && (mask & __GFP_WAIT)) { -- cgit v1.2.3-70-g09d2 From 34b41acec1ccc06373ec584de19618d48ceb09fc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Dec 2013 10:41:51 -0500 Subject: Btrfs: use a bit to track if we're in the radix tree For creating a dummy in-memory btree I need to be able to use the radix tree to keep track of the buffers like normal extent buffers. With dummy buffers we skip the radix tree step, and we still want to do that for the tree mod log dummy buffers but for my test buffers we need to be able to remove them from the radix tree like normal. This will give me a way to do that. 
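The release path then becomes a test-and-clear on that bit: only buffers that were really linked into the shared index get unlinked on their final unref. A simplified sketch, with plain booleans standing in for the atomic bitops and the radix tree:

#include <stdio.h>
#include <stdbool.h>

struct buffer {
        int refs;
        bool in_tree;           /* set when inserted into the shared index */
};

static void unlink_from_index(struct buffer *b)
{
        (void)b;
        printf("unlinked from index\n");
}

static void release_buffer(struct buffer *b)
{
        if (--b->refs)
                return;
        if (b->in_tree) {       /* test_and_clear_bit() in the real code */
                b->in_tree = false;
                unlink_from_index(b);
        }
        printf("freed\n");
}

int main(void)
{
        struct buffer tracked = { .refs = 1, .in_tree = true };
        struct buffer dummy   = { .refs = 1, .in_tree = false };

        release_buffer(&tracked);       /* unlinked, then freed */
        release_buffer(&dummy);         /* freed only; dummy buffers skip the index */
        return 0;
}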
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 7 ++++--- fs/btrfs/extent_io.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1d8244d39b6..92a3ad4656a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4617,6 +4617,7 @@ again: } /* add one reference for the tree */ check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * there is a race where release page may have @@ -4660,9 +4661,7 @@ static int release_extent_buffer(struct extent_buffer *eb) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { - spin_unlock(&eb->refs_lock); - } else { + if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { struct extent_io_tree *tree = eb->tree; spin_unlock(&eb->refs_lock); @@ -4671,6 +4670,8 @@ static int release_extent_buffer(struct extent_buffer *eb) radix_tree_delete(&tree->buffer, eb->start >> PAGE_CACHE_SHIFT); spin_unlock(&tree->buffer_lock); + } else { + spin_unlock(&eb->refs_lock); } /* Should be safe to release our pages at this point */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 19620c58f09..92e4347315e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -43,6 +43,7 @@ #define EXTENT_BUFFER_WRITEBACK 7 #define EXTENT_BUFFER_IOERR 8 #define EXTENT_BUFFER_DUMMY 9 +#define EXTENT_BUFFER_IN_TREE 10 /* these are flags for extent_clear_unlock_delalloc */ #define PAGE_UNLOCK (1 << 0) -- cgit v1.2.3-70-g09d2 From f28491e0a6c46d99cbbef0f8ef7e314afa2359c8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Dec 2013 13:24:27 -0500 Subject: Btrfs: move the extent buffer radix tree into the fs_info I need to create a fake tree to test qgroups and I don't want to have to setup a fake btree_inode. The fact is we only use the radix tree for the fs_info, so everybody else who allocates an extent_io_tree is just wasting the space anyway. This patch moves the radix tree and its lock into btrfs_fs_info so there is less stuff I have to fake to do qgroup sanity tests. 
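The key point is that buffers are indexed by their byte offset shifted down to a page index, in one per-filesystem structure rather than one per io_tree. In the sketch below a toy direct-mapped table stands in for the radix tree; collisions and locking are ignored, and the names are made up:

#include <stdio.h>

#define PAGE_SHIFT 12
#define SLOTS 1024                              /* toy capacity, ignores collisions */

struct buffer { unsigned long start; };

static struct buffer *slot_table[SLOTS];        /* one table per filesystem */

static void insert_buffer(struct buffer *b)
{
        slot_table[(b->start >> PAGE_SHIFT) % SLOTS] = b;
}

static struct buffer *find_buffer(unsigned long start)
{
        struct buffer *b = slot_table[(start >> PAGE_SHIFT) % SLOTS];

        return b && b->start == start ? b : NULL;
}

int main(void)
{
        struct buffer eb = { .start = 16384 };  /* page index 4 */

        insert_buffer(&eb);
        printf("%s\n", find_buffer(16384) ? "found" : "missing");
        return 0;
}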
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 14 ++++---------- fs/btrfs/extent_io.c | 47 +++++++++++++++++++++++------------------------ fs/btrfs/extent_io.h | 8 +++----- 4 files changed, 34 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7158c97fdc5..a924274a503 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1659,6 +1659,10 @@ struct btrfs_fs_info { spinlock_t reada_lock; struct radix_tree_root reada_tree; + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + struct radix_tree_root buffer_radix; + /* next backup root to be overwritten */ int backup_root_index; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ecee0a009c..4a1871cf797 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1089,21 +1089,13 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr); - return eb; + return find_extent_buffer(root->fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - - eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); - return eb; + return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -2145,6 +2137,7 @@ int open_ctree(struct super_block *sb, mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2158,6 +2151,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); seqlock_init(&fs_info->profiles_lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 92a3ad4656a..2fa23b57d8d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -194,11 +194,9 @@ void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping) { tree->state = RB_ROOT; - INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - spin_lock_init(&tree->buffer_lock); tree->mapping = mapping; } @@ -3489,6 +3487,7 @@ static int write_one_eb(struct extent_buffer *eb, struct extent_page_data *epd) { struct block_device *bdev = fs_info->fs_devices->latest_bdev; + struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; @@ -3506,7 +3505,7 @@ static int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + ret = submit_extent_page(rw, tree, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags); @@ -4370,10 +4369,9 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 
__free_extent_buffer(eb); } -static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, - unsigned long len, - gfp_t mask) +static struct extent_buffer * +__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, + unsigned long len, gfp_t mask) { struct extent_buffer *eb = NULL; @@ -4382,7 +4380,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - eb->tree = tree; + eb->fs_info = fs_info; eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); @@ -4514,13 +4512,14 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) } } -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start) +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); mark_extent_buffer_accessed(eb); @@ -4531,7 +4530,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); @@ -4540,16 +4539,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *eb; struct extent_buffer *exists = NULL; struct page *p; - struct address_space *mapping = tree->mapping; + struct address_space *mapping = fs_info->btree_inode->i_mapping; int uptodate = 1; int ret; - - eb = find_extent_buffer(tree, start); + eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); if (!eb) return NULL; @@ -4604,12 +4602,13 @@ again: if (ret) goto free_eb; - spin_lock(&tree->buffer_lock); - ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); - spin_unlock(&tree->buffer_lock); + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { - exists = find_extent_buffer(tree, start); + exists = find_extent_buffer(fs_info, start); if (exists) goto free_eb; else @@ -4662,14 +4661,14 @@ static int release_extent_buffer(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { - struct extent_io_tree *tree = eb->tree; + struct btrfs_fs_info *fs_info = eb->fs_info; spin_unlock(&eb->refs_lock); - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, eb->start >> PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); + spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 92e4347315e..58b27e5ab52 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -95,12 +95,10 @@ struct extent_io_ops { struct extent_io_tree { struct rb_root state; - struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; int track_uptodate; spinlock_t lock; - 
spinlock_t buffer_lock; struct extent_io_ops *ops; }; @@ -131,7 +129,7 @@ struct extent_buffer { unsigned long map_start; unsigned long map_len; unsigned long bflags; - struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; spinlock_t refs_lock; atomic_t refs; atomic_t io_pages; @@ -267,11 +265,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); -- cgit v1.2.3-70-g09d2 From 783577663507411e36e459390ef056556e93ef29 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Thu, 12 Dec 2013 19:19:52 +0000 Subject: Btrfs: return immediately if tree log mod is not necessary In ctree.c:tree_mod_log_set_node_key() we were calling __tree_mod_log_insert_key() even when the modification doesn't need to be logged. This would allocate a tree_mod_elem structure, fill it and pass it to __tree_mod_log_insert(), which would just acquire the tree mod log write lock and then free the tree_mod_elem structure and return (that is, a no-op). Therefore call tree_mod_log_insert_key(), which returns immediately if the modification doesn't need to be logged, instead of __tree_mod_log_insert_key() (avoiding allocating the structure, filling it, acquiring the write lock and freeing the structure). Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a57507aa670..59664f6cbc4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -771,7 +771,7 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, { int ret; - ret = __tree_mod_log_insert_key(fs_info, eb, slot, + ret = tree_mod_log_insert_key(fs_info, eb, slot, MOD_LOG_KEY_REPLACE, atomic ? GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); -- cgit v1.2.3-70-g09d2 From 5662344b3c0d9ddd9afd48716d795166f982d5e2 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Fri, 13 Dec 2013 09:51:42 +0900 Subject: Btrfs: fix error check of btrfs_lookup_dentry() Clean up btrfs_lookup_dentry() to never return NULL, but ERR_PTR(-ENOENT) instead. This keeps the return value convention consistent. Callers who use btrfs_lookup_dentry() require a trivial update. create_snapshot() in particular looks like it can also lose a BUG_ON(!inode) which is not really needed - there seems less harm in returning ENOENT to userspace at that point in the stack than there is to crash the machine.
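As a reminder of the error-pointer convention this message refers to, here is a generic kernel idiom sketch (not code from this patch; the names example_lookup and struct example_dir are hypothetical): a pointer-returning function encodes an errno with ERR_PTR(), and callers test with IS_ERR()/PTR_ERR() rather than comparing against NULL.

	#include <linux/err.h>

	/* Hypothetical pointer-returning lookup. */
	static struct inode *example_lookup(struct example_dir *dir)
	{
		if (!dir->match)
			return ERR_PTR(-ENOENT);	/* encode errno as a pointer, never NULL */
		return dir->inode;
	}

	/* Caller side: */
	inode = example_lookup(dir);
	if (IS_ERR(inode))
		return PTR_ERR(inode);			/* decode and propagate -ENOENT etc. */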
Signed-off-by: Tsutomu Itoh Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 15 +++++++++++---- fs/btrfs/ioctl.c | 13 ++++++++++--- 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e889779c9b3..2bd4f7590c8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4992,7 +4992,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.objectid == 0) - return NULL; + return ERR_PTR(-ENOENT); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); @@ -5056,10 +5056,17 @@ static void btrfs_dentry_release(struct dentry *dentry) static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; + struct inode *inode; - ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); - return ret; + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOENT) + inode = NULL; + else + return ERR_CAST(inode); + } + + return d_splice_alias(inode, dentry); } unsigned char btrfs_filetype_table[] = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 60d3d37ef9a..89c2f6169b9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -393,6 +393,7 @@ static noinline int create_subvol(struct inode *dir, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec cur_time = CURRENT_TIME; + struct inode *inode; int ret; int err; u64 objectid; @@ -554,8 +555,14 @@ fail: if (err && !ret) ret = err; - if (!ret) - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + if (!ret) { + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + d_instantiate(dentry, inode); + } out: btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); return ret; @@ -643,7 +650,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(inode); goto fail; } - BUG_ON(!inode); + d_instantiate(dentry, inode); ret = 0; fail: -- cgit v1.2.3-70-g09d2 From 54eb72c05f7731b4b148da47419b90a5f2108036 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Fri, 13 Dec 2013 18:30:44 +0800 Subject: Btrfs: remove unnecessary filemap writing and waiting after block group relocation We have committed the transaction before, so remove the redundant filemap writing and waiting here; it can speed up the balance relocation process.
Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 63708f77c8e..d8a82b84998 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4283,11 +4283,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) } } - filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); - WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); -- cgit v1.2.3-70-g09d2 From fc28b62d648dde3cfbec6584c0fa19ea7350e7e9 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Fri, 13 Dec 2013 19:39:34 +0000 Subject: Btrfs: fix use of uninitialized err variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/btrfs/file.c: In function ‘prepare_pages.isra.18’: fs/btrfs/file.c:1265:6: warning: ‘err’ may be used uninitialized in this function [-Wuninitialized] Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 740ae8c7170..35bf83876f3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1244,7 +1244,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int err; + int err = 0; int faili; for (i = 0; i < num_pages; i++) { -- cgit v1.2.3-70-g09d2 From e223cfcd3eed7401e4217c7fb7391017c8124d8c Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Fri, 13 Dec 2013 19:39:54 +0000 Subject: Btrfs: remove field tree_mod_seq_elem from btrfs_fs_info struct It's not used anywhere, so just drop it. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a924274a503..3c9053a153b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1462,7 +1462,6 @@ struct btrfs_fs_info { spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; struct list_head tree_mod_seq_list; - struct seq_list tree_mod_seq_elem; /* this protects tree_mod_log */ rwlock_t tree_mod_log_lock; -- cgit v1.2.3-70-g09d2 From 663df053309c8d9200b87cc1a129729b8e97eb26 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sun, 15 Dec 2013 11:39:42 +0800 Subject: Btrfs: remove dead comments for read_csums() Chris introduced the helper function read_csums(); the function has since been removed, but we forgot to remove its corresponding comments. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2bd4f7590c8..b3b3142f173 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2543,12 +2543,6 @@ out_kfree: return NULL; } -/* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction.
- */ /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. -- cgit v1.2.3-70-g09d2 From 3fe81ce206f3805e0eb5d886aabbf91064655144 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sun, 15 Dec 2013 12:43:58 +0000 Subject: Btrfs: fix deadlock when iterating inode refs and running delayed inodes While running btrfs/004 from xfstests, after 503 iterations, dmesg reported a deadlock between tasks iterating inode refs and tasks running delayed inodes (during a transaction commit). It turns out that iterating inode refs implies doing one tree search and release all nodes in the path except the leaf node, and then passing that leaf node to btrfs_ref_to_path(), which in turn does another tree search without releasing the lock on the leaf node it received as parameter. This is a problem when other task wants to write to the btree as well and ends up updating the leaf that is read locked - the writer task locks the parent of the leaf and then blocks waiting for the leaf's lock to be released - at the same time, the task executing btrfs_ref_to_path() does a second tree search, without releasing the lock on the first leaf, and wants to access a leaf (the same or another one) that is a child of the same parent, resulting in a deadlock. The trace reported by lockdep follows. [84314.936373] INFO: task fsstress:11930 blocked for more than 120 seconds. [84314.936381] Tainted: G W O 3.12.0-fdm-btrfs-next-16+ #70 [84314.936383] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [84314.936386] fsstress D ffff8806e1bf8000 0 11930 11926 0x00000000 [84314.936393] ffff8804d6d89b78 0000000000000046 ffff8804d6d89b18 ffffffff810bd8bd [84314.936399] ffff8806e1bf8000 ffff8804d6d89fd8 ffff8804d6d89fd8 ffff8804d6d89fd8 [84314.936405] ffff880806308000 ffff8806e1bf8000 ffff8804d6d89c08 ffff8804deb8f190 [84314.936410] Call Trace: [84314.936421] [] ? trace_hardirqs_on+0xd/0x10 [84314.936428] [] schedule+0x29/0x70 [84314.936451] [] btrfs_tree_lock+0x75/0x270 [btrfs] [84314.936457] [] ? __init_waitqueue_head+0x60/0x60 [84314.936470] [] btrfs_search_slot+0x7f1/0x930 [btrfs] [84314.936489] [] ? __btrfs_run_delayed_items+0x13a/0x1e0 [btrfs] [84314.936504] [] btrfs_lookup_inode+0x2f/0xa0 [btrfs] [84314.936510] [] ? trace_hardirqs_on_caller+0x1f/0x1e0 [84314.936528] [] __btrfs_update_delayed_inode+0x4c/0x1d0 [btrfs] [84314.936543] [] ? __btrfs_run_delayed_items+0x13a/0x1e0 [btrfs] [84314.936558] [] ? __btrfs_run_delayed_items+0x13a/0x1e0 [btrfs] [84314.936573] [] __btrfs_run_delayed_items+0x192/0x1e0 [btrfs] [84314.936589] [] btrfs_run_delayed_items+0x13/0x20 [btrfs] [84314.936604] [] btrfs_flush_all_pending_stuffs+0x24/0x80 [btrfs] [84314.936620] [] btrfs_commit_transaction+0x223/0xa20 [btrfs] [84314.936630] [] btrfs_sync_fs+0x6e/0x110 [btrfs] [84314.936635] [] ? __sync_filesystem+0x60/0x60 [84314.936639] [] ? __sync_filesystem+0x60/0x60 [84314.936643] [] sync_fs_one_sb+0x20/0x30 [84314.936648] [] iterate_supers+0xf1/0x100 [84314.936652] [] sys_sync+0x55/0x90 [84314.936658] [] system_call_fastpath+0x16/0x1b [84314.936660] INFO: lockdep is turned off. [84314.936663] INFO: task btrfs:11955 blocked for more than 120 seconds. [84314.936666] Tainted: G W O 3.12.0-fdm-btrfs-next-16+ #70 [84314.936668] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
[84314.936670] btrfs D ffff880541729a88 0 11955 11608 0x00000000 [84314.936674] ffff880541729a38 0000000000000046 ffff8805417299d8 ffffffff810bd8bd [84314.936680] ffff88075430c8a0 ffff880541729fd8 ffff880541729fd8 ffff880541729fd8 [84314.936685] ffffffff81c104e0 ffff88075430c8a0 ffff8804de8b00b8 ffff8804de8b0000 [84314.936690] Call Trace: [84314.936695] [] ? trace_hardirqs_on+0xd/0x10 [84314.936700] [] schedule+0x29/0x70 [84314.936717] [] btrfs_tree_read_lock+0xd5/0x140 [btrfs] [84314.936721] [] ? __init_waitqueue_head+0x60/0x60 [84314.936733] [] btrfs_search_slot+0x7c1/0x930 [btrfs] [84314.936746] [] btrfs_find_item+0x55/0x160 [btrfs] [84314.936763] [] ? free_extent_buffer+0x49/0xc0 [btrfs] [84314.936780] [] btrfs_ref_to_path+0xba/0x1e0 [btrfs] [84314.936797] [] ? release_extent_buffer+0xb9/0xe0 [btrfs] [84314.936813] [] ? free_extent_buffer+0x49/0xc0 [btrfs] [84314.936830] [] inode_to_path+0x60/0xd0 [btrfs] [84314.936846] [] paths_from_inode+0x115/0x3c0 [btrfs] [84314.936851] [] ? kmem_cache_alloc_trace+0x114/0x200 [84314.936868] [] btrfs_ioctl+0xf14/0x2030 [btrfs] [84314.936873] [] ? _raw_spin_unlock+0x2b/0x50 [84314.936877] [] ? handle_mm_fault+0x34f/0xb00 [84314.936882] [] ? up_read+0x23/0x40 [84314.936886] [] ? __do_page_fault+0x20c/0x5a0 [84314.936892] [] do_vfs_ioctl+0x96/0x570 [84314.936896] [] ? error_sti+0x5/0x6 [84314.936901] [] ? trace_hardirqs_off_caller+0x28/0xd0 [84314.936906] [] ? retint_swapgs+0xe/0x13 [84314.936910] [] SyS_ioctl+0x91/0xb0 [84314.936915] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [84314.936920] [] system_call_fastpath+0x16/0x1b [84314.936922] INFO: lockdep is turned off. [84434.866873] INFO: task btrfs-transacti:11921 blocked for more than 120 seconds. [84434.866881] Tainted: G W O 3.12.0-fdm-btrfs-next-16+ #70 [84434.866883] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [84434.866886] btrfs-transacti D ffff880755b6a478 0 11921 2 0x00000000 [84434.866893] ffff8800735b9ce8 0000000000000046 ffff8800735b9c88 ffffffff810bd8bd [84434.866899] ffff8805a1b848a0 ffff8800735b9fd8 ffff8800735b9fd8 ffff8800735b9fd8 [84434.866904] ffffffff81c104e0 ffff8805a1b848a0 ffff880755b6a478 ffff8804cece78f0 [84434.866910] Call Trace: [84434.866920] [] ? trace_hardirqs_on+0xd/0x10 [84434.866927] [] schedule+0x29/0x70 [84434.866948] [] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [84434.866954] [] ? __init_waitqueue_head+0x60/0x60 [84434.866970] [] start_transaction+0x388/0x5a0 [btrfs] [84434.866985] [] ? transaction_kthread+0xb5/0x280 [btrfs] [84434.866999] [] btrfs_attach_transaction+0x17/0x20 [btrfs] [84434.867012] [] transaction_kthread+0x19e/0x280 [btrfs] [84434.867026] [] ? open_ctree+0x2260/0x2260 [btrfs] [84434.867030] [] kthread+0xed/0x100 [84434.867035] [] ? flush_kthread_worker+0x190/0x190 [84434.867040] [] ret_from_fork+0x7c/0xb0 [84434.867044] [] ? flush_kthread_worker+0x190/0x190 Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6a3f7f50ad3..835b6c9a26a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1569,7 +1569,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? 
parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1582,9 +1581,12 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, parent = found_key.offset; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); @@ -1642,9 +1644,12 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ++found; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); -- cgit v1.2.3-70-g09d2 From 95bc79d50d0ec20c0cdb071629dc3f276a053782 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 16 Dec 2013 17:34:10 +0100 Subject: btrfs: send: clean up dead code Remove ifdefed code: - tlv_put for 8, 16 and 32, add a generic template if needed in the future - tlv_put_timespec - the btrfs_timespec fields are used - fs_path_remove obsoleted long ago Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 58 ++++++++------------------------------------------------- 1 file changed, 8 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1896e394d59..8230d11f2cc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -344,16 +344,6 @@ out: return ret; } -#if 0 -static void fs_path_remove(struct fs_path *p) -{ - BUG_ON(p->reversed); - while (p->start != p->end && *p->end != '/') - p->end--; - *p->end = 0; -} -#endif - static int fs_path_copy(struct fs_path *p, struct fs_path *from) { int ret; @@ -444,30 +434,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return 0; } -#if 0 -static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value) -{ - return tlv_put(sctx, attr, &value, sizeof(value)); -} - -static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value) -{ - __le16 tmp = cpu_to_le16(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} - -static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value) -{ - __le32 tmp = cpu_to_le32(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} -#endif +#define TLV_PUT_DEFINE_INT(bits) \ + static int tlv_put_u##bits(struct send_ctx *sctx, \ + u##bits attr, u##bits value) \ + { \ + __le##bits __tmp = cpu_to_le##bits(value); \ + return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ + } -static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value) -{ - __le64 tmp = cpu_to_le64(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} +TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, const char *str, int len) @@ -483,17 +458,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); } -#if 0 -static int tlv_put_timespec(struct send_ctx *sctx, u16 attr, - struct timespec *ts) -{ - struct btrfs_timespec bts; - bts.sec = cpu_to_le64(ts->tv_sec); - bts.nsec = cpu_to_le32(ts->tv_nsec); - return tlv_put(sctx, attr, &bts, sizeof(bts)); -} -#endif - static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, struct extent_buffer *eb, struct btrfs_timespec *ts)
@@ -541,12 +505,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, if (ret < 0) \ goto tlv_put_failure; \ } while (0) -#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \ - do { \ - ret = tlv_put_timespec(sctx, attrtype, ts); \ - if (ret < 0) \ - goto tlv_put_failure; \ - } while (0) #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ do { \ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ if (ret < 0) \ goto tlv_put_failure; \ } while (0) -- cgit v1.2.3-70-g09d2 From a8d89f5ba0e17622cde8f5ac48ef745a9fb1e13b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 16 Dec 2013 17:34:14 +0100 Subject: btrfs: remove unused mnt from send_ctx Unused since ed2590953bd06b892f0411fc94e19175d32f197a "Btrfs: stop using vfs_read in send". Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8230d11f2cc..e98c9bc003c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -88,8 +88,6 @@ struct send_ctx { u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ - struct vfsmount *mnt; - struct btrfs_root *send_root; struct btrfs_root *parent_root; struct clone_root *clone_roots; @@ -4851,8 +4849,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - sctx->mnt = mnt_file->f_path.mnt; - sctx->send_root = send_root; sctx->clone_roots_cnt = arg->clone_sources_count; -- cgit v1.2.3-70-g09d2 From 2c68653787f91c62f8891209dc1f617088c822e4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 16 Dec 2013 17:34:17 +0100 Subject: btrfs: Check read-only status of roots during send All the subvolumes that are involved in send must be read-only during the whole operation. The ioctl SUBVOL_SETFLAGS could be used to change the status to read-write, and the result of the send stream is undefined if the data changes unexpectedly. Fix that by adding a refcount for all involved roots and verifying that there's no send in progress during a SUBVOL_SETFLAGS ioctl call that does a read-only -> read-write transition. We need refcounts because there are no restrictions on the number of parallel send operations currently running on a single subvolume, be it source, parent or one of the multiple clone sources. The kernel is silent when the RO checks fail and returns EPERM. The same set of checks is done already in userspace before send starts.
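A condensed sketch of the pattern the diff below applies to the send root, the parent root and every clone source (the helper name is invented here; the real patch open-codes this in btrfs_ioctl_send() and unwinds the counters on its out: path):

	static int sketch_pin_root_for_send(struct btrfs_root *root)
	{
		spin_lock(&root->root_item_lock);
		root->send_in_progress++;	/* seen by SUBVOL_SETFLAGS, blocks RO -> RW */
		if (!btrfs_root_readonly(root)) {
			/* Root is already read-write: undo and refuse, matching the EPERM above. */
			root->send_in_progress--;
			spin_unlock(&root->root_item_lock);
			return -EPERM;
		}
		spin_unlock(&root->root_item_lock);
		return 0;
	}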
Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 6 ++++++ fs/btrfs/ioctl.c | 22 +++++++++++++++++++--- fs/btrfs/send.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3c9053a153b..9318c7520f5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1814,6 +1814,12 @@ struct btrfs_root { struct list_head ordered_extents; struct list_head ordered_root; u64 nr_ordered_extents; + + /* + * Number of currently running SEND ioctls to prevent + * manipulation with the read-only status via SUBVOL_SETFLAGS + */ + int send_in_progress; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 89c2f6169b9..c0dc05467ce 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1706,12 +1706,28 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_sem; root_flags = btrfs_root_flags(&root->root_item); - if (flags & BTRFS_SUBVOL_RDONLY) + if (flags & BTRFS_SUBVOL_RDONLY) { btrfs_set_root_flags(&root->root_item, root_flags | BTRFS_ROOT_SUBVOL_RDONLY); - else - btrfs_set_root_flags(&root->root_item, + } else { + /* + * Block RO -> RW transition if this subvolume is involved in + * send + */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress == 0) { + btrfs_set_root_flags(&root->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + spin_unlock(&root->root_item_lock); + } else { + spin_unlock(&root->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to set subvolume %llu read-write during send", + root->root_key.objectid); + ret = -EPERM; + goto out_drop_sem; + } + } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e98c9bc003c..572e8c75871 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4769,6 +4769,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; + int clone_sources_to_rollback = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4776,6 +4777,14 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) send_root = BTRFS_I(file_inode(mnt_file))->root; fs_info = send_root->fs_info; + /* + * The subvolume must remain read-only during send, protect against + * making it RW. + */ + spin_lock(&send_root->root_item_lock); + send_root->send_in_progress++; + spin_unlock(&send_root->root_item_lock); + /* * This is done when we lookup the root, it should already be complete * by the time we get here. @@ -4811,6 +4820,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) up_read(&send_root->fs_info->extent_commit_sem); } + /* + * Userspace tools do the checks and warn the user if it's + * not RO. 
+ */ + if (!btrfs_root_readonly(send_root)) { + ret = -EPERM; + goto out; + } + arg = memdup_user(arg_, sizeof(*arg)); if (IS_ERR(arg)) { ret = PTR_ERR(arg); @@ -4897,6 +4915,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) ret = PTR_ERR(clone_root); goto out; } + clone_sources_to_rollback = i + 1; + spin_lock(&clone_root->root_item_lock); + clone_root->send_in_progress++; + if (!btrfs_root_readonly(clone_root)) { + spin_unlock(&clone_root->root_item_lock); + ret = -EPERM; + goto out; + } + spin_unlock(&clone_root->root_item_lock); sctx->clone_roots[i].root = clone_root; } vfree(clone_sources_tmp); @@ -4912,6 +4939,14 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) ret = PTR_ERR(sctx->parent_root); goto out; } + spin_lock(&sctx->parent_root->root_item_lock); + sctx->parent_root->send_in_progress++; + if (!btrfs_root_readonly(sctx->parent_root)) { + spin_unlock(&sctx->parent_root->root_item_lock); + ret = -EPERM; + goto out; + } + spin_unlock(&sctx->parent_root->root_item_lock); } /* @@ -4940,6 +4975,25 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } out: + for (i = 0; sctx && i < clone_sources_to_rollback; i++) { + struct btrfs_root *r = sctx->clone_roots[i].root; + + spin_lock(&r->root_item_lock); + r->send_in_progress--; + spin_unlock(&r->root_item_lock); + } + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { + struct btrfs_root *r = sctx->parent_root; + + spin_lock(&r->root_item_lock); + r->send_in_progress--; + spin_unlock(&r->root_item_lock); + } + + spin_lock(&send_root->root_item_lock); + send_root->send_in_progress--; + spin_unlock(&send_root->root_item_lock); + kfree(arg); vfree(clone_sources_tmp); -- cgit v1.2.3-70-g09d2 From 180589efde8a01b4a30af273f670ac81c8abf9c5 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 14 Dec 2013 15:27:31 +0800 Subject: Btrfs: fix a warning when iput a file See the warning below: [ 1209.102076] [] remove_extent_mapping+0x69/0x70 [btrfs] [ 1209.102084] [] btrfs_evict_inode+0x96/0x4d0 [btrfs] [ 1209.102089] [] ? wake_atomic_t_function+0x40/0x40 [ 1209.102092] [] evict+0x9e/0x190 [ 1209.102094] [] iput+0xf3/0x180 [ 1209.102101] [] btrfs_run_delayed_iputs+0xb1/0xd0 [btrfs] [ 1209.102107] [] __btrfs_end_transaction+0x268/0x350 [btrfs] clear extent bit here to avoid triggering WARN_ON() in remove_extent_mapping() Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b3b3142f173..2ccf8e6b1e1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4509,6 +4509,8 @@ static void evict_inode_truncate_pages(struct inode *inode) node = rb_first(&map_tree->map); em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); remove_extent_mapping(map_tree, em); free_extent_map(em); } -- cgit v1.2.3-70-g09d2 From 536cd96401dcb986f681bac385b5146b45a44e66 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 17 Dec 2013 12:01:12 +0800 Subject: Btrfs: fix double initialization of the raid kobject We met the following oops when doing space balance: kobject (ffff88081b590278): tried to init an initialized object, something is seriously wrong. ... Call Trace: [] dump_stack+0x49/0x5f [] kobject_init+0x89/0xa0 [] kobject_init_and_add+0x2a/0x70 [] ? 
clear_extent_bit+0x199/0x470 [btrfs] [] __link_block_group+0xfc/0x120 [btrfs] [] btrfs_make_block_group+0x24b/0x370 [btrfs] [] __btrfs_alloc_chunk+0x54b/0x7e0 [btrfs] [] btrfs_alloc_chunk+0x3f/0x50 [btrfs] [] do_chunk_alloc+0x363/0x440 [btrfs] [] btrfs_check_data_free_space+0x104/0x310 [btrfs] [] btrfs_write_dirty_block_groups+0x48d/0x600 [btrfs] [] commit_cowonly_roots+0x184/0x250 [btrfs] ... Steps to reproduce: # mkfs.btrfs -f # mount -o nospace_cache # btrfs balance start # dd if=/dev/zero of=/tmpfile bs=1M count=1 The reason for this problem is that we initialized the raid kobject when we added a block group into an empty raid list. As we know, when we mounted a btrfs filesystem, the raid list was empty, so we would initialize the raid kobject when we added the first block group. But if there was no data stored in the block group, the block group would be freed when doing balance, and the raid list would be empty. And then if we allocated a new block group and added it into the raid list, we would initialize the raid kobject again, and the oops happened. Fix this problem by initializing the raid kobject just when mounting the fs. Signed-off-by: Miao Xie Reported-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f08f6dda949..9e524b0a7b3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3463,8 +3463,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { INIT_LIST_HEAD(&found->block_groups[i]); + kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype); + } init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -8422,9 +8424,8 @@ static void __link_block_group(struct btrfs_space_info *space_info, int ret; kobject_get(&space_info->kobj); /* put in release */ - ret = kobject_init_and_add(kobj, &btrfs_raid_ktype, - &space_info->kobj, "%s", - get_raid_name(index)); + ret = kobject_add(kobj, &space_info->kobj, "%s", + get_raid_name(index)); if (ret) { pr_warn("btrfs: failed to add kobject for block cache. ignoring.\n"); kobject_put(&space_info->kobj); -- cgit v1.2.3-70-g09d2 From 41ce9970a8a6a362ae8df145f7a03d789e9ef9d2 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 17 Dec 2013 19:57:21 +0800 Subject: Btrfs: remove transaction from btrfs send Since David did the work that forces us to use read-only snapshots, we can safely remove the transaction protection from btrfs send. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 572e8c75871..78a43b2e5c8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4618,7 +4618,6 @@ out: static int full_send_tree(struct send_ctx *sctx) { int ret; - struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; @@ -4640,19 +4639,6 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; -join_trans: - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. Join a transaction to prevent - * this.
- */ - trans = btrfs_join_transaction(send_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - /* * Make sure the tree has not changed after re-joining. We detect this * by comparing start_ctransid and ctransid. They should always match. @@ -4676,19 +4662,6 @@ join_trans: goto out_finish; while (1) { - /* - * When someone want to commit while we iterate, end the - * joined transaction and rejoin. - */ - if (btrfs_should_end_transaction(trans, send_root)) { - ret = btrfs_end_transaction(trans, send_root); - trans = NULL; - if (ret < 0) - goto out; - btrfs_release_path(path); - goto join_trans; - } - eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -4716,12 +4689,6 @@ out_finish: out: btrfs_free_path(path); - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, send_root); - else - btrfs_end_transaction(trans, send_root); - } return ret; } -- cgit v1.2.3-70-g09d2 From 66ef7d65c3fc6e5300b9359f1c6537efb23781bb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 17 Dec 2013 15:07:20 +0100 Subject: btrfs: check balance of send_in_progress Warn if the balance goes below zero, which appears to be unlikely though. Otherwise cleans up the code a bit. Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 78a43b2e5c8..8877adc4539 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4725,6 +4725,21 @@ out: return ret; } +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ + spin_lock(&root->root_item_lock); + root->send_in_progress--; + /* + * Not much left to do, we don't know why it's unbalanced and + * can't blindly reset it to 0. + */ + if (root->send_in_progress < 0) + btrfs_err(root->fs_info, + "send_in_progres unbalanced %d root %llu\n", + root->send_in_progress, root->root_key.objectid); + spin_unlock(&root->root_item_lock); +} + long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) { int ret = 0; @@ -4942,24 +4957,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } out: - for (i = 0; sctx && i < clone_sources_to_rollback; i++) { - struct btrfs_root *r = sctx->clone_roots[i].root; - - spin_lock(&r->root_item_lock); - r->send_in_progress--; - spin_unlock(&r->root_item_lock); - } - if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { - struct btrfs_root *r = sctx->parent_root; - - spin_lock(&r->root_item_lock); - r->send_in_progress--; - spin_unlock(&r->root_item_lock); - } - - spin_lock(&send_root->root_item_lock); - send_root->send_in_progress--; - spin_unlock(&send_root->root_item_lock); + for (i = 0; sctx && i < clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress(sctx->clone_roots[i].root); + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + btrfs_root_dec_send_in_progress(sctx->parent_root); + btrfs_root_dec_send_in_progress(send_root); kfree(arg); vfree(clone_sources_tmp); -- cgit v1.2.3-70-g09d2 From 5de865eebb8330eee19c37b31fb6f315a09d4273 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Fri, 20 Dec 2013 15:17:46 +0000 Subject: Btrfs: fix tree mod logging While running the test btrfs/004 from xfstests in a loop, it failed about 1 time out of 20 runs in my desktop. The failure happened in the backref walking part of the test, and the test's error message was like this: btrfs/004 93s ... 
[failed, exit status 1] - output mismatch (see /home/fdmanana/git/hub/xfstests_2/results//btrfs/004.out.bad) --- tests/btrfs/004.out 2013-11-26 18:25:29.263333714 +0000 +++ /home/fdmanana/git/hub/xfstests_2/results//btrfs/004.out.bad 2013-12-10 15:25:10.327518516 +0000 @@ -1,3 +1,8 @@ QA output created by 004 *** test backref walking -*** done +unexpected output from + /home/fdmanana/git/hub/btrfs-progs/btrfs inspect-internal logical-resolve -P 141512704 /home/fdmanana/btrfs-tests/scratch_1 +expected inum: 405, expected address: 454656, file: /home/fdmanana/btrfs-tests/scratch_1/snap1/p0/d6/d3d/d156/fce, got: + ... (Run 'diff -u tests/btrfs/004.out /home/fdmanana/git/hub/xfstests_2/results//btrfs/004.out.bad' to see the entire diff) Ran: btrfs/004 Failures: btrfs/004 Failed 1 of 1 tests But immediately after the test finished, the btrfs inspect-internal command returned the expected output: $ btrfs inspect-internal logical-resolve -P 141512704 /home/fdmanana/btrfs-tests/scratch_1 inode 405 offset 454656 root 258 inode 405 offset 454656 root 5 It turned out this was because the btrfs_search_old_slot() calls performed during backref walking (backref.c:__resolve_indirect_ref) were not finding anything. The reason for this turned out to be that the tree mod logging code was not logging some node multi-step operations atomically, therefore btrfs_search_old_slot() callers iterated often over an incomplete tree that wasn't fully consistent with any tree state from the past. Besides missing items, this often (but not always) resulted in -EIO errors during old slot searches, reported in dmesg like this: [ 4299.933936] ------------[ cut here ]------------ [ 4299.933949] WARNING: CPU: 0 PID: 23190 at fs/btrfs/ctree.c:1343 btrfs_search_old_slot+0x57b/0xab0 [btrfs]() [ 4299.933950] Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) bnep rfcomm bluetooth parport_pc ppdev binfmt_misc joydev snd_hda_codec_h [ 4299.933977] CPU: 0 PID: 23190 Comm: btrfs Tainted: G W O 3.12.0-fdm-btrfs-next-16+ #70 [ 4299.933978] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012 [ 4299.933979] 000000000000053f ffff8806f3fd98f8 ffffffff8176d284 0000000000000007 [ 4299.933982] 0000000000000000 ffff8806f3fd9938 ffffffff8104a81c ffff880659c64b70 [ 4299.933984] ffff880659c643d0 ffff8806599233d8 ffff880701e2e938 0000160000000000 [ 4299.933987] Call Trace: [ 4299.933991] [] dump_stack+0x55/0x76 [ 4299.933994] [] warn_slowpath_common+0x8c/0xc0 [ 4299.933997] [] warn_slowpath_null+0x1a/0x20 [ 4299.934003] [] btrfs_search_old_slot+0x57b/0xab0 [btrfs] [ 4299.934005] [] ? _raw_read_unlock+0x2b/0x50 [ 4299.934010] [] ? __tree_mod_log_search+0x81/0xc0 [btrfs] [ 4299.934019] [] __resolve_indirect_refs+0x130/0x5f0 [btrfs] [ 4299.934027] [] ? free_extent_buffer+0x61/0xc0 [btrfs] [ 4299.934034] [] find_parent_nodes+0x1fc/0xe40 [btrfs] [ 4299.934042] [] ? defrag_lookup_extent+0xe0/0xe0 [btrfs] [ 4299.934048] [] ? defrag_lookup_extent+0xe0/0xe0 [btrfs] [ 4299.934056] [] iterate_extent_inodes+0xe0/0x250 [btrfs] [ 4299.934058] [] ? _raw_spin_unlock+0x2b/0x50 [ 4299.934065] [] iterate_inodes_from_logical+0x92/0xb0 [btrfs] [ 4299.934071] [] ? defrag_lookup_extent+0xe0/0xe0 [btrfs] [ 4299.934078] [] btrfs_ioctl+0xf65/0x1f60 [btrfs] [ 4299.934080] [] ? handle_mm_fault+0x278/0xb00 [ 4299.934083] [] ? up_read+0x23/0x40 [ 4299.934085] [] ? __do_page_fault+0x20c/0x5a0 [ 4299.934088] [] do_vfs_ioctl+0x96/0x570 [ 4299.934090] [] ? error_sti+0x5/0x6 [ 4299.934093] [] ? 
trace_hardirqs_off_caller+0x28/0xd0 [ 4299.934096] [] ? retint_swapgs+0xe/0x13 [ 4299.934098] [] SyS_ioctl+0x91/0xb0 [ 4299.934100] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 4299.934102] [] system_call_fastpath+0x16/0x1b [ 4299.934102] [] system_call_fastpath+0x16/0x1b [ 4299.934104] ---[ end trace 48f0cfc902491414 ]--- [ 4299.934378] btrfs bad fsid on block 0 These tree mod log operations that must be performed atomically, tree_mod_log_free_eb, tree_mod_log_eb_copy, tree_mod_log_insert_root and tree_mod_log_insert_move, used to be performed atomically before the following commit: c8cc6341653721b54760480b0d0d9b5f09b46741 (Btrfs: stop using GFP_ATOMIC for the tree mod log allocations) That change removed the atomicity of such operations. This patch restores the atomicity while still not doing the GFP_ATOMIC allocations of tree_mod_elem structures, so it has to do the allocations using GFP_NOFS before acquiring the mod log lock. This issue has been experienced by several users recently, such as for example: http://www.spinics.net/lists/linux-btrfs/msg28574.html After running the btrfs/004 test for 679 consecutive iterations with this patch applied, I didn't ran into the issue anymore. Cc: stable@vger.kernel.org Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 385 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 296 insertions(+), 89 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 59664f6cbc4..7d88d8543aa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -39,7 +39,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); struct btrfs_path *btrfs_alloc_path(void) @@ -474,6 +474,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * the index is the shifted logical of the *new* root node for root replace * operations, or the shifted logical of the affected block for all other * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock). */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -482,24 +484,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; - int ret = 0; BUG_ON(!tm); - tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); - /* - * Ok we no longer care about logging modifications, free up tm - * and return 0. Any callers shouldn't be using tm after - * calling tree_mod_log_insert, but if they do we can just - * change this to return a special error code to let the callers - * do their own thing. 
- */ - kfree(tm); - return 0; - } - spin_lock(&fs_info->tree_mod_seq_lock); tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); @@ -517,18 +504,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) new = &((*new)->rb_left); else if (cur->seq > tm->seq) new = &((*new)->rb_right); - else { - ret = -EEXIST; - kfree(tm); - goto out; - } + else + return -EEXIST; } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return 0; } /* @@ -544,19 +526,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 1; if (eb && btrfs_header_level(eb) == 0) return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + return 1; + } + return 0; } -static inline int -__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 0; + if (eb && btrfs_header_level(eb) == 0) + return 0; + + return 1; +} + +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { struct tree_mod_elem *tm; tm = kzalloc(sizeof(*tm), flags); if (!tm) - return -ENOMEM; + return NULL; tm->index = eb->start >> PAGE_CACHE_SHIFT; if (op != MOD_LOG_KEY_ADD) { @@ -566,8 +567,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); - return __tree_mod_log_insert(fs_info, tm); + return tm; } static noinline int @@ -575,10 +577,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int slot, enum mod_log_op op, gfp_t flags) { - if (tree_mod_dont_log(fs_info, eb)) + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(fs_info, eb)) + return 0; + + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(fs_info, eb)) { + kfree(tm); return 0; + } + + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + kfree(tm); - return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + return ret; } static noinline int @@ -586,53 +605,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items, gfp_t flags) { - struct tree_mod_elem *tm; - int ret; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, eb)) + if (!tree_mod_need_log(fs_info, eb)) return 0; + tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if 
(tree_mod_dont_log(fs_info, eb)) + goto free_tms; + locked = 1; + /* * When we override something during the move, we log these removals. * This can only happen when we move towards the beginning of the * buffer, i.e. dst_slot < src_slot. */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) + goto free_tms; } - tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + ret = __tree_mod_log_insert(fs_info, tm); + if (ret) + goto free_tms; + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); - tm->index = eb->start >> PAGE_CACHE_SHIFT; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; + return 0; +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + kfree(tm); - return __tree_mod_log_insert(fs_info, tm); + return ret; } -static inline void -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { - int i; - u32 nritems; + int i, j; int ret; - if (btrfs_header_level(eb) == 0) - return; - - nritems = btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { - ret = __tree_mod_log_insert_key(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } } + + return 0; } static noinline int @@ -641,17 +702,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *new_root, gfp_t flags, int log_removal) { - struct tree_mod_elem *tm; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; - if (tree_mod_dont_log(fs_info, NULL)) + if (!tree_mod_need_log(fs_info, NULL)) return 0; - if (log_removal) - __tree_mod_log_free_eb(fs_info, old_root); + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + flags); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -659,7 +741,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - return __tree_mod_log_insert(fs_info, tm); + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = __tree_mod_log_insert(fs_info, tm); + + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + 
kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; } static struct tree_mod_elem * @@ -728,31 +833,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static noinline void +static noinline int tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) { - int ret; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, NULL)) - return; + if (!tree_mod_need_log(fs_info, NULL)) + return 0; if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) - return; + return 0; + tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE, GFP_NOFS); - BUG_ON(ret < 0); - ret = __tree_mod_log_insert_key(fs_info, dst, - i + dst_offset, - MOD_LOG_KEY_ADD, - GFP_NOFS); - BUG_ON(ret < 0); + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = 1; + + for (i = 0; i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return ret; } static inline void @@ -777,12 +926,52 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, BUG_ON(ret < 0); } -static noinline void +static noinline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (btrfs_header_level(eb) == 0) + return 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + if (tree_mod_dont_log(fs_info, eb)) - return; - __tree_mod_log_free_eb(fs_info, eb); + goto free_tms; + + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; } static noinline void @@ -1040,8 +1229,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - if 
(last_ref) - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -3064,8 +3258,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3135,8 +3333,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, (dst_nritems) * sizeof(struct btrfs_key_ptr)); - tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3337,7 +3539,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, + mid, c_nritems - mid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), -- cgit v1.2.3-70-g09d2 From efe120a067c8674a8ae21b194f0e68f098b61ee2 Mon Sep 17 00:00:00 2001 From: Frank Holton Date: Fri, 20 Dec 2013 11:37:06 -0500 Subject: Btrfs: convert printk to btrfs_ and fix BTRFS prefix Convert all applicable cases of printk and pr_* to the btrfs_* macros. Fix all uses of the BTRFS prefix. 
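To illustrate the shape of the conversion, here is a hedged before/after example with a made-up message and placeholder variables (fs_info, group_start), mirroring the hunks that follow; as of this series the btrfs_* helpers take an fs_info, prepend the filesystem identification instead of a hand-written "btrfs: " prefix, and supply the trailing newline themselves:

	/* Before: open-coded prefix and newline, no filesystem identification. */
	printk(KERN_INFO "btrfs: relocating block group %llu\n", group_start);

	/* After (sketch): fs_info-aware helper; prefix and newline come from the macro. */
	btrfs_info(fs_info, "relocating block group %llu", group_start);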
Signed-off-by: Frank Holton Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 12 ++--- fs/btrfs/ctree.c | 16 +++---- fs/btrfs/ctree.h | 4 +- fs/btrfs/delayed-inode.c | 8 ++-- fs/btrfs/dev-replace.c | 56 +++++++++++++---------- fs/btrfs/dir-item.c | 6 +-- fs/btrfs/disk-io.c | 106 ++++++++++++++++++++++--------------------- fs/btrfs/extent-tree.c | 14 +++--- fs/btrfs/extent_io.c | 61 +++++++++++++++---------- fs/btrfs/file-item.c | 4 +- fs/btrfs/free-space-cache.c | 21 +++++---- fs/btrfs/inode.c | 4 +- fs/btrfs/ioctl.c | 30 ++++++------ fs/btrfs/lzo.c | 6 +-- fs/btrfs/ordered-data.c | 12 +++-- fs/btrfs/print-tree.c | 2 +- fs/btrfs/qgroup.c | 28 ++++++------ fs/btrfs/reada.c | 9 ++-- fs/btrfs/relocation.c | 4 +- fs/btrfs/root-tree.c | 4 +- fs/btrfs/scrub.c | 50 ++++++++++---------- fs/btrfs/send.c | 4 +- fs/btrfs/super.c | 104 ++++++++++++++++++++---------------------- fs/btrfs/sysfs.c | 2 +- fs/btrfs/tests/btrfs-tests.h | 2 +- fs/btrfs/transaction.c | 8 ++-- fs/btrfs/uuid-tree.c | 13 +++--- fs/btrfs/volumes.c | 89 ++++++++++++++++++------------------ fs/btrfs/zlib.c | 8 ++-- 29 files changed, 358 insertions(+), 329 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1499b27b418..af815eb8f97 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -128,11 +128,10 @@ static int check_compressed_csum(struct inode *inode, kunmap_atomic(kaddr); if (csum != *cb_sum) { - printk(KERN_INFO "btrfs csum failed ino %llu " - "extent %llu csum %u " - "wanted %u mirror %d\n", - btrfs_ino(inode), disk_start, csum, *cb_sum, - cb->mirror_num); + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu extent %llu csum %u wanted %u mirror %d", + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } @@ -412,7 +411,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); } if (bytes_left < PAGE_CACHE_SIZE) { - printk("bytes left %lu compress len %lu nr %lu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_CACHE_SIZE; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7d88d8543aa..062438d3898 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1480,8 +1480,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) old = read_tree_block(root, logical, blocksize, 0); if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); - pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", - logical); + btrfs_warn(root->fs_info, + "failed to read tree block %llu from get_old_root", logical); } else { eb = btrfs_clone_extent_buffer(old); free_extent_buffer(old); @@ -3611,8 +3611,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, int ret; ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); if (ret < 0) { - printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " - "used %d nritems %d\n", + btrfs_crit(root->fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), leaf_space_used(leaf, 0, nritems), nritems); } @@ -4702,7 +4702,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(slot < 0); if (slot >= nritems) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d too large, nritems %d\n", + btrfs_crit(root->fs_info, "slot %d too large, nritems %d", 
slot, nritems); BUG_ON(1); } @@ -4765,7 +4765,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (btrfs_leaf_free_space(root, leaf) < total_size) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "not enough freespace need %u have %d\n", + btrfs_crit(root->fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(root, leaf)); BUG(); } @@ -4775,7 +4775,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); BUG_ON(1); } @@ -5510,7 +5510,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, if (!left_start_ctransid || !right_start_ctransid) { WARN(1, KERN_WARNING - "btrfs: btrfs_compare_tree detected " + "BTRFS: btrfs_compare_tree detected " "a change in one of the trees while " "iterating. This is probably a " "bug.\n"); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9318c7520f5..5be778e62a2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3838,7 +3838,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) static inline void assfail(char *expr, char *file, int line) { - printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d", + pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", expr, file, line); BUG(); } @@ -3876,7 +3876,7 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); - printk(KERN_INFO "btrfs: setting %llu feature flag\n", + btrfs_info(fs_info, "setting %llu feature flag", flag); } spin_unlock(&fs_info->super_lock); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 8d292fbae65..58419493861 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1472,9 +1472,9 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(name: %.*s) " + btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) " "into the insertion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", + "(root id: %llu, inode id: %llu, errno: %d)", name_len, name, delayed_node->root->objectid, delayed_node->inode_id, ret); BUG(); @@ -1544,9 +1544,9 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(index: %llu) " + btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) " "into the deletion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", + "(root id: %llu, inode id: %llu, errno: %d)", index, node->root->objectid, node->inode_id, ret); BUG(); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2cfc3dfff64..564c92638b2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -102,7 +102,8 @@ no_valid_dev_replace_entry_found: ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); if (item_size != sizeof(struct btrfs_dev_replace_item)) { - pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + btrfs_warn(fs_info, + "dev_replace 
entry found has unexpected size, ignore entry"); goto no_valid_dev_replace_entry_found; } @@ -145,13 +146,19 @@ no_valid_dev_replace_entry_found: if (!dev_replace->srcdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", - src_devid); + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + src_devid); } if (!dev_replace->tgtdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", BTRFS_DEV_REPLACE_DEVID); } if (dev_replace->tgtdev) { @@ -210,7 +217,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, } ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for dev_replace item!\n", + btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); goto out; } @@ -230,7 +237,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); goto out; } @@ -243,7 +250,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - pr_warn("btrfs: insert dev_replace item failed %d!\n", + btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); goto out; } @@ -305,7 +312,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *src_device = NULL; if (btrfs_fs_incompat(fs_info, RAID56)) { - pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); + btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); return -EINVAL; } @@ -325,7 +332,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, &tgt_device); if (ret) { - pr_err("btrfs: target device %s is invalid!\n", + btrfs_err(fs_info, "target device %s is invalid!", args->start.tgtdev_name); mutex_unlock(&fs_info->volume_mutex); return -EINVAL; @@ -341,7 +348,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, } if (tgt_device->total_bytes < src_device->total_bytes) { - pr_err("btrfs: target device is smaller than source device!\n"); + btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; goto leave_no_lock; } @@ -366,7 +373,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->tgtdev = tgt_device; printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s started\n", + "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, @@ -489,7 +496,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, if (scrub_ret) { printk_in_rcu(KERN_ERR - "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", src_device->missing ? 
"" : rcu_str_deref(src_device->name), src_device->devid, @@ -504,7 +511,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, @@ -699,7 +706,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - pr_info("btrfs: suspending dev_replace for unmount\n"); + btrfs_info(fs_info, "suspending dev_replace for unmount"); break; } @@ -728,8 +735,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) break; } if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { - pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" - "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); + btrfs_info(fs_info, + "you may cancel the operation after 'mount -o degraded'"); btrfs_dev_replace_unlock(dev_replace); return 0; } @@ -755,14 +763,14 @@ static int btrfs_dev_replace_kthread(void *data) kfree(status_args); do_div(progress, 10); printk_in_rcu(KERN_INFO - "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", - dev_replace->srcdev->missing ? "" : - rcu_str_deref(dev_replace->srcdev->name), - dev_replace->srcdev->devid, - dev_replace->tgtdev ? - rcu_str_deref(dev_replace->tgtdev->name) : - "", - (unsigned int)progress); + "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? "" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? 
+ rcu_str_deref(dev_replace->tgtdev->name) : + "", + (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 9a89ceb25e8..a0691df5dce 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -459,7 +459,7 @@ int verify_dir_item(struct btrfs_root *root, u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { - printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + btrfs_crit(root->fs_info, "invalid dir item type: %d", (int)type); return 1; } @@ -468,7 +468,7 @@ int verify_dir_item(struct btrfs_root *root, namelen = XATTR_NAME_MAX; if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", + btrfs_crit(root->fs_info, "invalid dir item name len: %u", (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; } @@ -476,7 +476,7 @@ int verify_dir_item(struct btrfs_root *root, /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ if ((btrfs_dir_data_len(leaf, dir_item) + btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { - printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n", + btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u", (unsigned)btrfs_dir_name_len(leaf, dir_item), (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4a1871cf797..0400a26fc8c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -300,11 +300,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " - "failed on %llu wanted %X found %X " - "level %d\n", - root->fs_info->sb->s_id, buf->start, - val, found, btrfs_header_level(buf)); + printk_ratelimited(KERN_INFO + "BTRFS: %s checksum verify failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, buf->start, + val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); return 1; @@ -383,13 +383,14 @@ static int btrfs_check_super_csum(char *raw_disk_sb) ret = 1; if (ret && btrfs_super_generation(disk_sb) < 10) { - printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n"); + printk(KERN_WARNING + "BTRFS: super block crcs don't match, older mkfs detected\n"); ret = 0; } } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n", + printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n", csum_type); ret = 1; } @@ -498,8 +499,8 @@ static int check_tree_block_fsid(struct btrfs_root *root, } #define CORRUPT(reason, eb, root, slot) \ - printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ - "root=%llu, slot=%d\n", reason, \ + btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d", reason, \ btrfs_header_bytenr(eb), root->objectid, slot) static noinline int check_leaf(struct btrfs_root *root, @@ -596,21 +597,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "btrfs bad tree block start " + printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " "%llu %llu\n", found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "btrfs bad fsid on 
block %llu\n", + printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_info(root->fs_info, "bad tree block level %d\n", + btrfs_info(root->fs_info, "bad tree block level %d", (int)btrfs_header_level(eb)); ret = -EIO; goto err; @@ -1004,8 +1005,9 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); if (PagePrivate(page)) { - printk(KERN_WARNING "btrfs warning page private not zero " - "on page %llu\n", (unsigned long long)page_offset(page)); + btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, + "page private not zero on page %llu", + (unsigned long long)page_offset(page)); ClearPagePrivate(page); set_page_private(page, 0); page_cache_release(page); @@ -2322,7 +2324,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "btrfs: superblock checksum mismatch\n"); + printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); err = -EINVAL; goto fail_alloc; } @@ -2341,7 +2343,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); err = -EINVAL; goto fail_alloc; } @@ -2406,7 +2408,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_ERR "btrfs: has skinny extents\n"); + printk(KERN_ERR "BTRFS: has skinny extents\n"); /* * flag our filesystem as having big metadata blocks if @@ -2414,7 +2416,7 @@ int open_ctree(struct super_block *sb, */ if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); + printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } @@ -2431,7 +2433,7 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != leafsize)) { - printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " + printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); goto fail_alloc; @@ -2568,12 +2570,12 @@ int open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(sectorsize); if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) " + printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2582,7 +2584,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "btrfs: failed to read the system " + printk(KERN_WARNING "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2599,7 +2601,7 @@ int open_ctree(struct super_block *sb, blocksize, generation); if (!chunk_root->node || 
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2611,7 +2613,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2623,7 +2625,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2638,7 +2640,7 @@ retry_root_backup: blocksize, generation); if (!tree_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); goto recovery_tree_root; @@ -2709,20 +2711,20 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_WARNING "btrfs: failed to recover balance\n"); + printk(KERN_WARNING "BTRFS: failed to recover balance\n"); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", + printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("btrfs: failed to init dev_replace: %d\n", ret); + pr_err("BTRFS: failed to init dev_replace: %d\n", ret); goto fail_block_groups; } @@ -2730,19 +2732,19 @@ retry_root_backup: ret = btrfs_sysfs_add_one(fs_info); if (ret) { - pr_err("btrfs: failed to init sysfs interface: %d\n", ret); + pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_block_groups; } ret = btrfs_init_space_info(fs_info); if (ret) { - printk(KERN_ERR "Failed to initial space info: %d\n", ret); + printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); goto fail_block_groups; } ret = btrfs_read_block_groups(extent_root); if (ret) { - printk(KERN_ERR "Failed to read block groups: %d\n", ret); + printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); goto fail_block_groups; } fs_info->num_tolerated_disk_barrier_failures = @@ -2750,8 +2752,8 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable mount is not allowed\n"); + printk(KERN_WARNING "BTRFS: " + "too many missing devices, writeable mount is not allowed\n"); goto fail_block_groups; } @@ -2769,7 +2771,7 @@ retry_root_backup: if (!btrfs_test_opt(tree_root, SSD) && !btrfs_test_opt(tree_root, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " "mode\n"); btrfs_set_opt(fs_info->mount_opt, SSD); } @@ -2782,7 +2784,7 @@ retry_root_backup: 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "btrfs: failed to initialize" + printk(KERN_WARNING "BTRFS: failed to initialize" " integrity check module %s\n", sb->s_id); } #endif @@ -2795,7 +2797,7 @@ retry_root_backup: u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - 
printk(KERN_WARNING "Btrfs log replay required " + printk(KERN_WARNING "BTRFS: log replay required " "on RO media\n"); err = -EIO; goto fail_qgroup; @@ -2818,7 +2820,7 @@ retry_root_backup: generation + 1); if (!log_tree_root->node || !extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "btrfs: failed to read log tree\n"); + printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); goto fail_trans_kthread; @@ -2852,7 +2854,7 @@ retry_root_backup: ret = btrfs_recover_relocation(tree_root); if (ret < 0) { printk(KERN_WARNING - "btrfs: failed to recover relocation\n"); + "BTRFS: failed to recover relocation\n"); err = -EINVAL; goto fail_qgroup; } @@ -2882,14 +2884,14 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "btrfs: failed to resume balance\n"); + printk(KERN_WARNING "BTRFS: failed to resume balance\n"); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); + pr_warn("BTRFS: failed to resume dev_replace\n"); close_ctree(tree_root); return ret; } @@ -2897,20 +2899,20 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); if (create_uuid_tree) { - pr_info("btrfs: creating UUID tree\n"); + pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to create the UUID tree %d\n", + pr_warn("BTRFS: failed to create the UUID tree %d\n", ret); close_ctree(tree_root); return ret; } } else if (check_uuid_tree || btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { - pr_info("btrfs: checking UUID tree\n"); + pr_info("BTRFS: checking UUID tree\n"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to check the UUID tree %d\n", + pr_warn("BTRFS: failed to check the UUID tree %d\n", ret); close_ctree(tree_root); return ret; @@ -2991,7 +2993,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to " + printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " "I/O error on %s\n", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have @@ -3110,7 +3112,7 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "btrfs: couldn't get super " + printk(KERN_ERR "BTRFS: couldn't get super " "buffer head for bytenr %Lu\n", bytenr); errors++; continue; @@ -3177,7 +3179,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("btrfs: disabling barriers on dev %s\n", + printk_in_rcu("BTRFS: disabling barriers on dev %s\n", rcu_str_deref(device->name)); device->nobarriers = 1; } else if (!bio_flagged(bio, BIO_UPTODATE)) { @@ -3398,7 +3400,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) total_errors++; } if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", + btrfs_err(root->fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -3554,7 +3556,7 @@ int close_ctree(struct btrfs_root *root) if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) - 
printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + btrfs_err(root->fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) @@ -3571,7 +3573,7 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { - printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + btrfs_info(root->fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } @@ -3798,7 +3800,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { spin_unlock(&delayed_refs->lock); - printk(KERN_INFO "delayed_refs has NO entry\n"); + btrfs_info(root->fs_info, "delayed_refs has NO entry"); return ret; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9e524b0a7b3..1c82bead2c0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6637,12 +6637,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, @@ -6656,7 +6656,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", cache->key.objectid, cache->key.offset, btrfs_block_group_used(&cache->item), cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); @@ -7019,7 +7021,7 @@ again: /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); + "BTRFS: block rsv returned %d\n", ret); } try_reserve: ret = reserve_metadata_bytes(root, block_rsv, blocksize, @@ -7767,7 +7769,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { - pr_debug("btrfs: drop snapshot early exit\n"); + pr_debug("BTRFS: drop snapshot early exit\n"); err = -EAGAIN; goto out_free; } @@ -8427,7 +8429,7 @@ static void __link_block_group(struct btrfs_space_info *space_info, ret = kobject_add(kobj, &space_info->kobj, "%s", get_raid_name(index)); if (ret) { - pr_warn("btrfs: failed to add kobject for block cache. ignoring.\n"); + pr_warn("BTRFS: failed to add kobject for block cache. 
ignoring.\n"); kobject_put(&space_info->kobj); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2fa23b57d8d..fbe501d3bd0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -59,7 +59,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " + printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " "state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs)); @@ -69,7 +69,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu " "refs %d\n", eb->start, eb->len, atomic_read(&eb->refs)); list_del(&eb->leak_list); @@ -92,7 +92,7 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { printk_ratelimited(KERN_DEBUG - "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", caller, btrfs_ino(inode), isize, start, end); } } @@ -423,7 +423,7 @@ static int insert_state(struct extent_io_tree *tree, struct rb_node *node; if (end < start) - WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", + WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", end, start); state->start = start; state->end = end; @@ -434,7 +434,7 @@ static int insert_state(struct extent_io_tree *tree, if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - printk(KERN_ERR "btrfs found node %llu %llu on insert of " + printk(KERN_ERR "BTRFS: found node %llu %llu on insert of " "%llu %llu\n", found->start, found->end, start, end); return -EEXIST; @@ -2054,9 +2054,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " - "(dev %s sector %llu)\n", page->mapping->host->i_ino, - start, rcu_str_deref(dev->name), sector); + printk_ratelimited_in_rcu(KERN_INFO + "BTRFS: read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); bio_put(bio); return 0; @@ -2386,11 +2387,17 @@ static void end_bio_extent_writepage(struct bio *bio, int err) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) - printk("%s page write in btrfs with offset %u and length %u\n", - bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE - ? 
KERN_ERR "partial" : KERN_INFO "incomplete", - bvec->bv_offset, bvec->bv_len); + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page write in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; @@ -2463,11 +2470,17 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) - printk("%s page read in btrfs with offset %u and length %u\n", - bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE - ? KERN_ERR "partial" : KERN_INFO "incomplete", - bvec->bv_offset, bvec->bv_len); + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page read in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; @@ -3327,8 +3340,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - printk(KERN_ERR "btrfs warning page %lu not " - "writeback, cur %llu end %llu\n", + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } @@ -5149,12 +5162,12 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu dst len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu dst len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } @@ -5196,12 +5209,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6f384886028..9d846588f72 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -246,8 +246,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, offset + bvec->bv_len - 1, EXTENT_NODATASUM, GFP_NOFS); } else { - printk(KERN_INFO "btrfs no csum found " - "for inode %llu start %llu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "no csum found 
for inode %llu start %llu", btrfs_ino(inode), offset); } item = NULL; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 332aa33d02f..73f3de7a083 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -347,8 +347,8 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, btrfs_readpage(NULL, page); lock_page(page); if (!PageUptodate(page)) { - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); + btrfs_err(BTRFS_I(inode)->root->fs_info, + "error reading free space cache"); io_ctl_drop_pages(io_ctl); return -EIO; } @@ -405,7 +405,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "btrfs: space cache generation " + printk_ratelimited(KERN_ERR "BTRFS: space cache generation " "(%Lu) does not match inode (%Lu)\n", *gen, generation); io_ctl_unmap_page(io_ctl); @@ -463,7 +463,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { - printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " "space cache\n"); io_ctl_unmap_page(io_ctl); return -EIO; @@ -1902,7 +1902,7 @@ out: spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); + printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); ASSERT(ret != -EEXIST); } @@ -2011,14 +2011,15 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) count++; - printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", - info->offset, info->bytes, + btrfs_crit(block_group->fs_info, + "entry offset %llu, bytes %llu, bitmap %s", + info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } - printk(KERN_INFO "block group has cluster?: %s\n", + btrfs_info(block_group->fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? 
"no" : "yes"); - printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" - "\n", count); + btrfs_info(block_group->fs_info, + "%d blocks of free space at or bigger than bytes is", count); } void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2ccf8e6b1e1..06bcf5b53cb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6966,8 +6966,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; if (err) { - printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " - "sector %#Lx len %u err no %d\n", + btrfs_err(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", btrfs_ino(dip->inode), bio->bi_rw, (unsigned long long)bio->bi_sector, bio->bi_size, err); dip->errors = 1; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c0dc05467ce..edf5f0093f2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1262,7 +1262,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); + printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); ret = -EAGAIN; break; } @@ -1424,20 +1424,20 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - printk(KERN_INFO "btrfs: resizing devid %llu\n", devid); + btrfs_info(root->fs_info, "resizing devid %llu", devid); } device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", + btrfs_info(root->fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; goto out_free; } if (!device->writeable) { - printk(KERN_INFO "btrfs: resizer unable to apply on " - "readonly device %llu\n", + btrfs_info(root->fs_info, + "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; goto out_free; @@ -1489,7 +1489,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", + printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", rcu_str_deref(device->name), new_size); if (new_size > old_size) { @@ -1550,8 +1550,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - printk(KERN_INFO "btrfs: Snapshot src from " - "another FS\n"); + btrfs_info(BTRFS_I(src_inode)->root->fs_info, + "Snapshot src from another FS"); ret = -EINVAL; } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, @@ -1934,7 +1934,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", + printk(KERN_ERR "BTRFS: could not find root %llu\n", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -2024,7 +2024,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", tree_id); + printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); ret = -ENOENT; goto out; } @@ -3367,8 +3367,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) if (IS_ERR_OR_NULL(di)) { btrfs_free_path(path); btrfs_end_transaction(trans, 
root); - printk(KERN_ERR "Umm, you don't have the default dir item, " - "this isn't going to work\n"); + btrfs_err(new_root->fs_info, "Umm, you don't have the default dir" + "item, this isn't going to work"); ret = -ENOENT; goto out; } @@ -4469,8 +4469,8 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { - pr_warn("btrfs: label is too long, return the first %zu bytes\n", - --len); + btrfs_warn(root->fs_info, + "label is too long, return the first %zu bytes", --len); } ret = copy_to_user(arg, label, len); @@ -4493,7 +4493,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) return -EFAULT; if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { - pr_err("btrfs: unable to set label with more than %d bytes\n", + btrfs_err(root->fs_info, "unable to set label with more than %d bytes", BTRFS_LABEL_SIZE - 1); return -EINVAL; } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b6a6f07c5ce..b47f669aca7 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -141,7 +141,7 @@ static int lzo_compress_pages(struct list_head *ws, ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); if (ret != LZO_E_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); ret = -1; goto out; @@ -357,7 +357,7 @@ cont: if (need_unmap) kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed\n"); + printk(KERN_WARNING "BTRFS: decompress failed\n"); ret = -1; break; } @@ -401,7 +401,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, out_len = PAGE_CACHE_SIZE; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed!\n"); + printk(KERN_WARNING "BTRFS: decompress failed!\n"); ret = -1; goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b8c2ded75fe..b16450b840e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -336,13 +336,14 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, entry->len); *file_offset = dec_end; if (dec_start > dec_end) { - printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", - dec_start, dec_end); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordering dec_start %llu end %llu", dec_start, dec_end); } to_dec = dec_end - dec_start; if (to_dec > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - entry->bytes_left, to_dec); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, to_dec); } entry->bytes_left -= to_dec; if (!uptodate) @@ -401,7 +402,8 @@ have_entry: } if (io_size > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } entry->bytes_left -= io_size; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 417053b1718..4eed002b7cf 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -154,7 +154,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + pr_warn("BTRFS: uuid item with illegal size %lu!\n", (unsigned long)item_size); return; } diff 
--git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index bd0b058f2a2..d22e0a14dde 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -301,16 +301,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if (btrfs_qgroup_status_version(l, ptr) != BTRFS_QGROUP_STATUS_VERSION) { - printk(KERN_ERR - "btrfs: old qgroup version, quota disabled\n"); + btrfs_err(fs_info, + "old qgroup version, quota disabled"); goto out; } if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - printk(KERN_ERR - "btrfs: qgroup generation mismatch, " - "marked as inconsistent\n"); + btrfs_err(fs_info, + "qgroup generation mismatch, " + "marked as inconsistent"); } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); @@ -325,7 +325,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - printk(KERN_ERR "btrfs: inconsitent qgroup config\n"); + btrfs_err(fs_info, "inconsitent qgroup config"); flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } if (!qgroup) { @@ -396,8 +396,8 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); if (ret == -ENOENT) { - printk(KERN_WARNING - "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + btrfs_warn(fs_info, + "orphan qgroup relation 0x%llx->0x%llx", found_key.objectid, found_key.offset); ret = 0; /* ignore the error */ } @@ -1159,7 +1159,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, limit->rsv_excl); if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - printk(KERN_INFO "unable to update quota limit for %llu\n", + btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } @@ -1833,7 +1833,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n", + btrfs_err(trans->root->fs_info, + "qgroups not uptodate in trans handle %p: list is%s empty, " + "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", (u32)(trans->delayed_ref_elem.seq >> 32), (u32)trans->delayed_ref_elem.seq); @@ -2030,10 +2032,10 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); if (err >= 0) { - pr_info("btrfs: qgroup scan completed%s\n", + btrfs_info(fs_info, "qgroup scan completed%s", err == 2 ? 
" (inconsistency flag cleared)" : ""); } else { - pr_err("btrfs: qgroup scan failed with %d\n", err); + btrfs_err(fs_info, "qgroup scan failed with %d", err); } complete_all(&fs_info->qgroup_rescan_completion); @@ -2089,7 +2091,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (ret) { err: - pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); + btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret); return ret; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 1031b69252c..31c797c48c3 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -189,8 +189,8 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, */ #ifdef DEBUG if (rec->generation != generation) { - printk(KERN_DEBUG "generation mismatch for " - "(%llu,%d,%llu) %llu != %llu\n", + btrfs_debug(root->fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", key.objectid, key.type, key.offset, rec->generation, generation); } @@ -365,8 +365,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { - printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", BTRFS_MAX_MIRRORS); + btrfs_err(root->fs_info, + "readahead: more than %d copies not supported", + BTRFS_MAX_MIRRORS); goto error; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d8a82b84998..8cf99c496ff 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4245,7 +4245,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) goto out; } - printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); ret = btrfs_start_delalloc_roots(fs_info, 0); @@ -4267,7 +4267,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) if (rc->extents_found == 0) break; - printk(KERN_INFO "btrfs: found %llu extents\n", + btrfs_info(extent_root->fs_info, "found %llu extents", rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index fcc10ebb71a..1389b69059d 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -44,7 +44,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if (btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "btrfs: mismatching " + printk(KERN_WARNING "BTRFS: mismatching " "generation and generation_v2 " "found in root item. 
This root " "was probably mounted with an " @@ -154,7 +154,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu", key->objectid, key->type, key->offset); BUG_ON(1); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index adebe12e497..7806e2c47f8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -505,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " "length %llu, links %u (path: %s)\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -517,7 +517,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " "resolving failed with ret=%d\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -580,7 +580,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); printk_in_rcu(KERN_WARNING - "btrfs: %s at logical %llu on dev %s, " + "BTRFS: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, rcu_str_deref(dev->name), @@ -782,8 +782,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "unable to fixup (nodatasum) error at logical %llu on dev %s\n", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -1184,7 +1184,7 @@ corrected_error: sctx->stat.corrected_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: fixed up error at logical %llu on dev %s\n", + "BTRFS: fixed up error at logical %llu on dev %s\n", logical, rcu_str_deref(dev->name)); } } else { @@ -1193,7 +1193,7 @@ did_not_correct_error: sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", + "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", logical, rcu_str_deref(dev->name)); } @@ -1441,8 +1441,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING - "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); + printk_ratelimited(KERN_WARNING "BTRFS: " + "scrub_repair_page_from_good_copy(bdev == NULL) " + "is unexpected!\n"); return -EIO; } @@ -1900,7 +1901,7 @@ static void scrub_submit(struct scrub_ctx *sctx) * This case is handled correctly (but _very_ slowly). 
*/ printk_ratelimited(KERN_WARNING - "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); + "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); bio_endio(sbio->bio, -EIO); } else { btrfsic_submit_bio(READ, sbio->bio); @@ -2440,9 +2441,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (key.objectid < logical && (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - printk(KERN_ERR - "btrfs scrub: tree block %llu spanning " - "stripes, ignored. logical=%llu\n", + btrfs_err(fs_info, + "scrub: tree block %llu spanning " + "stripes, ignored. logical=%llu", key.objectid, logical); goto next; } @@ -2812,8 +2813,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * check some assumptions */ if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize == leafsize (%d == %d) fails", fs_info->chunk_root->nodesize, fs_info->chunk_root->leafsize); return -EINVAL; @@ -2825,16 +2826,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * the way scrub is implemented. Do not handle this * situation at all because it won't ever happen. */ - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); return -EINVAL; } if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n", + btrfs_err(fs_info, + "scrub: size assumption sectorsize != PAGE_SIZE " + "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); return -EINVAL; } @@ -2847,7 +2849,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * would exhaust the array bounds of pagev member in * struct scrub_block */ - pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", + btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize " + "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", fs_info->chunk_root->nodesize, SCRUB_MAX_PAGES_PER_BLOCK, fs_info->chunk_root->sectorsize, @@ -3163,7 +3166,8 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) ret = iterate_inodes_from_logical(logical, fs_info, path, record_inode_for_nocow, nocow_ctx); if (ret != 0 && ret != -ENOENT) { - pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n", + btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, " + "phys %llu, len %llu, mir %u, ret %d", logical, physical_for_dev_replace, len, mirror_num, ret); not_written = 1; @@ -3285,7 +3289,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { - pr_err("find_or_create_page() failed\n"); + btrfs_err(fs_info, "find_or_create_page() failed"); ret = -ENOMEM; goto out; } @@ -3357,7 +3361,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, return -EIO; if (!dev->bdev) { printk_ratelimited(KERN_WARNING - "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); diff --git 
a/fs/btrfs/send.c b/fs/btrfs/send.c index 8877adc4539..bff0b1ac3be 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1234,7 +1234,7 @@ static int find_extent_clone(struct send_ctx *sctx, if (!backref_ctx->found_itself) { /* found a bug in backref code? */ ret = -EIO; - printk(KERN_ERR "btrfs: ERROR did not find backref in " + btrfs_err(sctx->send_root->fs_info, "did not find backref in " "send_root. inode=%llu, offset=%llu, " "disk_byte=%llu found extent=%llu\n", ino, data_offset, disk_byte, found_key.objectid); @@ -4648,7 +4648,7 @@ static int full_send_tree(struct send_ctx *sctx) spin_unlock(&send_root->root_item_lock); if (ctransid != start_ctransid) { - WARN(1, KERN_WARNING "btrfs: the root that you're trying to " + WARN(1, KERN_WARNING "BTRFS: the root that you're trying to " "send was modified in between. This is " "probably a bug.\n"); ret = -EIO; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d71a11d13df..15b6a1d4c53 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -152,11 +152,12 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", + printk(KERN_CRIT + "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", sb->s_id, function, line, errno, errstr, &vaf); va_end(args); } else { - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", + printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } @@ -250,7 +251,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, */ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", + WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", errno); } trans->aborted = errno; @@ -294,8 +295,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, errno, errstr); - printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); va_end(args); /* Caller calls BUG() */ } @@ -409,7 +410,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) token = match_token(p, tokens, args); switch (token) { case Opt_degraded: - printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_info(root->fs_info, "allowing degraded mounts"); btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: @@ -422,15 +423,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); + btrfs_info(root->fs_info, "setting nodatasum"); btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: if (!btrfs_test_opt(root, COMPRESS) || !btrfs_test_opt(root, FORCE_COMPRESS)) { - printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); + btrfs_info(root->fs_info, + "setting nodatacow, compression disabled"); } else { - printk(KERN_INFO "btrfs: setting nodatacow\n"); + btrfs_info(root->fs_info, "setting nodatacow"); } btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); @@ -470,7 +472,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) if (compress_force) { btrfs_set_opt(info->mount_opt, 
FORCE_COMPRESS); - pr_info("btrfs: force %s compression\n", + btrfs_info(root->fs_info, "force %s compression", compress_type); } else if (btrfs_test_opt(root, COMPRESS)) { pr_info("btrfs: use %s compression\n", @@ -478,24 +480,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } break; case Opt_ssd: - printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); + btrfs_info(root->fs_info, "use ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); break; case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); + btrfs_info(root->fs_info, "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); btrfs_set_opt(info->mount_opt, SSD_SPREAD); break; case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); + btrfs_info(root->fs_info, "not using ssd allocation scheme"); btrfs_set_opt(info->mount_opt, NOSSD); btrfs_clear_opt(info->mount_opt, SSD); btrfs_clear_opt(info->mount_opt, SSD_SPREAD); break; case Opt_nobarrier: - printk(KERN_INFO "btrfs: turning off barriers\n"); + btrfs_info(root->fs_info, "turning off barriers"); btrfs_set_opt(info->mount_opt, NOBARRIER); break; case Opt_thread_pool: @@ -520,7 +520,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_inline, root->sectorsize); } - printk(KERN_INFO "btrfs: max_inline at %llu\n", + btrfs_info(root->fs_info, "max_inline at %llu", info->max_inline); } else { ret = -ENOMEM; @@ -534,8 +534,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->alloc_start = memparse(num, NULL); mutex_unlock(&info->chunk_mutex); kfree(num); - printk(KERN_INFO - "btrfs: allocations start at %llu\n", + btrfs_info(root->fs_info, "allocations start at %llu", info->alloc_start); } else { ret = -ENOMEM; @@ -546,11 +545,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_info(root->fs_info, "disabling tree log"); btrfs_set_opt(info->mount_opt, NOTREELOG); break; case Opt_flushoncommit: - printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_info(root->fs_info, "turning on flush-on-commit"); btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; case Opt_ratio: @@ -559,7 +558,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } else if (intarg >= 0) { info->metadata_ratio = intarg; - printk(KERN_INFO "btrfs: metadata ratio %d\n", + btrfs_info(root->fs_info, "metadata ratio %d", info->metadata_ratio); } else { ret = -EINVAL; @@ -576,15 +575,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - printk(KERN_INFO "btrfs: disabling disk space caching\n"); + btrfs_info(root->fs_info, "disabling disk space caching"); btrfs_clear_opt(info->mount_opt, SPACE_CACHE); break; case Opt_inode_cache: - printk(KERN_INFO "btrfs: enabling inode map caching\n"); + btrfs_info(root->fs_info, "enabling inode map caching"); btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); break; case Opt_clear_cache: - printk(KERN_INFO "btrfs: force clearing of disk cache\n"); + btrfs_info(root->fs_info, "force clearing of disk cache"); btrfs_set_opt(info->mount_opt, CLEAR_CACHE); break; case Opt_user_subvol_rm_allowed: @@ -594,11 +593,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - 
printk(KERN_INFO "btrfs: enabling auto defrag\n"); + btrfs_info(root->fs_info, "enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; case Opt_recovery: - printk(KERN_INFO "btrfs: enabling auto recovery\n"); + btrfs_info(root->fs_info, "enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; case Opt_skip_balance: @@ -606,14 +605,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) break; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY case Opt_check_integrity_including_extent_data: - printk(KERN_INFO "btrfs: enabling check integrity" - " including extent data\n"); + btrfs_info(root->fs_info, + "enabling check integrity including extent data"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: - printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_info(root->fs_info, "enabling check integrity"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity_print_mask: @@ -622,8 +621,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } else if (intarg >= 0) { info->check_integrity_print_mask = intarg; - printk(KERN_INFO "btrfs:" - " check_integrity_print_mask 0x%x\n", + btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); } else { ret = -EINVAL; @@ -634,8 +632,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_check_integrity_including_extent_data: case Opt_check_integrity: case Opt_check_integrity_print_mask: - printk(KERN_ERR "btrfs: support for check_integrity*" - " not compiled in!\n"); + btrfs_err(root->fs_info, + "support for check_integrity* not compiled in!"); ret = -EINVAL; goto out; #endif @@ -655,28 +653,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) intarg = 0; ret = match_int(&args[0], &intarg); if (ret < 0) { - printk(KERN_ERR - "btrfs: invalid commit interval\n"); + btrfs_err(root->fs_info, "invalid commit interval"); ret = -EINVAL; goto out; } if (intarg > 0) { if (intarg > 300) { - printk(KERN_WARNING - "btrfs: excessive commit interval %d\n", + btrfs_warn(root->fs_info, "excessive commit interval %d", intarg); } info->commit_interval = intarg; } else { - printk(KERN_INFO - "btrfs: using default commit interval %ds\n", + btrfs_info(root->fs_info, "using default commit interval %ds", BTRFS_DEFAULT_COMMIT_INTERVAL); info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; case Opt_err: - printk(KERN_INFO "btrfs: unrecognized mount option " - "'%s'\n", p); + btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; goto out; default: @@ -685,7 +679,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } out: if (!ret && btrfs_test_opt(root, SPACE_CACHE)) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); + btrfs_info(root->fs_info, "disk space caching is enabled"); kfree(orig); return ret; } @@ -748,7 +742,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, break; case Opt_subvolrootid: printk(KERN_WARNING - "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); + "BTRFS: 'subvolrootid' mount option is deprecated and has " + "no effect\n"); break; case Opt_device: device_name = match_strdup(&args[0]); @@ -877,7 +872,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_I_VERSION; err = open_ctree(sb, fs_devices, (char *)data); if (err) { - printk("btrfs: 
open_ctree failed\n"); + printk(KERN_ERR "BTRFS: open_ctree failed\n"); return err; } @@ -1115,7 +1110,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, dput(root); root = ERR_PTR(-EINVAL); deactivate_locked_super(s); - printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", subvol_name); } @@ -1240,7 +1235,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, fs_info->thread_pool_size = new_pool_size; - printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", + btrfs_info(fs_info, "resize thread pool %d -> %d", old_pool_size, new_pool_size); btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); @@ -1346,7 +1341,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } else { if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { btrfs_err(fs_info, - "Remounting read-write after error is not allowed\n"); + "Remounting read-write after error is not allowed"); ret = -EINVAL; goto restore; } @@ -1358,8 +1353,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(*flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable remount is not allowed\n"); + btrfs_warn(fs_info, + "too many missing devices, writeable remount is not allowed"); ret = -EACCES; goto restore; } @@ -1384,16 +1379,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume dev_replace"); goto restore; } if (!fs_info->uuid_root) { - pr_info("btrfs: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to create the uuid tree" - "%d\n", ret); + btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); goto restore; } } @@ -1773,7 +1767,7 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); + printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n"); } static void btrfs_print_info(void) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f25deb9c132..ba94b277e98 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -374,7 +374,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, int ret; if (len >= BTRFS_LABEL_SIZE) { - pr_err("btrfs: unable to set label with more than %d bytes\n", + pr_err("BTRFS: unable to set label with more than %d bytes\n", BTRFS_LABEL_SIZE - 1); return -EINVAL; } diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b353bc806ca..312560a9123 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -21,7 +21,7 @@ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) +#define test_msg(fmt, ...) 
pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 026f1fea963..46bfd820d91 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -198,10 +198,10 @@ loop: */ smp_mb(); if (!list_empty(&fs_info->tree_mod_seq_list)) - WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " + WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when " "creating a fresh transaction\n"); if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) - WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0); @@ -1107,7 +1107,7 @@ int btrfs_defrag_root(struct btrfs_root *root) break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); + pr_debug("BTRFS: defrag_root cancelled\n"); ret = -EAGAIN; break; } @@ -1981,7 +1981,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - pr_debug("btrfs: cleaner removing %llu\n", root->objectid); + pr_debug("BTRFS: cleaner removing %llu\n", root->objectid); btrfs_kill_all_delayed_nodes(root); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index fbda90004fe..f6a4c03ee7d 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -69,7 +69,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, ret = -ENOENT; if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto out; } @@ -137,7 +137,8 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, offset = btrfs_item_ptr_offset(eb, slot); offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); } else if (ret < 0) { - pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n", + btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d " + "(0x%016llx, 0x%016llx) type %u!", ret, (unsigned long long)key.objectid, (unsigned long long)key.offset, type); goto out; @@ -183,7 +184,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for uuid item!\n", + btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!", ret); goto out; } @@ -197,7 +198,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, offset = btrfs_item_ptr_offset(eb, slot); item_size = btrfs_item_size_nr(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); ret = -ENOENT; goto out; @@ -299,7 +300,7 @@ again_search_slot: offset = btrfs_item_ptr_offset(leaf, slot); item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto skip; } @@ -349,6 +350,6 @@ skip: out: btrfs_free_path(path); if (ret) - pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret); + btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 
92303f42baa..b68afe32419 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,7 +125,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev, ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) - pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); @@ -200,7 +200,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "btrfs: open %s failed\n", device_path); + printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } @@ -912,9 +912,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (disk_super->label[0]) { if (disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; - printk(KERN_INFO "btrfs: device label %s ", disk_super->label); + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); } else { - printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid); + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); } printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); @@ -1813,7 +1813,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } if (!*device) { - pr_err("btrfs: no missing device found\n"); + btrfs_err(root->fs_info, "no missing device found"); return -ENOENT; } @@ -3052,7 +3052,7 @@ loop: error: btrfs_free_path(path); if (enospc_errors) { - printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + btrfs_info(fs_info, "%d enospc errors during balance", enospc_errors); if (!ret) ret = -ENOSPC; @@ -3138,8 +3138,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (!(bctl->flags & BTRFS_BALANCE_DATA) || !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { - printk(KERN_ERR "btrfs: with mixed groups data and " - "metadata balance options must be the same\n"); + btrfs_err(fs_info, "with mixed groups data and " + "metadata balance options must be the same"); ret = -EINVAL; goto out; } @@ -3165,8 +3165,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || (bctl->data.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "data profile %llu\n", + btrfs_err(fs_info, "unable to start balance with target " + "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; @@ -3174,8 +3174,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->meta.target, 1) || (bctl->meta.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "metadata profile %llu\n", + btrfs_err(fs_info, + "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; @@ -3183,8 +3183,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->sys.target, 1) || (bctl->sys.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "system profile %llu\n", + btrfs_err(fs_info, + "unable to start balance with target system profile %llu", bctl->sys.target); ret = -EINVAL; goto out; @@ -3193,7 +3193,7 @@ int 
btrfs_balance(struct btrfs_balance_control *bctl, /* allow dup'ed data chunks only in mixed mode */ if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + btrfs_err(fs_info, "dup for data is not allowed"); ret = -EINVAL; goto out; } @@ -3213,11 +3213,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) { if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); + btrfs_info(fs_info, "force reducing metadata integrity"); } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); + btrfs_err(fs_info, "balance will reduce metadata " + "integrity, use force if you want this"); ret = -EINVAL; goto out; } @@ -3303,7 +3302,7 @@ static int balance_kthread(void *data) mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: continuing balance\n"); + btrfs_info(fs_info, "continuing balance"); ret = btrfs_balance(fs_info->balance_ctl, NULL); } @@ -3325,7 +3324,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { - printk(KERN_INFO "btrfs: force skipping balance\n"); + btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -3543,7 +3542,7 @@ update_tree: BTRFS_UUID_KEY_SUBVOL, key.objectid); if (ret < 0) { - pr_warn("btrfs: uuid_tree_add failed %d\n", + btrfs_warn(fs_info, "uuid_tree_add failed %d", ret); break; } @@ -3555,7 +3554,7 @@ update_tree: BTRFS_UUID_KEY_RECEIVED_SUBVOL, key.objectid); if (ret < 0) { - pr_warn("btrfs: uuid_tree_add failed %d\n", + btrfs_warn(fs_info, "uuid_tree_add failed %d", ret); break; } @@ -3590,7 +3589,7 @@ out: if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, fs_info->uuid_root); if (ret) - pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); else fs_info->update_uuid_tree_gen = 1; up(&fs_info->uuid_tree_rescan_sem); @@ -3654,7 +3653,7 @@ static int btrfs_uuid_rescan_kthread(void *data) */ ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); if (ret < 0) { - pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); + btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); up(&fs_info->uuid_tree_rescan_sem); return ret; } @@ -3695,7 +3694,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - pr_warn("btrfs: failed to start uuid_scan task\n"); + btrfs_warn(fs_info, "failed to start uuid_scan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } @@ -3711,7 +3710,7 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - pr_warn("btrfs: failed to start uuid_rescan task\n"); + btrfs_warn(fs_info, "failed to start uuid_rescan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } @@ -4033,7 +4032,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { - printk(KERN_ERR "btrfs: invalid 
chunk type 0x%llx requested\n", + btrfs_err(info, "invalid chunk type 0x%llx requested\n", type); BUG_ON(1); } @@ -4065,7 +4064,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (!device->writeable) { WARN(1, KERN_ERR - "btrfs: read-only device in alloc_list\n"); + "BTRFS: read-only device in alloc_list\n"); continue; } @@ -5193,13 +5192,13 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, read_unlock(&em_tree->lock); if (!em) { - printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", chunk_start); return -EIO; } if (em->start != chunk_start) { - printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", em->start, chunk_start); free_extent_map(em); return -EIO; @@ -6123,7 +6122,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "error %d while searching for dev_stats item for device %s!\n", ret, rcu_str_deref(device->name)); goto out; } @@ -6133,7 +6133,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "delete too small dev_stats item for device %s failed %d!\n", rcu_str_deref(device->name), ret); goto out; } @@ -6146,7 +6147,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "insert dev_stats item for device %s failed %d!\n", rcu_str_deref(device->name), ret); goto out; } @@ -6199,16 +6201,14 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), - btrfs_dev_stat_read(dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS), - btrfs_dev_stat_read(dev, - BTRFS_DEV_STAT_GENERATION_ERRS)); + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) @@ -6221,7 +6221,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + printk_in_rcu(KERN_INFO "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6242,12 +6243,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root, 
mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { - printk(KERN_WARNING - "btrfs: get dev_stats failed, device not found\n"); + btrfs_warn(root->fs_info, "get dev_stats failed, device not found"); return -ENODEV; } else if (!dev->dev_stats_valid) { - printk(KERN_WARNING - "btrfs: get dev_stats failed, not yet valid\n"); + btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid"); return -ENODEV; } else if (stats->flags & BTRFS_DEV_STATS_RESET) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 9acb846c3e7..8e57191950c 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_in = 0; if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { - printk(KERN_WARNING "btrfs: deflateInit failed\n"); + printk(KERN_WARNING "BTRFS: deflateInit failed\n"); ret = -1; goto out; } @@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws, while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { - printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); ret = -1; @@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "btrfs: inflateInit failed\n"); + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -1; } while (workspace->inf_strm.total_in < srclen) { @@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "btrfs: inflateInit failed\n"); + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -1; } -- cgit v1.2.3-70-g09d2 From 74c40f925e0d8e1ddfe5f9fc410b4c2f6c70acf5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Dec 2013 13:07:01 +0800 Subject: Btrfs: remove residual code in delayed inode async helper Before applying the patch commit de3cb945db4d8eb3b046dc7a5ea89a893372750c title: Btrfs: improve the delayed inode throttling We need requeue the async work after the current work was done, it introduced a deadlock problem. So we wrote the code that this patch removes to avoid the above problem. But after applying the above patch, the deadlock problem didn't exist. So we should remove that fix code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 58419493861..608efeb24ae 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1300,33 +1300,6 @@ again: trans->block_rsv = &root->fs_info->delayed_block_rsv; __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - /* - * Maybe new delayed items have been inserted, so we need requeue - * the work. Besides that, we must dequeue the empty delayed nodes - * to avoid the race between delayed items balance and the worker. - * The race like this: - * Task1 Worker thread - * count == 0, needn't requeue - * also needn't insert the - * delayed node into prepare - * list again. - * add lots of delayed items - * queue the delayed node - * already in the list, - * and not in the prepare - * list, it means the delayed - * node is being dealt with - * by the worker. 
- * do delayed items balance - * the delayed node is being - * dealt with by the worker - * now, just wait. - * the worker goto idle. - * Task1 will sleep until the transaction is commited. - */ - mutex_lock(&delayed_node->mutex); - btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node); - mutex_unlock(&delayed_node->mutex); trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); -- cgit v1.2.3-70-g09d2 From 4dd466d36a50d9c96cd990e2e0d472fe720a10aa Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Dec 2013 13:07:02 +0800 Subject: Btrfs: don't run delayed nodes again after all nodes flush If the number of the delayed items is greater than the upper limit, we will try to flush all the delayed items. After that, it is unnecessary to run them again because they are being dealt with by the wokers or the number of them is less than the lower limit. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 608efeb24ae..10d149c1fde 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1395,6 +1395,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) break; } finish_wait(&delayed_root->wait, &__wait); + return; } btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); -- cgit v1.2.3-70-g09d2 From 0353808cae35bc81c86e3510748a10f6bdff41b8 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Dec 2013 13:07:03 +0800 Subject: Btrfs: cleanup code of btrfs_balance_delayed_items() - move the condition check for wait into a function - use wait_event_interruptible instead of prepare-schedule-finish process Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 10d149c1fde..6b2013403f4 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1349,52 +1349,40 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } -static int refs_newer(struct btrfs_delayed_root *delayed_root, - int seq, int count) +static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) { int val = atomic_read(&delayed_root->items_seq); - if (val < seq || val >= seq + count) + if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) return 1; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return 1; + return 0; } void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; - int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return; - seq = atomic_read(&delayed_root->items_seq); - if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int seq; int ret; - DEFINE_WAIT(__wait); + + seq = atomic_read(&delayed_root->items_seq); ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - while (1) { - prepare_to_wait(&delayed_root->wait, &__wait, - TASK_INTERRUPTIBLE); - - if (refs_newer(delayed_root, seq, - BTRFS_DELAYED_BATCH) || - atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND) { - break; - } - if (!signal_pending(current)) - schedule(); - else - break; - } - finish_wait(&delayed_root->wait, &__wait); + wait_event_interruptible(delayed_root->wait, + 
could_end_wait(delayed_root, seq)); return; } -- cgit v1.2.3-70-g09d2 From a56dbd89400dd2cb9c91d734435dbfe059495da1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Dec 2013 13:07:04 +0800 Subject: Btrfs: remove btrfs_end_transaction_dmeta() Two reasons: - btrfs_end_transaction_dmeta() is the same as btrfs_end_transaction_throttle() so it is unnecessary. - All the delayed items should be dealt in the current transaction, so the workers should not commit the transaction, instead, deal with the delayed items as many as possible. So we can remove btrfs_end_transaction_dmeta() Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/transaction.c | 6 ------ fs/btrfs/transaction.h | 2 -- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6b2013403f4..826a26053c7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1302,7 +1302,7 @@ again: __btrfs_commit_inode_delayed_items(trans, path, delayed_node); trans->block_rsv = block_rsv; - btrfs_end_transaction_dmeta(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty_nodelay(root); release_path: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 46bfd820d91..e5fe801659b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -790,12 +790,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, return __btrfs_end_transaction(trans, root, 1); } -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - return __btrfs_end_transaction(trans, root, 1); -} - /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. 
This is used to make sure all of diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7657d115067..d05b6013fea 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -154,8 +154,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); -- cgit v1.2.3-70-g09d2 From 7cf35d91b4f143b5c7529976bf5e7573a07051cd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Dec 2013 13:07:05 +0800 Subject: Btrfs: use flags instead of the bool variants in delayed node Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 33 +++++++++++++++++---------------- fs/btrfs/delayed-inode.h | 6 ++++-- 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 826a26053c7..222ca8cdc53 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -55,8 +55,7 @@ static inline void btrfs_init_delayed_node( delayed_node->inode_id = inode_id; atomic_set(&delayed_node->refs, 0); delayed_node->count = 0; - delayed_node->in_list = 0; - delayed_node->inode_dirty = 0; + delayed_node->flags = 0; delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -172,7 +171,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, int mod) { spin_lock(&root->lock); - if (node->in_list) { + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { if (!list_empty(&node->p_list)) list_move_tail(&node->p_list, &root->prepare_list); else if (mod) @@ -182,7 +181,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, list_add_tail(&node->p_list, &root->prepare_list); atomic_inc(&node->refs); /* inserted into list */ root->nodes++; - node->in_list = 1; + set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } spin_unlock(&root->lock); } @@ -192,13 +191,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, struct btrfs_delayed_node *node) { spin_lock(&root->lock); - if (node->in_list) { + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; atomic_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); - node->in_list = 0; + clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } spin_unlock(&root->lock); } @@ -231,7 +230,8 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( delayed_root = node->root->fs_info->delayed_root; spin_lock(&delayed_root->lock); - if (!node->in_list) { /* not in the list */ + if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + /* not in the list */ if (list_empty(&delayed_root->node_list)) goto out; p = delayed_root->node_list.next; @@ -1004,9 +1004,10 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) { struct btrfs_delayed_root *delayed_root; - if (delayed_node && delayed_node->inode_dirty) { + if (delayed_node && + test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { BUG_ON(!delayed_node->root); - delayed_node->inode_dirty = 0; + clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; delayed_root = 
delayed_node->root->fs_info->delayed_root; @@ -1059,7 +1060,7 @@ static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, int ret; mutex_lock(&node->mutex); - if (!node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { mutex_unlock(&node->mutex); return 0; } @@ -1203,7 +1204,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) return 0; mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return 0; @@ -1227,7 +1228,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) ret = __btrfs_update_delayed_inode(trans, delayed_node->root, path, delayed_node); else @@ -1721,7 +1722,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) return -ENOENT; mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return -ENOENT; @@ -1772,7 +1773,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, return PTR_ERR(delayed_node); mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) { + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1783,7 +1784,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); - delayed_node->inode_dirty = 1; + set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); release_node: @@ -1814,7 +1815,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_item(prev_item); } - if (delayed_node->inode_dirty) { + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { btrfs_delayed_inode_release_metadata(root, delayed_node); btrfs_release_delayed_inode(delayed_node); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index a4b38f934d1..a6a13f7e014 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -48,6 +48,9 @@ struct btrfs_delayed_root { wait_queue_head_t wait; }; +#define BTRFS_DELAYED_NODE_IN_LIST 0 +#define BTRFS_DELAYED_NODE_INODE_DIRTY 1 + struct btrfs_delayed_node { u64 inode_id; u64 bytes_reserved; @@ -65,8 +68,7 @@ struct btrfs_delayed_node { struct btrfs_inode_item inode_item; atomic_t refs; u64 index_cnt; - bool in_list; - bool inode_dirty; + unsigned long flags; int count; }; -- cgit v1.2.3-70-g09d2 From 67de11769bd5ec339a62169f500b04f304826c00 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Dec 2013 13:07:06 +0800 Subject: Btrfs: introduce the delayed inode ref deletion for the single link inode The inode reference item is close to inode item, so we insert it simultaneously with the inode item insertion when we create a file/directory.. In fact, we also can handle the inode reference deletion by the same way. 
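The "close to" relationship above follows from the btree key ordering (objectid, then type, then offset). As a rough illustration only, and not kernel code, the userspace sketch below sorts a few keys of one inode with that rule; it assumes the usual type values from ctree.h (BTRFS_INODE_ITEM_KEY = 1, BTRFS_INODE_REF_KEY = 12, BTRFS_XATTR_ITEM_KEY = 24), and every other name in it is invented for the example:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Simplified btrfs key: compared by objectid, then type, then offset. */
struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int key_cmp(const void *a, const void *b)
{
	const struct key *x = a, *y = b;

	if (x->objectid != y->objectid)
		return x->objectid < y->objectid ? -1 : 1;
	if (x->type != y->type)
		return x->type < y->type ? -1 : 1;
	if (x->offset != y->offset)
		return x->offset < y->offset ? -1 : 1;
	return 0;
}

int main(void)
{
	/* Assumed type values: INODE_ITEM = 1, INODE_REF = 12, XATTR_ITEM = 24. */
	struct key keys[] = {
		{ 257, 24, 0x1234 },	/* an xattr of inode 257 */
		{ 257, 12, 256 },	/* inode ref of 257, parent dir 256 */
		{ 257, 1, 0 },		/* inode item of 257 */
	};
	int i;

	qsort(keys, 3, sizeof(keys[0]), key_cmp);
	for (i = 0; i < 3; i++)
		printf("(%llu %u %llu)\n",
		       (unsigned long long)keys[i].objectid,
		       (unsigned)keys[i].type,
		       (unsigned long long)keys[i].offset);
	/*
	 * Prints (257 1 0) first and (257 12 256) immediately after it: the
	 * INODE_REF key sorts right behind the INODE_ITEM key, so for a
	 * single-link inode both items usually share a leaf and can be
	 * handled by one btree search.
	 */
	return 0;
}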
So we made this patch to introduce the delayed inode reference deletion for the single link inode(At most case, the file doesn't has hard link, so we don't take the hard link into account). This function is based on the delayed inode mechanism. After applying this patch, we can reduce the time of the file/directory deletion by ~10%. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 3 ++ fs/btrfs/delayed-inode.c | 103 +++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/delayed-inode.h | 2 + fs/btrfs/inode.c | 55 ++++++++++++++++++++++++- 4 files changed, 157 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ac0b39db27d..661b0ac90e8 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -135,6 +135,9 @@ struct btrfs_inode { */ u64 index_cnt; + /* Cache the directory index number to speed the dir/file remove */ + u64 dir_index; + /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 222ca8cdc53..451b00c86f6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1015,6 +1015,18 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) } } +static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + ASSERT(delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); +} + static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1023,13 +1035,19 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + int mod; int ret; key.objectid = node->inode_id; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - ret = btrfs_lookup_inode(trans, root, path, &key, 1); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + mod = -1; + else + mod = 1; + + ret = btrfs_lookup_inode(trans, root, path, &key, mod); if (ret > 0) { btrfs_release_path(path); return -ENOENT; @@ -1037,19 +1055,58 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, return ret; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + goto no_iref; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) + goto search; +again: + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != node->inode_id) + goto out; + + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + goto out; + + /* + * Delayed iref deletion is for the inode who has only one link, + * so there is only one iref. The case that several irefs are + * in the same item doesn't exist. 
+ */ + btrfs_del_item(trans, root, path); +out: + btrfs_release_delayed_iref(node); +no_iref: + btrfs_release_path(path); +err_out: btrfs_delayed_inode_release_metadata(root, node); btrfs_release_delayed_inode(node); - return 0; + return ret; + +search: + btrfs_release_path(path); + + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = -1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret); + + ret = 0; + leaf = path->nodes[0]; + path->slots[0]--; + goto again; } static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -1793,6 +1850,41 @@ release_node: return ret; } +int btrfs_delayed_delete_inode_ref(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + /* + * We don't reserve space for inode ref deletion is because: + * - We ONLY do async inode ref deletion for the inode who has only + * one link(i_nlink == 1), it means there is only one inode ref. + * And in most case, the inode ref and the inode item are in the + * same leaf, and we will deal with them at the same time. + * Since we are sure we will reserve the space for the inode item, + * it is unnecessary to reserve space for inode ref deletion. + * - If the inode ref and the inode item are not in the same leaf, + * We also needn't worry about enospc problem, because we reserve + * much more space for the inode update than it needs. + * - At the worst, we can steal some space from the global reservation. + * It is very rare. + */ + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + goto release_node; + + set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) { struct btrfs_root *root = delayed_node->root; @@ -1815,6 +1907,9 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_item(prev_item); } + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + btrfs_release_delayed_iref(delayed_node); + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { btrfs_delayed_inode_release_metadata(root, delayed_node); btrfs_release_delayed_inode(delayed_node); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index a6a13f7e014..f70119f2542 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -50,6 +50,7 @@ struct btrfs_delayed_root { #define BTRFS_DELAYED_NODE_IN_LIST 0 #define BTRFS_DELAYED_NODE_INODE_DIRTY 1 +#define BTRFS_DELAYED_NODE_DEL_IREF 2 struct btrfs_delayed_node { u64 inode_id; @@ -127,6 +128,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_delayed_delete_inode_ref(struct inode *inode); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06bcf5b53cb..9eaa1c8ed38 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3309,6 +3309,7 @@ static void btrfs_read_locked_inode(struct inode 
*inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + unsigned long ptr; int maybe_acls; u32 rdev; int ret; @@ -3332,7 +3333,7 @@ static void btrfs_read_locked_inode(struct inode *inode) leaf = path->nodes[0]; if (filled) - goto cache_acl; + goto cache_index; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3375,6 +3376,30 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: + path->slots[0]++; + if (inode->i_nlink != 1 || + path->slots[0] >= btrfs_header_nritems(leaf)) + goto cache_acl; + + btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); + if (location.objectid != btrfs_ino(inode)) + goto cache_acl; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + if (location.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + } else if (location.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, + extref); + } cache_acl: /* * try to precache a NULL acl entry for files that don't have @@ -3587,6 +3612,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; btrfs_release_path(path); + /* + * If we don't have dir index, we have to get it by looking up + * the inode ref, since we get the inode ref, remove it directly, + * it is unnecessary to do delayed deletion. + * + * But if we have dir index, needn't search inode ref to get it. + * Since the inode ref is close to the inode item, it is better + * that we delay to delete it, and just do this deletion when + * we update the inode item. + */ + if (BTRFS_I(inode)->dir_index) { + ret = btrfs_delayed_delete_inode_ref(inode); + if (!ret) { + index = BTRFS_I(inode)->dir_index; + goto skip_backref; + } + } + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, dir_ino, &index); if (ret) { @@ -3596,7 +3639,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto err; } - +skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -5465,6 +5508,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * number */ BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->dir_index = *index; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -5809,6 +5853,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } + /* There are several dir indexes for this inode, clear the cache. */ + BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; @@ -7861,6 +7907,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; + ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; @@ -8148,6 +8195,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. 
*/ root->fs_info->last_trans_log_full_commit = trans->transid; @@ -8236,6 +8284,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_fail; } + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = index; + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { struct dentry *parent = new_dentry->d_parent; btrfs_log_new_name(trans, old_inode, old_dir, parent); -- cgit v1.2.3-70-g09d2 From c9ea7b24ce5863d65efb1134319cede160674d41 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Sep 2013 10:02:11 -0400 Subject: Btrfs: stop caching thread if extent_commit_sem is contended We can starve out the transaction commit with a bunch of caching threads all running at the same time. This is because we will only drop the extent_commit_sem if we need_resched(), which isn't likely to happen since we will be reading a lot from the disk so have already schedule()'ed plenty. Alex observed that he could starve out a transaction commit for up to a minute with 32 caching threads all running at once. This will allow us to drop the extent_commit_sem to allow the transaction commit to swap the commit_root out and then all the cachers will start back up. Here is an explanation provided by Igno So, just to fill in what happens in this loop: mutex_unlock(&caching_ctl->mutex); cond_resched(); goto again; where 'again:' takes caching_ctl->mutex and fs_info->extent_commit_sem again: again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ down_read(&fs_info->extent_commit_sem); So, if I'm reading the code correct, there can be a fair amount of concurrency here: there may be multiple 'caching kthreads' per filesystem active, while there's one fs_info->extent_commit_sem per filesystem AFAICS. So, what happens if there are a lot of CPUs all busy holding the ->extent_commit_sem rwsem read-locked and a writer arrives? They'd all rush to try to release the fs_info->extent_commit_sem, and they'd block in the down_read() because there's a writer waiting. So there's a guarantee of forward progress. This should answer akpm's concern I think. Thanks, Acked-by: Ingo Molnar Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1c82bead2c0..3d19dcc553a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -442,7 +442,8 @@ next: if (ret) break; - if (need_resched()) { + if (need_resched() || + rwsem_is_contended(&fs_info->extent_commit_sem)) { caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->extent_commit_sem); -- cgit v1.2.3-70-g09d2 From eb8052e015f2c015926db45943f8ee724ace97e5 Mon Sep 17 00:00:00 2001 From: Wenliang Fan Date: Fri, 20 Dec 2013 15:28:56 +0800 Subject: fs/btrfs: Integer overflow in btrfs_ioctl_resize() The local variable 'new_size' comes from userspace. 
If a large number was passed, there would be an integer overflow in the following line: new_size = old_size + new_size; Signed-off-by: Wenliang Fan Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index edf5f0093f2..ed3edc28325 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1474,6 +1474,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, } new_size = old_size - new_size; } else if (mod > 0) { + if (new_size > ULLONG_MAX - old_size) { + ret = -EINVAL; + goto out_free; + } new_size = old_size + new_size; } -- cgit v1.2.3-70-g09d2 From eb653de15987612444b6cde3b0e67b1edd94625f Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Mon, 23 Dec 2013 11:53:02 +0000 Subject: Btrfs: reduce btree node locking duration on item update If we do a btree search with the goal of updating an existing item without changing its size (ins_len == 0 and cow == 1), then we never need to hold locks on upper level nodes (even when slot == 0) after we COW their child nodes/leaves, as we won't have node splits or merges in this scenario (that is, no key additions, removals or shifts on any nodes or leaves). Therefore release the locks immediately after COWing the child nodes/leaves while navigating the btree, even if their parent slot is 0, instead of returning a path to the caller with those nodes locked, which would get released only when the caller releases or frees the path (or if it calls btrfs_unlock_up_safe). This is a common scenario, for example when updating inode items in fs trees and block group items in the extent tree. The following benchmarks were performed on a quad core machine with 32Gb of ram, using a leaf/node size of 4Kb (to generate deeper fs trees more quickly). sysbench --test=fileio --file-num=131072 --file-total-size=8G \ --file-test-mode=seqwr --num-threads=512 --file-block-size=8192 \ --max-requests=100000 --file-io-mode=sync [prepare|run] Before this change: 49.85Mb/s (average of 5 runs) After this change: 50.38Mb/s (average of 5 runs) Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 23 ++++++++++++++--------- fs/btrfs/inode.c | 1 - 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 062438d3898..9e9de68eb81 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2731,6 +2731,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); if (ins_len < 0) { lowest_unlock = 2; @@ -2839,8 +2840,6 @@ again: } } cow_done: - BUG_ON(!cow && ins_len); - p->nodes[level] = b; btrfs_clear_path_blocking(p, NULL, 0); @@ -2850,13 +2849,19 @@ cow_done: * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * - * If cow is true, then we might be changing slot zero, - * which may require changing the parent. So, we can't - * drop the lock until after we know which slot we're - * operating on. + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. 
*/ - if (!cow) - btrfs_unlock_up_safe(p, level + 1); + if (!ins_len && !p->keep_locks) { + int u = level + 1; + + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; + } + } ret = key_search(b, key, level, &prev_cmp, &slot); @@ -2884,7 +2889,7 @@ cow_done: * which means we must have a write lock * on the parent */ - if (slot == 0 && cow && + if (slot == 0 && ins_len && write_lock_level < level + 1) { write_lock_level = level + 1; btrfs_release_path(p); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9eaa1c8ed38..8e45fdcdbd8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3515,7 +3515,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, goto failed; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); -- cgit v1.2.3-70-g09d2
From dc4103f933291cb1a2e6742c4db432e6ed337bae Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 26 Dec 2013 13:10:49 +0800 Subject: Btrfs: remove unused argument from select_reloc_root() @nr is no longer used; remove it from select_reloc_root(). Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8cf99c496ff..277b8e371fa 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2454,7 +2454,7 @@ static noinline_for_stack struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct backref_node *node, - struct backref_edge *edges[], int *nr) + struct backref_edge *edges[]) { struct backref_node *next; struct btrfs_root *root; @@ -2496,7 +2496,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!root) return NULL; - *nr = index; next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { @@ -2643,7 +2642,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, u32 blocksize; u64 bytenr; u64 generation; - int nr; int slot; int ret; int err = 0; @@ -2656,7 +2654,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, cond_resched(); upper = edge->node[UPPER]; - root = select_reloc_root(trans, rc, upper, edges, &nr); + root = select_reloc_root(trans, rc, upper, edges); BUG_ON(!root); if (upper->eb && !upper->locked) { -- cgit v1.2.3-70-g09d2
From 25e293c2a2916b58cdafb8219c0e93d6277762d7 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 26 Dec 2013 13:10:50 +0800 Subject: Btrfs: fix an oops when we fail to merge reloc roots Previously, we would free the reloc root memory and then force the filesystem to become read-only. The problem is that another thread committing a transaction may still try to access the freed reloc root while reloc roots are being merged. To keep the shared space of snapshots consistent, we should let the snapshot finish if possible, so here we don't free the reloc root memory.
signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 277b8e371fa..9189f9e3c35 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2311,9 +2311,6 @@ void free_reloc_roots(struct list_head *list) reloc_root = list_entry(list->next, struct btrfs_root, root_list); __del_reloc_root(reloc_root); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); } } @@ -2355,10 +2352,9 @@ again: ret = merge_reloc_root(rc, root); if (ret) { - __del_reloc_root(reloc_root); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); goto out; } } else { -- cgit v1.2.3-70-g09d2 From e77751aad1facc4973613a11e2ad98ee4bbb04e1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 27 Dec 2013 21:11:50 +0800 Subject: Btrfs: fix the wrong nocow range check The following warning message was outputed when running the 274th case of xfstests with nodatacow option: BUG: Bad page state in process kswapd0 pfn:1c66f page:ffffea0000636848 count:0 mapcount:0 mapping:(null) index:0x78000 page flags: 0x1000000000100a(error|uptodate|private_2) It is because the check of nocow range was wrong, we should compare the start and end position of the extent with the write position to verify if the write position was in the extent, but the current code just used the start postion to do the check, so we got the wrong extent and told the caller that it was a nocow write. And then when we write back the dirty pages, we found we should cow the extent, but at that time, there was no space in the fs, we had to the error flag for the page. When someone reclaimed that page, the above warning outputed. Fix it. Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e45fdcdbd8..ffb23e50676 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6503,6 +6503,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int slot; int found_type; bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6546,6 +6547,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) goto out; + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= offset) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); if (disk_bytenr == 0) goto out; @@ -6563,8 +6568,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (btrfs_extent_readonly(root, disk_bytenr)) goto out; btrfs_release_path(path); -- cgit v1.2.3-70-g09d2 From 1708cc5723cb775703b42a0ce8e521019c42dd67 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 28 Dec 2013 19:52:39 +0800 Subject: Btrfs: fix an oops when we fail to relocate tree blocks During balance test, we hit an oops: [ 2013.841551] kernel BUG at fs/btrfs/relocation.c:1174! 
The problem is that if we fail to relocate tree blocks, we should force an update of the backref cache; otherwise some pending nodes are not updated, since the snapshot check sees @cache->last_trans as being within the current transaction and therefore won't update them, and then the oops happens. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9189f9e3c35..07b3b36f40e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4009,6 +4009,12 @@ restart: if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { + /* + * if we fail to relocate tree blocks, force to update + * backref cache when committing transaction. + */ + rc->backref_cache.last_trans = trans->transid - 1; + if (ret != -EAGAIN) { err = ret; break; -- cgit v1.2.3-70-g09d2
From 17504584f52af944960f1ad16752aa10a0755b3b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 29 Dec 2013 21:44:50 +0800 Subject: Btrfs: return free space to global_rsv as much as possible @full is not protected within global_rsv.lock, so we may think global_rsv is already full when in fact it's not, and so we miss the opportunity to return free space to global_rsv directly when we release other block_rsvs. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d19dcc553a..41fe80b9db4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4674,7 +4674,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - if (global_rsv->full || global_rsv == block_rsv || + if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, -- cgit v1.2.3-70-g09d2
From e8117c26b24098496b6011aabe84e43e0189a506 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Fri, 3 Jan 2014 18:22:57 +0800 Subject: Btrfs: only fua the first superblock when writing supers According to the comments, we only intend to fua the first superblock of every device; fix the code to match. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0400a26fc8c..9850a51a3f1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3133,7 +3133,10 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = btrfsic_submit_bh(WRITE_FUA, bh); + if (i == 0) + ret = btrfsic_submit_bh(WRITE_FUA, bh); + else + ret = btrfsic_submit_bh(WRITE_SYNC, bh); if (ret) errors++; } -- cgit v1.2.3-70-g09d2
From 842bef5891aaf13e2dede01d86397d810fde2dd8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:25 +0800 Subject: btrfs: Add "barrier" option to support "-o remount,barrier" Btrfs can be remounted without barriers, but there is no "barrier" option, so nobody can remount btrfs with barriers turned back on; only unmounting and mounting again can re-enable them, which is quite awkward. The mount options in the documentation are also changed slightly to prepare for the further pairing-option changes.
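As a usage illustration only (the mount point /mnt is hypothetical and not taken from the patch), barriers can now be toggled at run time instead of requiring a full umount/mount cycle:
 # mount -o remount,nobarrier /mnt
 # mount -o remount,barrier /mnt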
Reported-by: Daniel Blueman Signed-off-by: Qu Wenruo Signed-off-by: Mike Fleetwood Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 13 +++++++------ fs/btrfs/super.c | 8 +++++++- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index 5dd282dda55..ce487a2a88f 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -38,7 +38,7 @@ Mount Options ============= When mounting a btrfs filesystem, the following option are accepted. -Unless otherwise specified, all options default to off. +Options with (*) are default options and will not show in the mount options. alloc_start= Debugging option to force all block allocations above a certain @@ -138,12 +138,13 @@ Unless otherwise specified, all options default to off. Disable support for Posix Access Control Lists (ACLs). See the acl(5) manual page for more information about ACLs. + barrier(*) nobarrier - Disables the use of block layer write barriers. Write barriers ensure - that certain IOs make it through the device cache and are on persistent - storage. If used on a device with a volatile (non-battery-backed) - write-back cache, this option will lead to filesystem corruption on a - system crash or power loss. + Enable/disable the use of block layer write barriers. Write barriers + ensure that certain IOs make it through the device cache and are on + persistent storage. If disabled on a device with a volatile + (non-battery-backed) write-back cache, nobarrier option will lead to + filesystem corruption on a system crash or power loss. nodatacow Disable data copy-on-write for newly created files. Implies nodatasum, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15b6a1d4c53..b02d25a64b2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -323,7 +323,7 @@ enum { Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, - Opt_commit_interval, + Opt_commit_interval, Opt_barrier, Opt_err, }; @@ -335,6 +335,7 @@ static match_table_t tokens = { {Opt_nodatasum, "nodatasum"}, {Opt_nodatacow, "nodatacow"}, {Opt_nobarrier, "nobarrier"}, + {Opt_barrier, "barrier"}, {Opt_max_inline, "max_inline=%s"}, {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, @@ -494,6 +495,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_opt(info->mount_opt, SSD); btrfs_clear_opt(info->mount_opt, SSD_SPREAD); break; + case Opt_barrier: + if (btrfs_test_opt(root, NOBARRIER)) + btrfs_info(root->fs_info, "turning on barriers"); + btrfs_clear_opt(info->mount_opt, NOBARRIER); + break; case Opt_nobarrier: btrfs_info(root->fs_info, "turning off barriers"); btrfs_set_opt(info->mount_opt, NOBARRIER); -- cgit v1.2.3-70-g09d2 From fc0ca9af180b91aad2fbf2fe3b16a12e1e05a760 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:26 +0800 Subject: btrfs: Add noautodefrag mount option. Btrfs has autodefrag mount option but no pairing noautodefrag option, which makes it impossible to disable autodefrag without umount. 
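For illustration only (hypothetical mount point), the pairing option allows switching the feature off again at run time:
 # mount -o remount,autodefrag /mnt
 # mount -o remount,noautodefrag /mnt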
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 8 +++++--- fs/btrfs/super.c | 8 +++++++- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index ce487a2a88f..e87609a4e21 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -46,10 +46,12 @@ Options with (*) are default options and will not show in the mount options. bytes, optionally with a K, M, or G suffix, case insensitive. Default is 1MB. + noautodefrag(*) autodefrag - Detect small random writes into files and queue them up for the - defrag process. Works best for small files; Not well suited for - large database workloads. + Disable/enable auto defragmentation. + Auto defragmentation detects small random writes into files and queue + them up for the defrag process. Works best for small files; + Not well suited for large database workloads. check_int check_int_data diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b02d25a64b2..44513f3fbdc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -323,7 +323,7 @@ enum { Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, - Opt_commit_interval, Opt_barrier, + Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_err, }; @@ -357,6 +357,7 @@ static match_table_t tokens = { {Opt_enospc_debug, "enospc_debug"}, {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, @@ -602,6 +603,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_info(root->fs_info, "enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; + case Opt_nodefrag: + if (btrfs_test_opt(root, AUTO_DEFRAG)) + btrfs_info(root->fs_info, "disabling auto defrag"); + btrfs_clear_opt(info->mount_opt, AUTO_DEFRAG); + break; case Opt_recovery: btrfs_info(root->fs_info, "enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); -- cgit v1.2.3-70-g09d2 From e07a2ade4426a2cbafae4018aa7b6944bb627a6e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:27 +0800 Subject: btrfs: Add nodiscard mount option. Add nodiscard mount option to disable discard with remount option. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 7 +++++-- fs/btrfs/super.c | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index e87609a4e21..7254cf50139 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -98,9 +98,12 @@ Options with (*) are default options and will not show in the mount options. can be avoided. Especially useful when trying to mount a multi-device setup as root. May be specified multiple times for multiple devices. + nodiscard(*) discard - Issue frequent commands to let the block device reclaim space freed by - the filesystem. This is useful for SSD devices, thinly provisioned + Disable/enable discard mount option. + Discard issues frequent commands to let the block device reclaim space + freed by the filesystem. 
+ This is useful for SSD devices, thinly provisioned LUNs and virtual machine images, but may have a significant performance impact. (The fstrim command is also available to initiate batch trims from userspace). diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 44513f3fbdc..e15377035dd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -323,7 +323,7 @@ enum { Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, - Opt_commit_interval, Opt_barrier, Opt_nodefrag, + Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_err, }; @@ -351,6 +351,7 @@ static match_table_t tokens = { {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, @@ -575,6 +576,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_discard: btrfs_set_opt(info->mount_opt, DISCARD); break; + case Opt_nodiscard: + btrfs_clear_opt(info->mount_opt, DISCARD); + break; case Opt_space_cache: btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; -- cgit v1.2.3-70-g09d2 From 530362934332e4efac81d40583aa1225e64f556f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:28 +0800 Subject: btrfs: Add noenospc_debug mount option. Add noenospc_debug mount option to disable ENOSPC debug with remount option. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 3 ++- fs/btrfs/super.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index 7254cf50139..13a7cac8043 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -108,8 +108,9 @@ Options with (*) are default options and will not show in the mount options. performance impact. (The fstrim command is also available to initiate batch trims from userspace). + noenospc_debug(*) enospc_debug - Debugging option to be more verbose in some ENOSPC conditions. + Disable/enable debugging option to be more verbose in some ENOSPC conditions. 
fatal_errors= Action to take when encountering a fatal error: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e15377035dd..832540645c2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -324,6 +324,7 @@ enum { Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, + Opt_noenospc_debug, Opt_err, }; @@ -356,6 +357,7 @@ static match_table_t tokens = { {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_nodefrag, "noautodefrag"}, @@ -603,6 +605,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_enospc_debug: btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); break; + case Opt_noenospc_debug: + btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_defrag: btrfs_info(root->fs_info, "enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); -- cgit v1.2.3-70-g09d2 From 2c9ee85671f66cd3ffc7067de47cc59ed6677299 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:29 +0800 Subject: btrfs: Add noflushoncommit mount option. Add noflushoncommit mount option to disable flush on commit with remount option. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 1 + fs/btrfs/super.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index 13a7cac8043..303b49c6c29 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -117,6 +117,7 @@ Options with (*) are default options and will not show in the mount options. "bug" - BUG() on a fatal error. This is the default. "panic" - panic() on a fatal error. + noflushoncommit(*) flushoncommit The 'flushoncommit' mount option forces any data dirtied by a write in a prior transaction to commit as part of the current commit. 
This makes diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 832540645c2..98a68234093 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -324,7 +324,7 @@ enum { Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, - Opt_noenospc_debug, + Opt_noenospc_debug, Opt_noflushoncommit, Opt_err, }; @@ -350,6 +350,7 @@ static match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_noflushoncommit, "noflushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, @@ -562,6 +563,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_info(root->fs_info, "turning on flush-on-commit"); btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; + case Opt_noflushoncommit: + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + btrfs_info(root->fs_info, "turning off flush-on-commit"); + btrfs_clear_opt(info->mount_opt, FLUSHONCOMMIT); + break; case Opt_ratio: ret = match_int(&args[0], &intarg); if (ret) { -- cgit v1.2.3-70-g09d2 From bd0330ad2174d1a5f762eee2ee58f0148f10d575 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:30 +0800 Subject: btrfs: Add acl mount option. Add acl mount option to enable acl with remount option. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 3 ++- fs/btrfs/super.c | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index 303b49c6c29..79c08f368ac 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -141,8 +141,9 @@ Options with (*) are default options and will not show in the mount options. Specify that 1 metadata chunk should be allocated after every data chunks. Off by default. + acl(*) noacl - Disable support for Posix Access Control Lists (ACLs). See the + Enable/disable support for Posix Access Control Lists (ACLs). See the acl(5) manual page for more information about ACLs. barrier(*) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 98a68234093..76eecd13db1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -324,7 +324,7 @@ enum { Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, - Opt_noenospc_debug, Opt_noflushoncommit, + Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_err, }; @@ -347,6 +347,7 @@ static match_table_t tokens = { {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, + {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, @@ -552,6 +553,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } break; + case Opt_acl: + root->fs_info->sb->s_flags |= MS_POSIXACL; + break; case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; -- cgit v1.2.3-70-g09d2 From a258af7a3e395a1d36190c81614dca0bcb5f6012 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:31 +0800 Subject: btrfs: Add datacow mount option. Add datacow mount option to enable copy-on-write with remount option. 
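As an illustrative example (hypothetical mount point), a filesystem mounted with nodatacow can now be switched back to copy-on-write without unmounting:
 # mount -o remount,datacow /mnt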
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 5 +++-- fs/btrfs/super.c | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index 79c08f368ac..bbd1f0f57ce 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -154,9 +154,10 @@ Options with (*) are default options and will not show in the mount options. (non-battery-backed) write-back cache, nobarrier option will lead to filesystem corruption on a system crash or power loss. + datacow(*) nodatacow - Disable data copy-on-write for newly created files. Implies nodatasum, - and disables all compression. + Enable/disable data copy-on-write for newly created files. + Nodatacow implies nodatasum, and disables all compression. nodatasum Disable data checksumming for newly created files. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 76eecd13db1..6567865a250 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -324,7 +324,7 @@ enum { Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, - Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, + Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, Opt_err, }; @@ -335,6 +335,7 @@ static match_table_t tokens = { {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, {Opt_nodatacow, "nodatacow"}, + {Opt_datacow, "datacow"}, {Opt_nobarrier, "nobarrier"}, {Opt_barrier, "barrier"}, {Opt_max_inline, "max_inline=%s"}, @@ -446,6 +447,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; + case Opt_datacow: + if (btrfs_test_opt(root, NODATACOW)) + btrfs_info(root->fs_info, "setting datacow"); + btrfs_clear_opt(info->mount_opt, NODATACOW); + break; case Opt_compress_force: case Opt_compress_force_type: compress_force = true; -- cgit v1.2.3-70-g09d2 From d399167d88ea53590d6c0850b2d5534cbd21da02 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:32 +0800 Subject: btrfs: Add datasum mount option. Add datasum mount option to enable checksum with remount option. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 4 +++- fs/btrfs/super.c | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index bbd1f0f57ce..e05c6aecc0c 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -159,8 +159,10 @@ Options with (*) are default options and will not show in the mount options. Enable/disable data copy-on-write for newly created files. Nodatacow implies nodatasum, and disables all compression. + datasum(*) nodatasum - Disable data checksumming for newly created files. + Enable/disable data checksumming for newly created files. + Datasum implies datacow. notreelog Disable the tree logging used for fsync and O_SYNC writes. 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6567865a250..e84e6cb5752 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -325,6 +325,7 @@ enum { Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, + Opt_datasum, Opt_err, }; @@ -334,6 +335,7 @@ static match_table_t tokens = { {Opt_subvolid, "subvolid=%s"}, {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, + {Opt_datasum, "datasum"}, {Opt_nodatacow, "nodatacow"}, {Opt_datacow, "datacow"}, {Opt_nobarrier, "nobarrier"}, @@ -434,6 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_info(root->fs_info, "setting nodatasum"); btrfs_set_opt(info->mount_opt, NODATASUM); break; + case Opt_datasum: + if (btrfs_test_opt(root, NODATACOW)) + btrfs_info(root->fs_info, "setting datasum, datacow enabled"); + else + btrfs_info(root->fs_info, "setting datasum"); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + break; case Opt_nodatacow: if (!btrfs_test_opt(root, COMPRESS) || !btrfs_test_opt(root, FORCE_COMPRESS)) { -- cgit v1.2.3-70-g09d2 From a88998f291fc707f18ee42ae45220a3a3e384c27 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 6 Jan 2014 09:58:33 +0800 Subject: btrfs: Add treelog mount option. Add treelog mount option to enable tree log with remount option. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- Documentation/filesystems/btrfs.txt | 3 ++- fs/btrfs/super.c | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index e05c6aecc0c..d11cc2f8077 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -164,8 +164,9 @@ Options with (*) are default options and will not show in the mount options. Enable/disable data checksumming for newly created files. Datasum implies datacow. + treelog(*) notreelog - Disable the tree logging used for fsync and O_SYNC writes. + Enable/disable the tree logging used for fsync and O_SYNC writes. recovery Enable autorecovery attempts if a bad tree root is found at mount time. 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e84e6cb5752..16d7fc751ba 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -325,7 +325,7 @@ enum { Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, - Opt_datasum, + Opt_datasum, Opt_treelog, Opt_err, }; @@ -353,6 +353,7 @@ static match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, + {Opt_treelog, "treelog"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_noflushoncommit, "noflushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, @@ -579,6 +580,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_info(root->fs_info, "disabling tree log"); btrfs_set_opt(info->mount_opt, NOTREELOG); break; + case Opt_treelog: + if (btrfs_test_opt(root, NOTREELOG)) + btrfs_info(root->fs_info, "enabling tree log"); + btrfs_clear_opt(info->mount_opt, NOTREELOG); + break; case Opt_flushoncommit: btrfs_info(root->fs_info, "turning on flush-on-commit"); btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; -- cgit v1.2.3-70-g09d2
From 896c14f97f700aec6565154f2451605d7c5ce3ed Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 7 Jan 2014 17:25:18 +0800 Subject: Btrfs: fix wrong send_in_progress accounting Steps to reproduce: # mkfs.btrfs -f /dev/sda8 # mount /dev/sda8 /mnt # btrfs sub snapshot -r /mnt /mnt/snap1 # btrfs sub snapshot -r /mnt /mnt/snap2 # btrfs send /mnt/snap1 -p /mnt/snap2 -f /mnt/1 # dmesg The problem is that we sort the clone roots (which include @send_root); the sort may move @send_root to an earlier slot, and thus @send_root's @send_in_progress gets decreased twice. Cc: David Sterba Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bff0b1ac3be..5b697851646 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4752,6 +4752,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; + int sort_clone_roots = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4942,6 +4943,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sort(sctx->clone_roots, sctx->clone_roots_cnt, sizeof(*sctx->clone_roots), __clone_root_cmp_sort, NULL); + sort_clone_roots = 1; ret = send_subvol(sctx); if (ret < 0) @@ -4957,11 +4959,19 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } out: - for (i = 0; sctx && i < clone_sources_to_rollback; i++) - btrfs_root_dec_send_in_progress(sctx->clone_roots[i].root); + if (sort_clone_roots) { + for (i = 0; i < sctx->clone_roots_cnt; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + } else { + for (i = 0; sctx && i < clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + + btrfs_root_dec_send_in_progress(send_root); + } if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) btrfs_root_dec_send_in_progress(sctx->parent_root); - btrfs_root_dec_send_in_progress(send_root); kfree(arg); vfree(clone_sources_tmp); -- cgit v1.2.3-70-g09d2
From 18f687d538449373c37cbe52b03f5f3d42b7c7ed Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 7 Jan 2014 17:25:19 +0800 Subject: Btrfs: fix protection between send and root deletion We should guarantee that parent and clone roots cannot be destroyed during
send. For this we have two ideas: 1. hold @subvol_sem - this might be a nightmare, because it would block all subvolume deletions for a long time; 2. as Miao pointed out, reuse @send_in_progress, which means we skip snapshot deletion if a send from that root is in progress. Here we adopt the second approach, since it won't block other subvolume deletions for a long time. Besides, in btrfs_clean_one_deleted_snapshot() we only check the first root; if this root is involved in a send, we return directly rather than continue checking. There are several reasons for this: 1. this case happens seldom; 2. after sending, the cleaner thread can continue to drop that root; 3. it keeps the code simple. Cc: David Sterba Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 16 ++++++++++++++++ fs/btrfs/transaction.c | 13 +++++++++++++ 2 files changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5b697851646..4e2461b857f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4753,6 +4753,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; int sort_clone_roots = 0; + int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4893,8 +4894,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = clone_sources_tmp[i]; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + clone_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(clone_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } @@ -4903,10 +4908,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) clone_root->send_in_progress++; if (!btrfs_root_readonly(clone_root)) { spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + sctx->clone_roots[i].root = clone_root; } vfree(clone_sources_tmp); @@ -4917,19 +4925,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = arg->parent_root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(sctx->parent_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(sctx->parent_root); goto out; } + spin_lock(&sctx->parent_root->root_item_lock); sctx->parent_root->send_in_progress++; if (!btrfs_root_readonly(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } spin_unlock(&sctx->parent_root->root_item_lock); + + srcu_read_unlock(&fs_info->subvol_srcu, index); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e5fe801659b..da2ac4c6d78 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1972,6 +1972,19 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); + /* + * Make sure root is not involved in send, + * if we fail with first root, we return + * directly rather than continue.
+ */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress) { + spin_unlock(&fs_info->trans_lock); + spin_unlock(&root->root_item_lock); + return 0; + } + spin_unlock(&root->root_item_lock); + list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); -- cgit v1.2.3-70-g09d2 From 8e56338d7d0ee38ecae86d35dae43020356acca1 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 7 Jan 2014 17:26:57 +0800 Subject: Btrfs: remove unnecessary transaction commit before send We will finish orphan cleanups during snapshot, so we don't have to commit transaction here. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e2461b857f..591063dac0e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4775,35 +4775,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) */ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); - /* - * If we just created this root we need to make sure that the orphan - * cleanup has been done and committed since we search the commit root, - * so check its commit root transid with our otransid and if they match - * commit the transaction to make sure everything is updated. - */ - down_read(&send_root->fs_info->extent_commit_sem); - if (btrfs_header_generation(send_root->commit_root) == - btrfs_root_otransid(&send_root->root_item)) { - struct btrfs_trans_handle *trans; - - up_read(&send_root->fs_info->extent_commit_sem); - - trans = btrfs_attach_transaction_barrier(send_root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto out; - } - /* ENOENT means theres no transaction */ - } else { - ret = btrfs_commit_transaction(trans, send_root); - if (ret) - goto out; - } - } else { - up_read(&send_root->fs_info->extent_commit_sem); - } - /* * Userspace tools do the checks and warn the user if it's * not RO. -- cgit v1.2.3-70-g09d2 From 90515e7f5d7d24cbb2a4038a3f1b5cfa2921aa17 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 7 Jan 2014 17:26:58 +0800 Subject: Btrfs: handle EAGAIN case properly in btrfs_drop_snapshot() We may return early in btrfs_drop_snapshot(), we shouldn't call btrfs_std_err() for this case, fix it. Cc: stable@vger.kernel.org Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 41fe80b9db4..77acc087389 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7835,7 +7835,7 @@ out: */ if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); - if (err) + if (err && err != -EAGAIN) btrfs_std_error(root->fs_info, err); return err; } -- cgit v1.2.3-70-g09d2 From 1acae57b161ef1282f565ef907f72aeed0eb71d9 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 7 Jan 2014 11:42:27 +0000 Subject: Btrfs: faster file extent item replace operations When writing to a file we drop existing file extent items that cover the write range and then add a new file extent item that represents that write range. Before this change we were doing a tree lookup to remove the file extent items, and then after we did another tree lookup to insert the new file extent item. 
Most of the time all the file extent items we need to drop are located within a single leaf - this is the leaf where our new file extent item ends up. Therefore, in this common case just combine these 2 operations into a single one. By avoiding the second btree navigation for insertion of the new file extent item, we reduce btree node/leaf lock acquisitions/releases, btree block/leaf COW operations, CPU time on btree node/leaf key binary searches, etc. Besides file writes, this is an operation that happens for file fsync's as well. However, log btrees are much less likely to be as big as regular fs btrees, therefore the impact of this change is smaller there. The following benchmark was performed against an SSD drive and a HDD drive, both for random and sequential writes: sysbench --test=fileio --file-num=4096 --file-total-size=8G \ --file-test-mode=[rndwr|seqwr] --num-threads=512 \ --file-block-size=8192 \ --max-requests=1000000 \ --file-fsync-freq=0 --file-io-mode=sync [prepare|run] All results below are averages of 10 runs of the respective test. ** SSD sequential writes Before this change: 225.88 Mb/sec After this change: 277.26 Mb/sec ** SSD random writes Before this change: 49.91 Mb/sec After this change: 56.39 Mb/sec ** HDD sequential writes Before this change: 68.53 Mb/sec After this change: 69.87 Mb/sec ** HDD random writes Before this change: 13.04 Mb/sec After this change: 14.39 Mb/sec Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 5 ++- fs/btrfs/file.c | 44 ++++++++++++++++++++++++--- fs/btrfs/inode.c | 87 +++++++++++++++++++++++++++++++++-------------------- fs/btrfs/tree-log.c | 24 +++++++++------ 4 files changed, 114 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5be778e62a2..f52a60b9eba 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3772,7 +3772,10 @@ extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache); + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted); int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 35bf83876f3..030012e1710 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -692,7 +692,10 @@ next: int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache) + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -712,6 +715,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int modify_tree = -1; int update_refs = (root->ref_cows || root == root->fs_info->tree_root); int found = 0; + int leafs_visited = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -733,6 +737,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; + leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -744,6 +749,7 @@ next_slot: ret = 0; break; } + leafs_visited++; leaf = path->nodes[0]; recow = 1; }
} if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are move off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion. + */ + path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); + + leaf = path->nodes[0]; + /* + * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that + * is, its contents got pushed to its neighbors), in which case + * it means path->locks[0] == 0 + */ + if (!ret && replace_extent && leafs_visited == 1 && + path->locks[0] && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; + } } + if (!replace_extent || !(*key_inserted)) + btrfs_release_path(path); if (drop_end) *drop_end = found ? min(end, extent_end) : end; - btrfs_release_path(path); return ret; } @@ -949,7 +985,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, - drop_cache); + drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -2219,7 +2255,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, - &drop_end, 1); + &drop_end, 1, 0, 0, NULL); if (ret != -ENOSPC) break; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ffb23e50676..23f18eb5fb5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,13 +125,12 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, int compress_type, struct page **compressed_pages) { - struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -140,29 +139,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, int err = 0; int ret; size_t cur_size = size; - size_t datasize; unsigned long offset; if (compressed_size && compressed_pages) cur_size = compressed_size; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + inode_add_bytes(inode, size); - path->leave_spinning = 1; + if (!extent_inserted) { + struct btrfs_key key; + size_t datasize; - key.objectid = btrfs_ino(inode); - key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(cur_size); + key.objectid = btrfs_ino(inode); + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - inode_add_bytes(inode, size); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (ret) { - err = ret; - goto fail; + datasize = btrfs_file_extent_calc_inline_size(cur_size); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (ret) { + err = ret; + goto fail; + } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -203,7 +202,7 @@ static noinline 
int insert_inline_extent(struct btrfs_trans_handle *trans, page_cache_release(page); } btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); + btrfs_release_path(path); /* * we're an inline extent, so nobody can @@ -219,7 +218,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, return ret; fail: - btrfs_free_path(path); return err; } @@ -242,6 +240,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; + struct btrfs_path *path; + int extent_inserted = 0; + u32 extent_item_size; if (compressed_size) data_len = compressed_size; @@ -256,12 +257,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, return 1; } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_free_path(path); return PTR_ERR(trans); + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); + if (compressed_size && compressed_pages) + extent_item_size = btrfs_file_extent_calc_inline_size( + compressed_size); + else + extent_item_size = btrfs_file_extent_calc_inline_size( + inline_len); + + ret = __btrfs_drop_extents(trans, root, inode, path, + start, aligned_end, NULL, + 1, 1, extent_item_size, &extent_inserted); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -269,7 +285,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, root, inode, start, + ret = insert_inline_extent(trans, path, extent_inserted, + root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { @@ -284,6 +301,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; } @@ -1841,14 +1859,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + int extent_inserted = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; - /* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want @@ -1858,17 +1875,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. 
*/ - ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, 0); + ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, + file_pos + num_bytes, NULL, 0, + 1, sizeof(*fi), &extent_inserted); if (ret) goto out; - ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; - ins.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - if (ret) - goto out; + if (!extent_inserted) { + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*fi)); + if (ret) + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ba2f15109da..b561e7a4007 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3495,21 +3495,27 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0); - if (ret) - return ret; + int extent_inserted = 0; INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = em->start; - ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); if (ret) return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); -- cgit v1.2.3-70-g09d2 From 63541927c8d11d2686778b1e8ec71c14b4fd53e4 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 7 Jan 2014 11:47:46 +0000 Subject: Btrfs: add support for inode properties This change adds infrastructure to allow for generic properties for inodes. Properties are name/value pairs that can be associated with inodes for different purposes. They are stored as xattrs with the prefix "btrfs." Properties can be inherited - this means when a directory inode has inheritable properties set, these are added to new inodes created under that directory. Further, subvolumes can also have properties associated with them, and they can be inherited from their parent subvolume. Naturally, directory properties have priority over subvolume properties (in practice a subvolume property is just a regular property associated with the root inode, objectid 256, of the subvolume's fs tree). This change also adds one specific property implementation, named "compression", whose values can be "lzo" or "zlib" and it's an inheritable property. The corresponding changes to btrfs-progs were also implemented. A patch with xfstests for this feature will follow once there's agreement on this change/feature. Further, the script at the bottom of this commit message was used to do some benchmarks to measure any performance penalties of this feature. 
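As a usage sketch only (the path is hypothetical; the command form is the one used by the test script at the end of this message), setting the inheritable compression property on a mounted subvolume looks like:
 # btrfs prop set /mnt compression lzo
New files created under that subvolume then inherit the property, stored as an xattr with the "btrfs." prefix described above.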
Basically the tests correspond to:

Test 1 - create a filesystem and mount it with compress-force=lzo, then sequentially create N files of 64Kb each, measure how long it took to create the files, unmount the filesystem, mount the filesystem and perform an 'ls -lha' against the test directory holding the N files, and report the time the command took.

Test 2 - create a filesystem and don't use any compression option when mounting it - instead set the compression property of the subvolume's root to 'lzo'. Then create N files of 64Kb, and report the time it took. Then unmount the filesystem, mount it again and perform an 'ls -lha' like in the former test. This means every single file ends up with a property (xattr) associated to it.

Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the compression property and have no real effect other than adding more work when inheriting properties and taking more btree leaf space.

Test 4 - same as test 3 but with 10 properties per file.

Results (in seconds, and averages of 5 runs each), for different N numbers of files follow.

* Without properties (test 1)
                      file creation time    ls -lha time
  10 000 files               3.49               0.76
  100 000 files             47.19               8.37
  1 000 000 files          518.51             107.06

* With 1 property (compression property set to lzo - test 2)
                      file creation time    ls -lha time
  10 000 files               3.63               0.93
  100 000 files             48.56               9.74
  1 000 000 files          537.72             125.11

* With 4 properties (test 3)
                      file creation time    ls -lha time
  10 000 files               3.94               1.20
  100 000 files             52.14              11.48
  1 000 000 files          572.70             142.13

* With 10 properties (test 4)
                      file creation time    ls -lha time
  10 000 files               4.61               1.35
  100 000 files             58.86              13.83
  1 000 000 files          656.01             177.61

The increased latencies with properties are essentially because of:

*) When creating an inode, we now synchronously write 1 more item (an xattr item) for each property inherited from the parent dir (or subvolume). This could be done in an asynchronous way such as we do for dir index items (delayed-inode.c), which could help reduce the file creation latency;

*) With properties, we now have larger fs trees. For this particular test each xattr item uses 75 bytes of leaf space in the fs tree. This could be less by using a new item for xattr items, instead of the current btrfs_dir_item, since we could cut the 'location' and 'type' fields (saving 18 bytes) and maybe 'transid' too (saving a total of 26 bytes per xattr item) from the btrfs_dir_item type.

Also tried batching the xattr insertions (ignoring proper hash collision handling, since it didn't exist) when creating files that inherit properties from their parent inode/subvolume, but the end results were (surprisingly) essentially the same.

Test script: $ cat test.pl #!/usr/bin/perl -w use strict; use Time::HiRes qw(time); use constant NUM_FILES => 10_000; use constant FILE_SIZES => (64 * 1024); use constant DEV => '/dev/sdb4'; use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev'; use constant TEST_DIR => (MNT_POINT . '/testdir'); system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!"; # following line for testing without properties #system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!"; # following 2 lines for testing with properties system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!"; system("mkdir", TEST_DIR) == 0 or die "mkdir failed!"; my ($t1, $t2); $t1 = time(); for (my $i = 1; $i <= NUM_FILES; $i++) { my $p = TEST_DIR .
'/file_' . $i; open(my $f, '>', $p) or die "Error opening file!"; $f->autoflush(1); for (my $j = 0; $j < FILE_SIZES; $j += 4096) { print $f ('A' x 4096) or die "Error writing to file!"; } close($f); } $t2 = time(); print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; $t1 = time(); system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!"; $t2 = time(); print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/ctree.h | 4 +- fs/btrfs/inode.c | 42 ++++- fs/btrfs/ioctl.c | 19 +- fs/btrfs/props.c | 427 +++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/props.h | 42 +++++ fs/btrfs/super.c | 3 + fs/btrfs/xattr.c | 12 +- include/uapi/linux/xattr.h | 3 + 10 files changed, 545 insertions(+), 10 deletions(-) create mode 100644 fs/btrfs/props.c create mode 100644 fs/btrfs/props.h (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 1a44e42d602..af7f000e905 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o + uuid-tree.o props.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 661b0ac90e8..8fed2125689 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -43,6 +43,7 @@ #define BTRFS_INODE_COPY_EVERYTHING 8 #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 +#define BTRFS_INODE_HAS_PROPS 11 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f52a60b9eba..3cebb4aeddc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3703,7 +3703,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid); + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid); int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 23f18eb5fb5..1ea19cea96d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,6 +58,7 @@ #include "inode-map.h" #include "backref.h" #include "hash.h" +#include "props.h" struct btrfs_iget_args { u64 ino; @@ -3265,7 +3266,8 @@ out: * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid) + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3281,6 +3283,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, } slot++; + *first_xattr_slot = -1; while (slot < nritems) { 
btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3290,6 +3293,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* we found an xattr, assume we've got an acl */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) return 1; @@ -3318,6 +3323,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, * something larger than an xattr. We have to assume the inode * has acls */ + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; return 1; } @@ -3337,6 +3344,7 @@ static void btrfs_read_locked_inode(struct inode *inode) u32 rdev; int ret; bool filled = false; + int first_xattr_slot; ret = btrfs_fill_inode(inode, &rdev); if (!ret) @@ -3346,7 +3354,6 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!path) goto make_bad; - path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -3429,12 +3436,21 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(inode)); + btrfs_ino(inode), &first_xattr_slot); + if (first_xattr_slot != -1) { + path->slots[0] = first_xattr_slot; + ret = btrfs_load_inode_props(inode, path); + if (ret) + btrfs_err(root->fs_info, + "error loading props for ino %llu (root %llu): %d\n", + btrfs_ino(inode), + root->root_key.objectid, ret); + } + btrfs_free_path(path); + if (!maybe_acls) cache_no_acl(inode); - btrfs_free_path(path); - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; @@ -5607,6 +5623,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); + ret = btrfs_inode_inherit_props(trans, inode, dir); + if (ret) + btrfs_err(root->fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(inode), root->root_key.objectid, ret); + return inode; fail: if (dir) @@ -7889,7 +7911,9 @@ out: * create a new subvolume directory/inode (helper for the ioctl). 
*/ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid) + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid) { struct inode *inode; int err; @@ -7907,6 +7931,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + err = btrfs_subvol_inherit_props(trans, new_root, parent_root); + if (err) + btrfs_err(new_root->fs_info, + "error inheriting subvolume %llu properties: %d\n", + new_root->root_key.objectid, err); + err = btrfs_update_inode(trans, new_root, inode); iput(inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ed3edc28325..3970f32b2b8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -56,6 +56,7 @@ #include "rcu-string.h" #include "send.h" #include "dev-replace.h" +#include "props.h" #include "sysfs.h" static int btrfs_clone(struct inode *src, struct inode *inode, @@ -281,9 +282,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (flags & FS_NOCOMP_FL) { ip->flags &= ~BTRFS_INODE_COMPRESS; ip->flags |= BTRFS_INODE_NOCOMPRESS; + + ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; } else if (flags & FS_COMPR_FL) { + const char *comp; + ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + + if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + comp = "lzo"; + else + comp = "zlib"; + ret = btrfs_set_prop(inode, "btrfs.compression", + comp, strlen(comp), 0); + if (ret) + goto out_drop; + } else { ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } @@ -502,7 +519,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c new file mode 100644 index 00000000000..129b1dd2852 --- /dev/null +++ b/fs/btrfs/props.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include "props.h" +#include "btrfs_inode.h" +#include "hash.h" +#include "transaction.h" +#include "xattr.h" + +#define BTRFS_PROP_HANDLERS_HT_BITS 8 +static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); + +struct prop_handler { + struct hlist_node node; + const char *xattr_name; + int (*validate)(const char *value, size_t len); + int (*apply)(struct inode *inode, const char *value, size_t len); + const char *(*extract)(struct inode *inode); + int inheritable; +}; + +static int prop_compression_validate(const char *value, size_t len); +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len); +static const char *prop_compression_extract(struct inode *inode); + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .inheritable = 1 + }, + { + .xattr_name = NULL + } +}; + +void __init btrfs_props_init(void) +{ + struct prop_handler *p; + + hash_init(prop_handlers_ht); + + for (p = &prop_handlers[0]; p->xattr_name; p++) { + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); + + hash_add(prop_handlers_ht, &p->node, h); + } +} + +static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) +{ + struct hlist_head *h; + + h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)]; + if (hlist_empty(h)) + return NULL; + + return h; +} + +static const struct prop_handler * +find_prop_handler(const char *name, + const struct hlist_head *handlers) +{ + struct prop_handler *h; + + if (!handlers) { + u64 hash = btrfs_name_hash(name, strlen(name)); + + handlers = find_prop_handlers_by_hash(hash); + if (!handlers) + return NULL; + } + + hlist_for_each_entry(h, handlers, node) + if (!strcmp(h->xattr_name, name)) + return h; + + return NULL; +} + +static int __btrfs_set_prop(struct btrfs_trans_handle *trans, + struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) + return -EINVAL; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + + if (value_len == 0) { + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + if (ret) + return ret; + + ret = handler->apply(inode, NULL, 0); + ASSERT(ret == 0); + + return ret; + } + + ret = handler->validate(value, value_len); + if (ret) + return ret; + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + value, value_len, flags); + if (ret) + return ret; + ret = handler->apply(inode, value, value_len); + if (ret) { + __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + return ret; + } + + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + + return 0; +} + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + return __btrfs_set_prop(NULL, inode, name, value, value_len, flags); +} + +static int iterate_object_props(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, + void (*iterator)(void *, + const struct prop_handler *, + const char *, + size_t), + void *ctx) +{ + int ret; + char *name_buf = NULL; + char *value_buf = NULL; + int name_buf_len = 0; + int value_buf_len = 0; + + while (1) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + u32 total_len, cur, 
this_len; + int slot; + const struct hlist_head *handlers; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid) + break; + if (key.type != BTRFS_XATTR_ITEM_KEY) + break; + + handlers = find_prop_handlers_by_hash(key.offset); + if (!handlers) + goto next_slot; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; + total_len = btrfs_item_size_nr(leaf, slot); + + while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); + u32 data_len = btrfs_dir_data_len(leaf, di); + unsigned long name_ptr, data_ptr; + const struct prop_handler *handler; + + this_len = sizeof(*di) + name_len + data_len; + name_ptr = (unsigned long)(di + 1); + data_ptr = name_ptr + name_len; + + if (name_len <= XATTR_BTRFS_PREFIX_LEN || + memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, + name_ptr, + XATTR_BTRFS_PREFIX_LEN)) + goto next_dir_item; + + if (name_len >= name_buf_len) { + kfree(name_buf); + name_buf_len = name_len + 1; + name_buf = kmalloc(name_buf_len, GFP_NOFS); + if (!name_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, name_buf, name_ptr, name_len); + name_buf[name_len] = '\0'; + + handler = find_prop_handler(name_buf, handlers); + if (!handler) + goto next_dir_item; + + if (data_len > value_buf_len) { + kfree(value_buf); + value_buf_len = data_len; + value_buf = kmalloc(data_len, GFP_NOFS); + if (!value_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, value_buf, data_ptr, data_len); + + iterator(ctx, handler, value_buf, data_len); +next_dir_item: + cur += this_len; + di = (struct btrfs_dir_item *)((char *) di + this_len); + } + +next_slot: + path->slots[0]++; + } + + ret = 0; +out: + btrfs_release_path(path); + kfree(name_buf); + kfree(value_buf); + + return ret; +} + +static void inode_prop_iterator(void *ctx, + const struct prop_handler *handler, + const char *value, + size_t len) +{ + struct inode *inode = ctx; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = handler->apply(inode, value, len); + if (unlikely(ret)) + btrfs_warn(root->fs_info, + "error applying prop %s to ino %llu (root %llu): %d", + handler->xattr_name, btrfs_ino(inode), + root->root_key.objectid, ret); + else + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); +} + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino = btrfs_ino(inode); + int ret; + + ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); + + return ret; +} + +static int inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *parent) +{ + const struct prop_handler *h; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!test_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(parent)->runtime_flags)) + return 0; + + for (h = &prop_handlers[0]; h->xattr_name; h++) { + const char *value; + u64 num_bytes; + + if (!h->inheritable) + continue; + + value = h->extract(parent); + if (!value) + continue; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + num_bytes, BTRFS_RESERVE_NO_FLUSH); + if (ret) + goto out; + ret = __btrfs_set_prop(trans, inode, h->xattr_name, + value, strlen(value), 0); + btrfs_block_rsv_release(root, trans->block_rsv, num_bytes); 
+ if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir) +{ + if (!dir) + return 0; + + return inherit_props(trans, inode, dir); +} + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root) +{ + struct btrfs_key key; + struct inode *parent_inode, *child_inode; + int ret; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, + parent_root, NULL); + if (IS_ERR(parent_inode)) + return PTR_ERR(parent_inode); + + child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(child_inode)) { + iput(parent_inode); + return PTR_ERR(child_inode); + } + + ret = inherit_props(trans, child_inode, parent_inode); + iput(child_inode); + iput(parent_inode); + + return ret; +} + +static int prop_compression_validate(const char *value, size_t len) +{ + if (!strncmp("lzo", value, len)) + return 0; + else if (!strncmp("zlib", value, len)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len) +{ + int type; + + if (len == 0) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, len)) + type = BTRFS_COMPRESS_LZO; + else if (!strncmp("zlib", value, len)) + type = BTRFS_COMPRESS_ZLIB; + else + return -EINVAL; + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = type; + + return 0; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->force_compress) { + case BTRFS_COMPRESS_ZLIB: + return "zlib"; + case BTRFS_COMPRESS_LZO: + return "lzo"; + } + + return NULL; +} diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h new file mode 100644 index 00000000000..100f18829d5 --- /dev/null +++ b/fs/btrfs/props.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_PROPS_H +#define __BTRFS_PROPS_H + +#include "ctree.h" + +void __init btrfs_props_init(void); + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags); + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir); + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 16d7fc751ba..461e41cb8ca 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,7 @@ #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" +#include "props.h" #include "xattr.h" #include "volumes.h" #include "export.h" @@ -1865,6 +1866,8 @@ static int __init init_btrfs_fs(void) { int err; + btrfs_props_init(); + err = btrfs_init_sysfs(); if (err) return err; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 05740b9789e..4b33765add3 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -27,6 +27,7 @@ #include "transaction.h" #include "xattr.h" #include "disk-io.h" +#include "props.h" ssize_t __btrfs_getxattr(struct inode *inode, const char *name, @@ -331,7 +332,8 @@ static bool btrfs_is_valid_xattr(const char *name) XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || + !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, @@ -373,6 +375,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + value, size, flags); + if (size == 0) value = ""; /* empty EA, do not remove */ @@ -402,6 +408,10 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + NULL, 0, XATTR_REPLACE); + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); } diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index e4629b93bdd..40bbc04b6f8 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -20,6 +20,9 @@ #define XATTR_MAC_OSX_PREFIX "osx." #define XATTR_MAC_OSX_PREFIX_LEN (sizeof(XATTR_MAC_OSX_PREFIX) - 1) +#define XATTR_BTRFS_PREFIX "btrfs." +#define XATTR_BTRFS_PREFIX_LEN (sizeof(XATTR_BTRFS_PREFIX) - 1) + #define XATTR_SECURITY_PREFIX "security." #define XATTR_SECURITY_PREFIX_LEN (sizeof(XATTR_SECURITY_PREFIX) - 1) -- cgit v1.2.3-70-g09d2 From 5039eddc19aee8c894191c24f2dde4e645ca1bbb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 15 Jan 2014 13:34:13 -0500 Subject: Btrfs: make fsync latency less sucky Looking into some performance related issues with large amounts of metadata revealed that we can have some pretty huge swings in fsync() performance. 
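(Aside, not part of the patch: such swings are easy to observe with a small userspace harness that times each fsync() call in a write/fsync loop and reports the worst case. The file path below is hypothetical; the idea is to run it on a btrfs mount and compare kernels with and without this change.)

/*
 * Illustrative only, not part of the patch. Appends a 4K block and
 * fsyncs it repeatedly, printing the worst-case fsync() latency seen.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/mnt/btrfs/fsync-test";   /* hypothetical path */
	char buf[4096];
	double worst = 0.0;
	int fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(buf, 'a', sizeof(buf));

	for (int i = 0; i < 1000; i++) {
		struct timespec t1, t2;
		double ms;

		if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
			perror("write");
			return 1;
		}
		clock_gettime(CLOCK_MONOTONIC, &t1);
		if (fsync(fd) != 0) {
			perror("fsync");
			return 1;
		}
		clock_gettime(CLOCK_MONOTONIC, &t2);
		ms = (t2.tv_sec - t1.tv_sec) * 1000.0 +
		     (t2.tv_nsec - t1.tv_nsec) / 1e6;
		if (ms > worst)
			worst = ms;
	}
	printf("worst fsync latency: %.3f ms\n", worst);
	close(fd);
	return 0;
}
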
If we have a lot of delayed refs backed up (as you will tend to do with lots of metadata) fsync() will wander off and try to run some of those delayed refs which can result in reading from disk and such. Since the actual act of fsync() doesn't create any delayed refs there is no need to make it throttle on delayed ref stuff, that will be handled by other people. With this patch we get much smoother fsync performance with large amounts of metadata. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 12 ++++++++++++ fs/btrfs/transaction.c | 3 ++- fs/btrfs/transaction.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 030012e1710..72df63b0c79 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1928,12 +1928,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (file->private_data) btrfs_ioctl_trans_end(file); + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking join'ers. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. + */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); mutex_unlock(&inode->i_mutex); goto out; } + trans->sync = true; ret = btrfs_log_dentry_safe(trans, root, dentry); if (ret < 0) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index da2ac4c6d78..b16352ce0f7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -474,6 +474,7 @@ again: h->type = type; h->allocating_chunk = false; h->reloc_reserved = false; + h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); @@ -713,7 +714,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); trans->delayed_ref_updates = 0; - if (btrfs_should_throttle_delayed_refs(trans, root)) { + if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) { cur = max_t(unsigned long, cur, 1); trans->delayed_ref_updates = 0; btrfs_run_delayed_refs(trans, root, cur); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d05b6013fea..6ac037e9f9f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -93,6 +93,7 @@ struct btrfs_trans_handle { short adding_csums; bool allocating_chunk; bool reloc_reserved; + bool sync; unsigned int type; /* * this root is only needed to validate that the root passed to -- cgit v1.2.3-70-g09d2 From d7df2c796d7eedd72a334dc89c65e1fec8171431 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 23 Jan 2014 09:21:38 -0500 Subject: Btrfs: attach delayed ref updates to delayed ref heads Currently we have two rb-trees, one for delayed ref heads and one for all of the delayed refs, including the delayed ref heads. When we process the delayed refs we have to hold onto the delayed ref lock for all of the selecting and merging and such, which results in quite a bit of lock contention. 
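(Aside, not part of the patch: the restructuring described in the rest of this message boils down to taking a short-lived global lock only to pick a head, and doing everything else under that head's own lock. A rough userspace analogy of that locking split, with entirely made-up names and no kernel code, built with 'cc -pthread', looks like this.)

/*
 * Illustrative only. A global lock is held just long enough to locate a
 * head; queuing work on the head then contends only on that head's lock.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NHEADS 4
#define UPDATES_PER_THREAD 100000

struct head {
	pthread_mutex_t lock;   /* analogue of the per-head lock */
	long pending;           /* stand-in for the per-head tree of updates */
};

static pthread_mutex_t heads_lock = PTHREAD_MUTEX_INITIALIZER; /* analogue of delayed_refs->lock */
static struct head heads[NHEADS];

static void add_update(long key)
{
	struct head *h;

	/* global lock only to find the right head */
	pthread_mutex_lock(&heads_lock);
	h = &heads[key % NHEADS];
	pthread_mutex_unlock(&heads_lock);

	/* all further work on this head serializes only with this head */
	pthread_mutex_lock(&h->lock);
	h->pending++;
	pthread_mutex_unlock(&h->lock);
}

static void *worker(void *arg)
{
	long base = (long)(intptr_t)arg;

	for (long i = 0; i < UPDATES_PER_THREAD; i++)
		add_update(base + i);
	return NULL;
}

int main(void)
{
	pthread_t threads[2];
	long total = 0;

	for (int i = 0; i < NHEADS; i++)
		pthread_mutex_init(&heads[i].lock, NULL);
	for (int i = 0; i < 2; i++)
		pthread_create(&threads[i], NULL, worker, (void *)(intptr_t)i);
	for (int i = 0; i < 2; i++)
		pthread_join(threads[i], NULL);

	for (int i = 0; i < NHEADS; i++)
		total += heads[i].pending;
	printf("queued %ld updates across %d heads\n", total, NHEADS);
	return 0;
}

The only point of the analogy is that threads working on different heads no longer serialize on one global lock, which is the contention this patch removes.
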
This was solved by having a waitqueue and only one flusher at a time, however this hurts if we get a lot of delayed refs queued up. So instead just have an rb tree for the delayed ref heads, and then attach the delayed ref updates to an rb tree that is per delayed ref head. Then we only need to take the delayed ref lock when adding new delayed refs and when selecting a delayed ref head to process, all the rest of the time we deal with a per delayed ref head lock which will be much less contentious. The locking rules for this get a little more complicated since we have to lock up to 3 things to properly process delayed refs, but I will address that problem later. For now this passes all of xfstests and my overnight stress tests. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 23 ++-- fs/btrfs/delayed-ref.c | 223 +++++++++++++++++----------------- fs/btrfs/delayed-ref.h | 23 ++-- fs/btrfs/disk-io.c | 79 ++++++------ fs/btrfs/extent-tree.c | 317 ++++++++++++++++--------------------------------- fs/btrfs/transaction.c | 7 +- 6 files changed, 267 insertions(+), 405 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 835b6c9a26a..34a8952de8d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -538,14 +538,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(&op_key, &extent_op->key); - while ((n = rb_prev(n))) { + spin_lock(&head->lock); + n = rb_first(&head->ref_root); + while (n) { struct btrfs_delayed_ref_node *node; node = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - if (node->bytenr != head->node.bytenr) - break; - WARN_ON(node->is_head); - + n = rb_next(n); if (node->seq > seq) continue; @@ -612,10 +611,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, WARN_ON(1); } if (ret) - return ret; + break; } - - return 0; + spin_unlock(&head->lock); + return ret; } /* @@ -882,15 +881,15 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } + spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, &prefs_delayed); mutex_unlock(&head->mutex); - if (ret) { - spin_unlock(&delayed_refs->lock); + if (ret) goto out; - } + } else { + spin_unlock(&delayed_refs->lock); } - spin_unlock(&delayed_refs->lock); } if (path->slots[0]) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index fab60c1ba06..f3bff89eecf 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -269,39 +269,38 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - rb_erase(&ref->rb_node, &delayed_refs->root); if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; - head = btrfs_delayed_node_to_head(ref); rb_erase(&head->href_node, &delayed_refs->href_root); + } else { + assert_spin_locked(&head->lock); + rb_erase(&ref->rb_node, &head->ref_root); } ref->in_tree = 0; btrfs_put_delayed_ref(ref); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); if (trans->delayed_ref_updates) trans->delayed_ref_updates--; } static int merge_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) { struct rb_node *node; - int merged = 0; int mod 
= 0; int done = 0; - node = rb_prev(&ref->rb_node); - while (node) { + node = rb_next(&ref->rb_node); + while (!done && node) { struct btrfs_delayed_ref_node *next; next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_prev(node); - if (next->bytenr != ref->bytenr) - break; + node = rb_next(node); if (seq && next->seq >= seq) break; if (comp_entry(ref, next, 0)) @@ -321,12 +320,11 @@ static int merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - merged++; - drop_delayed_ref(trans, delayed_refs, next); + drop_delayed_ref(trans, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, ref); - break; + drop_delayed_ref(trans, delayed_refs, head, ref); + done = 1; } else { /* * You can't have multiples of the same ref on a tree @@ -335,13 +333,8 @@ static int merge_ref(struct btrfs_trans_handle *trans, WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } - - if (done) - break; - node = rb_prev(&ref->rb_node); } - - return merged; + return done; } void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, @@ -352,6 +345,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; u64 seq = 0; + assert_spin_locked(&head->lock); /* * We don't have too much refs to merge in the case of delayed data * refs. @@ -369,22 +363,19 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, } spin_unlock(&fs_info->tree_mod_seq_lock); - node = rb_prev(&head->node.rb_node); + node = rb_first(&head->ref_root); while (node) { struct btrfs_delayed_ref_node *ref; ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - /* We can't merge refs that are outside of our seq count */ if (seq && ref->seq >= seq) break; - if (merge_ref(trans, delayed_refs, ref, seq)) - node = rb_prev(&head->node.rb_node); + if (merge_ref(trans, delayed_refs, head, ref, seq)) + node = rb_first(&head->ref_root); else - node = rb_prev(node); + node = rb_next(&ref->rb_node); } } @@ -412,64 +403,52 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 start) +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans) { - int count = 0; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_head *head; + u64 start; + bool loop = false; delayed_refs = &trans->transaction->delayed_refs; - node = rb_first(&delayed_refs->href_root); - if (start) { - find_ref_head(&delayed_refs->href_root, start + 1, &head, 1); - if (head) - node = &head->href_node; - } again: - while (node && count < 32) { - head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. 
Stop now if we've hit - * a head that was already in use - */ - break; - } - node = rb_next(node); - } - if (count) { - return 0; - } else if (start) { - /* - * we've gone to the end of the rbtree without finding any - * clusters. start from the beginning and try again - */ + start = delayed_refs->run_delayed_start; + head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + if (!head && !loop) { + delayed_refs->run_delayed_start = 0; start = 0; - node = rb_first(&delayed_refs->href_root); - goto again; + loop = true; + head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + if (!head) + return NULL; + } else if (!head && loop) { + return NULL; } - return 1; -} -void btrfs_release_ref_cluster(struct list_head *cluster) -{ - struct list_head *pos, *q; + while (head->processing) { + struct rb_node *node; + + node = rb_next(&head->href_node); + if (!node) { + if (loop) + return NULL; + delayed_refs->run_delayed_start = 0; + start = 0; + loop = true; + goto again; + } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + } - list_for_each_safe(pos, q, cluster) - list_del_init(pos); + head->processing = 1; + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + delayed_refs->run_delayed_start = head->node.bytenr + + head->node.num_bytes; + return head; } /* @@ -483,6 +462,7 @@ void btrfs_release_ref_cluster(struct list_head *cluster) static noinline void update_existing_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { @@ -495,7 +475,7 @@ update_existing_ref(struct btrfs_trans_handle *trans, */ existing->ref_mod--; if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, existing); + drop_delayed_ref(trans, delayed_refs, head, existing); else WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); @@ -565,9 +545,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, } } /* - * update the reference mod on the head to reflect this new operation + * update the reference mod on the head to reflect this new operation, + * only need the lock for this case cause we could be processing it + * currently, for refs we just added we know we're a-ok. */ + spin_lock(&existing_ref->lock); existing->ref_mod += update->ref_mod; + spin_unlock(&existing_ref->lock); } /* @@ -575,13 +559,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. 
*/ -static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, - int action, int is_data) +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, int action, int is_data) { - struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; @@ -628,39 +612,43 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; + head_ref->ref_root = RB_ROOT; + head_ref->processing = 0; - INIT_LIST_HEAD(&head_ref->cluster); + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); trace_add_delayed_ref_head(ref, head_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + existing = htree_insert(&delayed_refs->href_root, + &head_ref->href_node); if (existing) { - update_existing_head_ref(existing, ref); + update_existing_head_ref(&existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + head_ref = existing; } else { - htree_insert(&delayed_refs->href_root, &head_ref->href_node); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + return head_ref; } /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - int for_cow) +static noinline void +add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, int level, + int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -696,30 +684,33 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + spin_unlock(&head_ref->lock); } /* * helper to insert a delayed data ref into the rbtree. 
*/ -static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 owner, u64 offset, - int action, int for_cow) +static noinline void +add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, u64 owner, + u64 offset, int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -757,19 +748,21 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + spin_unlock(&head_ref->lock); } /* @@ -808,10 +801,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 0); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 0); - add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); spin_unlock(&delayed_refs->lock); @@ -856,10 +849,10 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 1); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 1); - add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a54c9d47918..4ba9b93022f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -81,7 +81,8 @@ struct btrfs_delayed_ref_head { */ struct mutex mutex; - struct list_head cluster; + spinlock_t lock; + struct rb_root ref_root; struct rb_node href_node; @@ -100,6 +101,7 @@ struct btrfs_delayed_ref_head { */ unsigned int must_insert_reserved:1; unsigned int is_data:1; + unsigned int processing:1; }; struct btrfs_delayed_tree_ref { @@ -118,8 +120,6 @@ struct btrfs_delayed_data_ref { }; struct btrfs_delayed_ref_root { - struct rb_root root; - /* head ref rbtree */ struct rb_root href_root; @@ -129,7 +129,7 @@ struct btrfs_delayed_ref_root { /* how many delayed ref updates we've queued, used by the * throttling code */ - unsigned long num_entries; + atomic_t num_entries; /* total number of head nodes in tree */ unsigned long num_heads; @@ -137,15 +137,6 @@ struct btrfs_delayed_ref_root { /* total number of head nodes ready for processing */ unsigned long 
num_heads_ready; - /* - * bumped when someone is making progress on the delayed - * refs, so that other procs know they are just adding to - * contention intead of helping - */ - atomic_t procs_running_refs; - atomic_t ref_seq; - wait_queue_head_t wait; - /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need @@ -231,9 +222,9 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) mutex_unlock(&head->mutex); } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 search_start); -void btrfs_release_ref_cluster(struct list_head *cluster); + +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9850a51a3f1..ed23127a4b0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3801,58 +3801,55 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; spin_lock(&delayed_refs->lock); - if (delayed_refs->num_entries == 0) { + if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); btrfs_info(root->fs_info, "delayed_refs has NO entry"); return ret; } - while ((node = rb_first(&delayed_refs->root)) != NULL) { - struct btrfs_delayed_ref_head *head = NULL; + while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; bool pin_bytes = false; - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - atomic_set(&ref->refs, 1); - if (btrfs_delayed_ref_is_head(ref)) { + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); - head = btrfs_delayed_node_to_head(ref); - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - /* Need to wait for the delayed ref to run */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - - spin_lock(&delayed_refs->lock); - continue; - } - - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - delayed_refs->num_heads--; - if (list_empty(&head->cluster)) - delayed_refs->num_heads_ready--; - list_del_init(&head->cluster); - } - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - if (head) - rb_erase(&head->href_node, &delayed_refs->href_root); - - delayed_refs->num_entries--; - spin_unlock(&delayed_refs->lock); - if (head) { - if (pin_bytes) - btrfs_pin_extent(root, ref->bytenr, - ref->num_bytes, 1); + mutex_lock(&head->mutex); mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + spin_lock(&delayed_refs->lock); + continue; + } + spin_lock(&head->lock); + while ((node = rb_first(&head->ref_root)) != NULL) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + ref->in_tree = 0; + rb_erase(&ref->rb_node, &head->ref_root); + atomic_dec(&delayed_refs->num_entries); + btrfs_put_delayed_ref(ref); + cond_resched_lock(&head->lock); } - btrfs_put_delayed_ref(ref); + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; + atomic_dec(&delayed_refs->num_entries); + head->node.in_tree = 0; + 
rb_erase(&head->href_node, &delayed_refs->href_root); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + if (pin_bytes) + btrfs_pin_extent(root, head->node.bytenr, + head->node.num_bytes, 1); + btrfs_put_delayed_ref(&head->node); cond_resched(); spin_lock(&delayed_refs->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 77acc087389..c77156c77de 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -857,12 +857,14 @@ again: btrfs_put_delayed_ref(&head->node); goto search_again; } + spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; else BUG_ON(num_refs == 0); num_refs += head->node.ref_mod; + spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); @@ -2287,40 +2289,33 @@ static noinline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: + struct btrfs_delayed_ref_node *ref, *last = NULL;; + /* * select delayed ref of type BTRFS_ADD_DELAYED_REF first. * this prevents ref count from going down to zero when * there still are pending delayed ref. */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; + node = rb_first(&head->ref_root); + while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) + if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - node = rb_prev(node); - } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; + else if (last == NULL) + last = ref; + node = rb_next(node); } - return NULL; + return last; } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; @@ -2328,23 +2323,26 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; int ret; - int count = 0; + unsigned long count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; while (1) { if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) + if (count >= nr) break; - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } /* grab the lock that says we are going to process * all the refs for this head */ ret = btrfs_delayed_ref_lock(trans, locked_ref); - + spin_unlock(&delayed_refs->lock); /* * we may have dropped the spin lock to get the head * mutex lock, and that might have given someone else @@ -2365,6 +2363,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * finish. If we merged anything we need to re-loop so we can * get a good ref. 
*/ + spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, locked_ref); @@ -2376,17 +2375,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); + spin_unlock(&locked_ref->lock); btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + locked_ref = NULL; cond_resched(); - spin_lock(&delayed_refs->lock); continue; } @@ -2401,6 +2397,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, locked_ref->extent_op = NULL; if (!ref) { + + /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, * so that any accounting fixes can happen @@ -2413,8 +2411,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } if (extent_op) { - spin_unlock(&delayed_refs->lock); - + spin_unlock(&locked_ref->lock); ret = run_delayed_extent_op(trans, root, ref, extent_op); btrfs_free_delayed_extent_op(extent_op); @@ -2428,23 +2425,38 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, */ if (must_insert_reserved) locked_ref->must_insert_reserved = 1; + locked_ref->processing = 0; btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); - spin_lock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); return ret; } - - goto next; + continue; } - } - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - if (btrfs_delayed_ref_is_head(ref)) { + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. 
+ */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (rb_first(&locked_ref->ref_root)) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; + } + ref->in_tree = 0; + delayed_refs->num_heads--; rb_erase(&locked_ref->href_node, &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + ref->in_tree = 0; + rb_erase(&ref->rb_node, &locked_ref->ref_root); } - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); + if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the @@ -2461,20 +2473,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - } else { - list_del_init(&locked_ref->cluster); } - spin_unlock(&delayed_refs->lock); + spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); btrfs_free_delayed_extent_op(extent_op); if (ret) { + locked_ref->processing = 0; btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); - spin_lock(&delayed_refs->lock); return ret; } @@ -2490,11 +2500,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } btrfs_put_delayed_ref(ref); count++; -next: cond_resched(); - spin_lock(&delayed_refs->lock); } - return count; + return 0; } #ifdef SCRAMBLE_DELAYED_REFS @@ -2576,16 +2584,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } -static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, - int count) -{ - int val = atomic_read(&delayed_refs->ref_seq); - - if (val < seq || val >= seq + count) - return 1; - return 0; -} - static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; @@ -2647,12 +2645,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; - struct list_head cluster; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - int loops; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2664,121 +2659,31 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); if (count == 0) { - count = delayed_refs->num_entries * 2; + count = atomic_read(&delayed_refs->num_entries) * 2; run_most = 1; } - if (!run_all && !run_most) { - int old; - int seq = atomic_read(&delayed_refs->ref_seq); - -progress: - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - DEFINE_WAIT(__wait); - if (delayed_refs->flushing || - !btrfs_should_throttle_delayed_refs(trans, root)) - return 0; - - prepare_to_wait(&delayed_refs->wait, &__wait, - TASK_UNINTERRUPTIBLE); - - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - schedule(); - finish_wait(&delayed_refs->wait, &__wait); - - if (!refs_newer(delayed_refs, seq, 256)) - goto progress; - else - return 0; - } else { - finish_wait(&delayed_refs->wait, &__wait); - goto again; - } - } - - } else { - atomic_inc(&delayed_refs->procs_running_refs); - } - again: - loops = 0; - spin_lock(&delayed_refs->lock); - #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - - while (1) { - if (!(run_all || 
run_most) && - !btrfs_should_throttle_delayed_refs(trans, root)) - break; - - /* - * go find something we can process in the rbtree. We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - delayed_start = delayed_refs->run_delayed_start; - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - if (ret < 0) { - btrfs_release_ref_cluster(&cluster); - spin_unlock(&delayed_refs->lock); - btrfs_abort_transaction(trans, root, ret); - atomic_dec(&delayed_refs->procs_running_refs); - wake_up(&delayed_refs->wait); - return ret; - } - - atomic_add(ret, &delayed_refs->ref_seq); - - count -= min_t(unsigned long, ret, count); - - if (count == 0) - break; - - if (delayed_start >= delayed_refs->run_delayed_start) { - if (loops == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked), bail out. - */ - loops = 1; - } else { - /* - * no runnable refs left, stop trying - */ - BUG_ON(run_all); - break; - } - } - if (ret) { - /* refs were run, let's reset staleness detection */ - loops = 0; - } + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; } if (run_all) { - if (!list_empty(&trans->new_bgs)) { - spin_unlock(&delayed_refs->lock); + if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - spin_lock(&delayed_refs->lock); - } + spin_lock(&delayed_refs->lock); node = rb_first(&delayed_refs->href_root); - if (!node) + if (!node) { + spin_unlock(&delayed_refs->lock); goto out; + } count = (unsigned long)-1; while (node) { @@ -2807,16 +2712,10 @@ again: node = rb_next(node); } spin_unlock(&delayed_refs->lock); - schedule_timeout(1); + cond_resched(); goto again; } out: - atomic_dec(&delayed_refs->procs_running_refs); - smp_mb(); - if (waitqueue_active(&delayed_refs->wait)) - wake_up(&delayed_refs->wait); - - spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; } @@ -2858,12 +2757,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct rb_node *node; int ret = 0; - ret = -ENOENT; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } if (!mutex_trylock(&head->mutex)) { atomic_inc(&head->node.refs); @@ -2880,40 +2780,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return -EAGAIN; } + spin_unlock(&delayed_refs->lock); - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; + spin_lock(&head->lock); + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - data_ref = btrfs_delayed_node_to_data_ref(ref); + data_ref = btrfs_delayed_node_to_data_ref(ref); - node = rb_prev(node); - if (node) { - int 
seq = ref->seq; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr && ref->seq == seq) - goto out_unlock; + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: + spin_unlock(&head->lock); mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); return ret; } @@ -5953,8 +5848,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -5963,14 +5856,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!head) goto out; - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) + spin_lock(&head->lock); + if (rb_first(&head->ref_root)) goto out; if (head->extent_op) { @@ -5992,20 +5879,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * ahead and process it. */ head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); rb_erase(&head->href_node, &delayed_refs->href_root); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. 
*/ delayed_refs->num_heads--; - if (list_empty(&head->cluster)) + if (head->processing == 0) delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); + head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); @@ -6016,6 +5902,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return ret; out: + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); return 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b16352ce0f7..fd1446496fe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -62,7 +62,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.root)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); while (!list_empty(&transaction->pending_chunks)) { struct extent_map *em; @@ -184,9 +183,8 @@ loop: atomic_set(&cur_trans->use_count, 2); cur_trans->start_time = get_seconds(); - cur_trans->delayed_refs.root = RB_ROOT; cur_trans->delayed_refs.href_root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; + atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; @@ -206,9 +204,6 @@ loop: atomic64_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->delayed_refs.lock); - atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); - atomic_set(&cur_trans->delayed_refs.ref_seq, 0); - init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); -- cgit v1.2.3-70-g09d2 From 0a2b2a844af616addc87cac3cc18dcaba2a9d0fb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 23 Jan 2014 10:54:11 -0500 Subject: Btrfs: throttle delayed refs better On one of our gluster clusters we noticed some pretty big lag spikes. This turned out to be because our transaction commit was taking like 3 minutes to complete. This is because we have like 30 gigs of metadata, so our global reserve would end up being the max which is like 512 mb. So our throttling code would allow a ridiculous amount of delayed refs to build up and then they'd all get run at transaction commit time, and for a cold mounted file system that could take up to 3 minutes to run. So fix the throttling to be based on both the size of the global reserve and how long it takes us to run delayed refs. This patch tracks the time it takes to run delayed refs and then only allows 1 seconds worth of outstanding delayed refs at a time. This way it will auto-tune itself from cold cache up to when everything is in memory and it no longer has to go to disk. This makes our transaction commits take much less time to run. 
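To make the auto-tuning concrete, here is a minimal, self-contained C sketch of the rule described above: a 3:1 weighted average of the measured delayed-ref run time, plus a throttle test that admits roughly one second's worth of backlog. The 3:1 weighting, NSEC_PER_SEC and the NSEC_PER_SEC/64 starting value mirror the patch; the sample run times and the main() harness are invented, and this is a userspace illustration, not the kernel code itself.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* Old average weighted 3x against the new sample to damp swings. */
static uint64_t update_avg_runtime(uint64_t avg, uint64_t runtime_ns)
{
	return (avg * 3 + runtime_ns) / 4;
}

/* Throttle once the backlog would take roughly a second to drain. */
static int should_throttle(uint64_t num_entries, uint64_t avg_ns)
{
	return num_entries * avg_ns >= NSEC_PER_SEC;
}

int main(void)
{
	uint64_t avg = NSEC_PER_SEC / 64;	/* cold-mount starting value */
	uint64_t samples[] = { 2000000, 400000, 90000, 60000 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		avg = update_avg_runtime(avg, samples[i]);
		printf("avg %llu ns -> throttle above ~%llu entries\n",
		       (unsigned long long)avg,
		       (unsigned long long)(NSEC_PER_SEC / avg));
	}
	printf("200000 entries pending: %s\n",
	       should_throttle(200000, avg) ? "throttle" : "ok");
	return 0;
}

As the cache warms up the measured run time falls, the average follows it, and the number of delayed refs allowed to accumulate before throttling grows accordingly.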
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++++++++++++++- fs/btrfs/transaction.c | 4 ++-- 4 files changed, 46 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3cebb4aeddc..ca6bcc33d03 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1360,6 +1360,7 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + u64 avg_delayed_ref_runtime; /* * this is updated to the current trans every time a full commit @@ -3172,6 +3173,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ed23127a4b0..f0e7bbe1482 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - + fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c77156c77de..b5322596d60 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2322,8 +2322,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2452,6 +2454,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, &delayed_refs->href_root); spin_unlock(&delayed_refs->lock); } else { + actual_count++; ref->in_tree = 0; rb_erase(&ref->rb_node, &locked_ref->ref_root); } @@ -2502,6 +2505,26 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, count++; cond_resched(); } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. 
+ */ + spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + avg = div64_u64(avg, 4); + fs_info->avg_delayed_ref_runtime = avg; + spin_unlock(&delayed_refs->lock); + } return 0; } @@ -2600,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_rsv *global_rsv; @@ -2629,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, return ret; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fd1446496fe..5e2bfdaf8d1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -645,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->fs_info->global_block_rsv.space_info->full && - btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_check_space_for_delayed_refs(trans, root)) return 1; return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); @@ -710,7 +710,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->delayed_ref_updates = 0; if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) { - cur = max_t(unsigned long, cur, 1); + cur = max_t(unsigned long, cur, 32); trans->delayed_ref_updates = 0; btrfs_run_delayed_refs(trans, root, cur); } -- cgit v1.2.3-70-g09d2 From 580f0a678ebeba85d30b6a7f22ce32c472263c72 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 23 Jan 2014 16:03:45 -0500 Subject: Btrfs: fix extent_from_logical to deal with skinny metadata I don't think this is an issue and I've not seen it in practice but extent_from_logical will fail to find a skinny extent because it uses btrfs_previous_item and gives it the normal extent item type. This is just not a place to use btrfs_previous_item since we care about either normal extents or skinny extents, so open code btrfs_previous_item to properly check. This would only affect metadata and the only place this is used for metadata is scrub and I'm pretty sure it's just for printing stuff out, not actually doing any work so hopefully it was never a problem other than a cosmetic one. 
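The essential change, stepping backwards and accepting either extent item type rather than filtering on a single type the way btrfs_previous_item() does, can be sketched with a flat array standing in for the btree leaves. This is illustrative userspace code, not the btrfs implementation; only the two key type values correspond to BTRFS_EXTENT_ITEM_KEY and BTRFS_METADATA_ITEM_KEY, everything else is invented.

#include <stdio.h>

enum { EXTENT_ITEM = 168, METADATA_ITEM = 169, OTHER_ITEM = 1 };

struct key { unsigned long long objectid; int type; };

/*
 * Step backwards from 'slot' until an item of either wanted type is
 * found; return its slot, or -1 if nothing precedes it.
 */
static int previous_extent_item(const struct key *keys, int slot)
{
	while (slot > 0) {
		slot--;
		if (keys[slot].type == EXTENT_ITEM ||
		    keys[slot].type == METADATA_ITEM)
			return slot;
	}
	return -1;
}

int main(void)
{
	const struct key keys[] = {
		{ 4096,  EXTENT_ITEM },
		{ 8192,  METADATA_ITEM },	/* skinny metadata entry */
		{ 8192,  OTHER_ITEM },
		{ 12288, OTHER_ITEM },
	};
	int slot = previous_extent_item(keys, 3);

	if (slot >= 0)
		printf("found objectid %llu, type %d\n",
		       keys[slot].objectid, keys[slot].type);
	return 0;
}

A walk that filtered only on EXTENT_ITEM would skip the skinny entry at slot 1 and land on an unrelated extent, which is exactly the miss the patch closes.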
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 34a8952de8d..dcf2448c16f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1302,20 +1302,45 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_item(fs_info->extent_root, path, - 0, BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - return ret; - btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + while (1) { + u32 nritems; + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(fs_info->extent_root, path); + if (ret != 0) { + if (ret > 0) { + pr_debug("logical %llu is not within " + "any extent\n", logical); + ret = -ENOENT; + } + return ret; + } + } else { + path->slots[0]--; + } + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) { + pr_debug("logical %llu is not within any extent\n", + logical); + return -ENOENT; + } + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, + path->slots[0]); + if (found_key->type == BTRFS_EXTENT_ITEM_KEY || + found_key->type == BTRFS_METADATA_ITEM_KEY) + break; + } + if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->extent_root->leafsize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; - if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && - found_key->type != BTRFS_METADATA_ITEM_KEY) || - found_key->objectid > logical || + if (found_key->objectid > logical || found_key->objectid + size <= logical) { pr_debug("logical %llu is not within any extent\n", logical); return -ENOENT; -- cgit v1.2.3-70-g09d2 From 3a6d75e846224542151e9ff186cb89df5a6ca2c6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 23 Jan 2014 16:45:10 -0500 Subject: Btrfs: fix qgroup rescan to work with skinny metadata Could have sworn I fixed this before but apparently not. This makes us pass btrfs/022 with skinny metadata enabled. 
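The accounting rule the fix applies is small enough to state as one function: a skinny METADATA_ITEM key stores the tree level in its offset, so the byte count has to come from the node/leaf size, while an EXTENT_ITEM key carries the byte count in its offset. A stand-alone sketch, assuming only what the message says (the key type values are btrfs's, the rest is invented):

#include <stdio.h>

enum { EXTENT_ITEM = 168, METADATA_ITEM = 169 };

/* Bytes to charge a qgroup for one extent item found during rescan. */
static unsigned long long extent_num_bytes(int type,
					   unsigned long long key_offset,
					   unsigned long long leafsize)
{
	/* Skinny metadata keys encode the level, not a length. */
	return type == METADATA_ITEM ? leafsize : key_offset;
}

int main(void)
{
	printf("%llu %llu\n",
	       extent_num_bytes(METADATA_ITEM, 1, 16384),	/* 16384 */
	       extent_num_bytes(EXTENT_ITEM, 8192, 16384));	/* 8192 */
	return 0;
}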
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d22e0a14dde..472302a2d74 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1897,9 +1897,17 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + u64 num_bytes; + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); - if (found.type != BTRFS_EXTENT_ITEM_KEY) + if (found.type != BTRFS_EXTENT_ITEM_KEY && + found.type != BTRFS_METADATA_ITEM_KEY) continue; + if (found.type == BTRFS_METADATA_ITEM_KEY) + num_bytes = fs_info->extent_root->leafsize; + else + num_bytes = found.offset; + ret = btrfs_find_all_roots(trans, fs_info, found.objectid, tree_mod_seq_elem.seq, &roots); if (ret < 0) @@ -1944,12 +1952,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_qgroup_list *glist; qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; - qg->rfer += found.offset; - qg->rfer_cmpr += found.offset; + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; WARN_ON(qg->tag >= seq); if (qg->refcnt - seq == roots->nnodes) { - qg->excl += found.offset; - qg->excl_cmpr += found.offset; + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; } qgroup_dirty(fs_info, qg); -- cgit v1.2.3-70-g09d2 From 7ef81ac86c8a44ab9f4e6e04e1f4c9ea53615b8a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2014 14:05:42 -0500 Subject: Btrfs: only process as many file extents as there are refs The backref walking code will search down to the key it is looking for and then proceed to walk _all_ of the extents on the file until it hits the end. This is suboptimal with large files, we only need to look for as many extents as we have references for that inode. I have a testcase that creates a randomly written 4 gig file and before this patch it took 6min 30sec to do the initial send, with this patch it takes 2min 30sec to do the intial send. 
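The optimisation is an early exit: stop scanning the inode's file extent items once as many matches have been found as the reference being resolved actually carries. A self-contained sketch of that bound, with an invented flat array in place of the real leaf walk (nothing here is the btrfs data layout):

#include <stdio.h>

struct file_extent {
	unsigned long long disk_byte;	/* extent the item points at */
	unsigned long long file_offset;
};

/*
 * Collect at most 'ref_count' items that point at 'wanted' instead of
 * walking every extent of the file; return how many were found.
 */
static int add_parents(const struct file_extent *ext, int nr,
		       unsigned long long wanted, int ref_count)
{
	int found = 0, i;

	for (i = 0; i < nr && found < ref_count; i++) {
		if (ext[i].disk_byte != wanted)
			continue;
		printf("match at file offset %llu\n", ext[i].file_offset);
		found++;
	}
	return found;
}

int main(void)
{
	const struct file_extent ext[] = {
		{ 1048576, 0 }, { 2097152, 4096 }, { 1048576, 8192 },
		{ 3145728, 12288 }, { 1048576, 16384 },
	};

	/* The reference carries a count of two, so stop after two hits. */
	return add_parents(ext, 5, 1048576, 2) == 2 ? 0 : 1;
}

For a large, randomly written file the matches sit near the search key, so bounding the loop by the reference count avoids walking the remaining extents to the end of the file.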
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index dcf2448c16f..15384968a84 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -209,18 +209,19 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, int level, - struct btrfs_key *key_for_search, u64 time_seq, - u64 wanted_disk_byte, - const u64 *extent_item_pos) + struct ulist *parents, struct __prelim_ref *ref, + int level, u64 time_seq, const u64 *extent_item_pos) { int ret = 0; int slot; struct extent_buffer *eb; struct btrfs_key key; + struct btrfs_key *key_for_search = &ref->key_for_search; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; + u64 wanted_disk_byte = ref->wanted_disk_byte; + u64 count = 0; if (level != 0) { eb = path->nodes[level]; @@ -238,7 +239,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - while (!ret) { + while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; @@ -254,6 +255,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; + count++; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -334,9 +336,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, level, &ref->key_for_search, - time_seq, ref->wanted_disk_byte, - extent_item_pos); + ret = add_all_parents(root, path, parents, ref, level, time_seq, + extent_item_pos); out: path->lowest_level = 0; btrfs_release_path(path); -- cgit v1.2.3-70-g09d2 From f1de968376340c97ac2d7acd25fa3107c398e0e5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 9 Jan 2014 10:06:10 +0800 Subject: Btrfs: fix the race between write back and nocow buffered write When we ran the 274th case of xfstests with nodatacow mount option, We met the following warning message: WARNING: CPU: 1 PID: 14185 at fs/btrfs/extent-tree.c:3734 btrfs_free_reserved_data_space+0xa6/0xd0 It is caused by the race between the write back and nocow buffered write: Task1 Task2 __btrfs_buffered_write() skip data reservation reserve the metadata space copy the data dirty the pages unlock the pages write back the pages release the data space becasue there is no noreserve flag set the noreserve flag This patch fixes this problem by unlocking the pages after the noreserve flag is set. 
Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 72df63b0c79..3dfd8db0e24 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1591,9 +1591,10 @@ again: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); - btrfs_drop_pages(pages, num_pages); - if (ret) + if (ret) { + btrfs_drop_pages(pages, num_pages); break; + } release_bytes = 0; if (only_release_metadata && copied > 0) { @@ -1607,6 +1608,8 @@ again: only_release_metadata = false; } + btrfs_drop_pages(pages, num_pages); + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); -- cgit v1.2.3-70-g09d2 From de6e8200669f9b60694ca87eadf0a0a99cbdb6aa Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 9 Jan 2014 14:57:06 +0800 Subject: Btrfs: release subvolume's block_rsv before transaction commit We don't have to keep subvolume's block_rsv during transaction commit, and within transaction commit, we may also need the free space reclaimed from this block_rsv to process delayed refs. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3970f32b2b8..332b624e25d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -436,7 +436,9 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out; + btrfs_subvolume_release_metadata(root, &block_rsv, + qgroup_reserved); + return ret; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -561,6 +563,8 @@ static noinline int create_subvol(struct inode *dir, fail: trans->block_rsv = NULL; trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -574,14 +578,10 @@ fail: if (!ret) { inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto out; - } + if (IS_ERR(inode)) + return PTR_ERR(inode); d_instantiate(dentry, inode); } -out: - btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); return ret; } -- cgit v1.2.3-70-g09d2 From f499e40fd97698a1c48d188279647009b21905fe Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Fri, 10 Jan 2014 21:25:46 +0800 Subject: Btrfs: optimize to remove unnecessary removal with ulist reallocation Here we are not going to free memory, no need to remove every node one by one, just init root node here is ok. Cc: Liu Bo Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ulist.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b0a523b2c60..35f5de9dd49 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -207,9 +207,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, void *old = NULL; int i; - for (i = 0; i < ulist->nnodes; i++) - rb_erase(&ulist->nodes[i].rb_node, &ulist->root); - /* * if nodes_alloced == ULIST_SIZE no memory has been allocated * yet, so pass NULL to krealloc @@ -234,6 +231,7 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, * pointers, so we have to do it ourselves. 
Otherwise we may * be bitten by crashes. */ + ulist->root = RB_ROOT; for (i = 0; i < ulist->nnodes; i++) { ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]); if (ret < 0) -- cgit v1.2.3-70-g09d2 From c57c2b3ed248b3f1712e4172eb85b361199582f2 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sat, 11 Jan 2014 21:31:25 +0000 Subject: Btrfs: unlock inodes in correct order in clone ioctl In the clone ioctl, when the source and target inodes are different, we can acquire their mutexes in 2 possible different orders. After we're done cloning, we were releasing the mutexes always in the same order - the most correct way of doing it is to release them by the reverse order they were acquired. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 332b624e25d..5ed5bd00108 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3263,9 +3263,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: - mutex_unlock(&src->i_mutex); - if (!same_inode) - mutex_unlock(&inode->i_mutex); + if (!same_inode) { + if (inode < src) { + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + } else { + mutex_unlock(&inode->i_mutex); + mutex_unlock(&src->i_mutex); + } + } else { + mutex_unlock(&src->i_mutex); + } out_fput: fdput(src_file); out_drop_write: -- cgit v1.2.3-70-g09d2 From 14a958e678cd77cae475b60ca46c0797b1c006a1 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sun, 12 Jan 2014 02:22:46 +0000 Subject: Btrfs: fix btrfs boot when compiled as built-in After the change titled "Btrfs: add support for inode properties", if btrfs was built-in the kernel (i.e. not as a module), it would cause a kernel panic, as reported recently by Fengguang: [ 2.024722] BUG: unable to handle kernel NULL pointer dereference at (null) [ 2.027814] IP: [] crc32c+0xc/0x6b [ 2.028684] PGD 0 [ 2.028684] Oops: 0000 [#1] SMP [ 2.028684] Modules linked in: [ 2.028684] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.13.0-rc7-04795-ga7b57c2 #1 [ 2.028684] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 2.028684] task: ffff88000edba100 ti: ffff88000edd6000 task.ti: ffff88000edd6000 [ 2.028684] RIP: 0010:[] [] crc32c+0xc/0x6b [ 2.028684] RSP: 0000:ffff88000edd7e58 EFLAGS: 00010246 [ 2.028684] RAX: 0000000000000000 RBX: ffffffff82295550 RCX: 0000000000000000 [ 2.028684] RDX: 0000000000000011 RSI: ffffffff81efe393 RDI: 00000000fffffffe [ 2.028684] RBP: ffff88000edd7e60 R08: 0000000000000003 R09: 0000000000015d20 [ 2.028684] R10: ffffffff81ef225e R11: ffffffff811b0222 R12: ffffffffffffffff [ 2.028684] R13: 0000000000000239 R14: 0000000000000000 R15: 0000000000000000 [ 2.028684] FS: 0000000000000000(0000) GS:ffff88000fa00000(0000) knlGS:0000000000000000 [ 2.028684] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 2.028684] CR2: 0000000000000000 CR3: 000000000220c000 CR4: 00000000000006f0 [ 2.028684] Stack: [ 2.028684] ffffffff82295550 ffff88000edd7e80 ffffffff8238af62 ffffffff8238ac05 [ 2.028684] 0000000000000000 ffff88000edd7e98 ffffffff8238ac0f ffffffff8238ac05 [ 2.028684] ffff88000edd7f08 ffffffff810002ba ffff88000edd7f00 ffffffff810e2404 [ 2.028684] Call Trace: [ 2.028684] [] btrfs_props_init+0x4f/0x96 [ 2.028684] [] ? 
ftrace_define_fields_btrfs_space_reservation+0x145/0x145 [ 2.028684] [] init_btrfs_fs+0xa/0xf0 [ 2.028684] [] ? ftrace_define_fields_btrfs_space_reservation+0x145/0x145 [ 2.028684] [] do_one_initcall+0xa4/0x13a [ 2.028684] [] ? parse_args+0x25f/0x33d [ 2.028684] [] kernel_init_freeable+0x1aa/0x230 [ 2.028684] [] ? do_early_param+0x88/0x88 [ 2.028684] [] ? rest_init+0x89/0x89 [ 2.028684] [] kernel_init+0xe/0x109 The issue here is that the initialization function of btrfs (super.c:init_btrfs_fs) started using crc32c (from lib/libcrc32c.c). But when it needs to call crc32c (as part of the properties initialization routine), the libcrc32c is not yet initialized, so crc32c derreferenced a NULL pointer (lib/libcrc32c.c:tfm), causing the kernel panic on boot. The approach to fix this is to use crypto component directly to use its crc32c (which is basically what lib/libcrc32c.c is, a wrapper around crypto). This is what ext4 is doing as well, it uses crypto directly to get crc32c functionality. Verified this works both when btrfs is built-in and when it's loadable kernel module. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/Kconfig | 3 ++- fs/btrfs/Makefile | 2 +- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/hash.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/hash.h | 11 ++++++++--- fs/btrfs/super.c | 10 +++++++++- 6 files changed, 73 insertions(+), 9 deletions(-) create mode 100644 fs/btrfs/hash.c (limited to 'fs') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index aa976eced2d..a66768ebc8d 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,6 +1,7 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select LIBCRC32C + select CRYPTO + select CRYPTO_CRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index af7f000e905..f341a98031d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o + uuid-tree.o props.o hash.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b5322596d60..db1e32ffcfb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1074,11 +1074,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c new file mode 100644 index 00000000000..85889aa82c6 --- /dev/null +++ b/fs/btrfs/hash.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include "hash.h" + +static struct crypto_shash *tfm; + +int __init btrfs_hash_init(void) +{ + tfm = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + return 0; +} + +void btrfs_hash_exit(void) +{ + crypto_free_shash(tfm); +} + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(tfm)]; + } desc; + int err; + + desc.shash.tfm = tfm; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 1d982812ab6..118a2316e5d 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -19,10 +19,15 @@ #ifndef __HASH__ #define __HASH__ -#include +int __init btrfs_hash_init(void); + +void btrfs_hash_exit(void); + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); + static inline u64 btrfs_name_hash(const char *name, int len) { - return crc32c((u32)~1, name, len); + return btrfs_crc32c((u32)~1, name, len); } /* @@ -31,7 +36,7 @@ static inline u64 btrfs_name_hash(const char *name, int len) static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) { - return (u64) crc32c(parent_objectid, name, len); + return (u64) btrfs_crc32c(parent_objectid, name, len); } #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 461e41cb8ca..f44cc6a0eb2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,7 @@ #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" +#include "hash.h" #include "props.h" #include "xattr.h" #include "volumes.h" @@ -1866,11 +1867,15 @@ static int __init init_btrfs_fs(void) { int err; + err = btrfs_hash_init(); + if (err) + return err; + btrfs_props_init(); err = btrfs_init_sysfs(); if (err) - return err; + goto free_hash; btrfs_init_compress(); @@ -1945,6 +1950,8 @@ free_cachep: free_compress: btrfs_exit_compress(); btrfs_exit_sysfs(); +free_hash: + btrfs_hash_exit(); return err; } @@ -1963,6 +1970,7 @@ static void __exit exit_btrfs_fs(void) btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); btrfs_exit_compress(); + btrfs_hash_exit(); } module_init(init_btrfs_fs) -- cgit v1.2.3-70-g09d2 From 28e5dd8f35202ff56b2eb1725ac77f0d0fcb4758 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sun, 12 Jan 2014 02:26:28 +0000 Subject: Btrfs: fix send to not send non-aligned clone operations It is possible for the send feature to send clone operations that request a cloning range (offset + length) that is not aligned with the block size. This makes the btrfs receive command send issue a clone ioctl call that will fail, as the ioctl will return an -EINVAL error because of the unaligned range. Fix this by not sending clone operations for non block aligned ranges, and instead send regular write operation for these (less common) cases. The following xfstest reproduces this issue, which fails on the second btrfs receive command without this change: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=`mktemp -d` status=1 # failure is the default! 
trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -fr $tmp } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch _need_to_be_root rm -f $seqres.full _scratch_mkfs >/dev/null 2>&1 _scratch_mount $XFS_IO_PROG -f -c "truncate 819200" $SCRATCH_MNT/foo | _filter_xfs_io $BTRFS_UTIL_PROG filesystem sync $SCRATCH_MNT | _filter_scratch $XFS_IO_PROG -c "falloc -k 819200 667648" $SCRATCH_MNT/foo | _filter_xfs_io $BTRFS_UTIL_PROG filesystem sync $SCRATCH_MNT | _filter_scratch $XFS_IO_PROG -f -c "pwrite 1482752 2978" $SCRATCH_MNT/foo | _filter_xfs_io $BTRFS_UTIL_PROG filesystem sync $SCRATCH_MNT | _filter_scratch $BTRFS_UTIL_PROG subvol snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 | \ _filter_scratch $XFS_IO_PROG -f -c "truncate 883305" $SCRATCH_MNT/foo | _filter_xfs_io $BTRFS_UTIL_PROG filesystem sync $SCRATCH_MNT | _filter_scratch $BTRFS_UTIL_PROG subvol snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap2 | \ _filter_scratch $BTRFS_UTIL_PROG send $SCRATCH_MNT/mysnap1 -f $tmp/1.snap 2>&1 | _filter_scratch $BTRFS_UTIL_PROG send -p $SCRATCH_MNT/mysnap1 $SCRATCH_MNT/mysnap2 \ -f $tmp/2.snap 2>&1 | _filter_scratch md5sum $SCRATCH_MNT/foo | _filter_scratch md5sum $SCRATCH_MNT/mysnap1/foo | _filter_scratch md5sum $SCRATCH_MNT/mysnap2/foo | _filter_scratch _scratch_unmount _check_btrfs_filesystem $SCRATCH_DEV _scratch_mkfs >/dev/null 2>&1 _scratch_mount $BTRFS_UTIL_PROG receive $SCRATCH_MNT -f $tmp/1.snap md5sum $SCRATCH_MNT/mysnap1/foo | _filter_scratch $BTRFS_UTIL_PROG receive $SCRATCH_MNT -f $tmp/2.snap md5sum $SCRATCH_MNT/mysnap2/foo | _filter_scratch _scratch_unmount _check_btrfs_filesystem $SCRATCH_DEV status=0 exit The tests expected output is: QA output created by 025 FSSync 'SCRATCH_MNT' FSSync 'SCRATCH_MNT' wrote 2978/2978 bytes at offset 1482752 XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) FSSync 'SCRATCH_MNT' Create a readonly snapshot of 'SCRATCH_MNT' in 'SCRATCH_MNT/mysnap1' FSSync 'SCRATCH_MNT' Create a readonly snapshot of 'SCRATCH_MNT' in 'SCRATCH_MNT/mysnap2' At subvol SCRATCH_MNT/mysnap1 At subvol SCRATCH_MNT/mysnap2 129b8eaee8d3c2bcad49bec596591cb3 SCRATCH_MNT/foo 42b6369eae2a8725c1aacc0440e597aa SCRATCH_MNT/mysnap1/foo 129b8eaee8d3c2bcad49bec596591cb3 SCRATCH_MNT/mysnap2/foo At subvol mysnap1 42b6369eae2a8725c1aacc0440e597aa SCRATCH_MNT/mysnap1/foo At snapshot mysnap2 129b8eaee8d3c2bcad49bec596591cb3 SCRATCH_MNT/mysnap2/foo Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 591063dac0e..84aed2f30aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3761,6 +3761,7 @@ static int send_write_or_clone(struct send_ctx *sctx, u64 len; u32 l; u8 type; + u64 bs = sctx->send_root->fs_info->sb->s_blocksize; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); @@ -3784,7 +3785,7 @@ static int send_write_or_clone(struct send_ctx *sctx, goto out; } - if (clone_root) { + if (clone_root && IS_ALIGNED(offset + len, bs)) { ret = send_clone(sctx, offset, len, clone_root); } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { ret = send_update_extent(sctx, offset, len); -- cgit v1.2.3-70-g09d2 From 7c76edb77c23db673a83793686b4a53e2eec4de4 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sun, 12 Jan 2014 21:38:32 +0800 Subject: Btrfs: fix missing skinny 
metadata check in scrub_stripe() Check if we support skinny metadata firstly and fix to use right type to search. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7806e2c47f8..e0677e42d66 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2373,8 +2373,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); } + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -- cgit v1.2.3-70-g09d2 From ade2e0b3eeca941a5cd486bac21599ff87f288c8 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sun, 12 Jan 2014 21:38:33 +0800 Subject: Btrfs: fix to search previous metadata extent item since skinny metadata There is a bug that using btrfs_previous_item() to search metadata extent item. This is because in btrfs_previous_item(), we need type match, however, since skinny metada was introduced by josef, we may mix this two types. So just use btrfs_previous_item() is not working right. To keep btrfs_previous_item() like normal tree search, i introduce another function btrfs_previous_extent_item(). Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 2 ++ fs/btrfs/scrub.c | 3 +-- 3 files changed, 46 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9e9de68eb81..30f5b11d7dd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5955,3 +5955,46 @@ int btrfs_previous_item(struct btrfs_root *root, } return 1; } + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. 
+ * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ca6bcc33d03..3708fd7ee83 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3334,6 +3334,8 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e0677e42d66..51c342b9f5e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2385,8 +2385,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, goto out; if (ret > 0) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_previous_extent_item(root, path, 0); if (ret < 0) goto out; if (ret > 0) { -- cgit v1.2.3-70-g09d2 From 3818aea275423236db38a2d2d0a4951bc6da2e01 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jan 2014 13:36:06 +0800 Subject: btrfs: Add noinode_cache mount option Add noinode_cache mount option for btrfs. Since inode map cache involves all the btrfs_find_free_ino/return_ino things and if just trigger the mount_opt, an inode number get from inode map cache will not returned to inode map cache. To keep the find and return inode both in the same behavior, a new bit in mount_opt, CHANGE_INODE_CACHE, is introduced for this idea. CHANGE_INODE_CACHE is set/cleared in remounting, and the original INODE_MAP_CACHE is set/cleared according to CHANGE_INODE_CACHE after a success transaction. Since find/return inode is all done between btrfs_start_transaction and btrfs_commit_transaction, this will keep consistent behavior. Also noinode_cache mount option will not stop the caching_kthread. 
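A minimal sketch of the two-flag scheme described above: remount only toggles the staging bit, and the bit the running code actually tests is recomputed from it when a transaction commits, so btrfs_find_free_ino() and the matching return path see one consistent setting for the whole transaction. The flag names and the harness below are invented; this is not the btrfs code.

#include <stdio.h>

#define OPT_INODE_MAP_CACHE	(1u << 0)	/* what running code tests */
#define OPT_CHANGE_INODE_CACHE	(1u << 1)	/* what mount/remount sets  */

static unsigned int mount_opt;

/* Mount/remount touches only the staging bit. */
static void remount(int enable_cache)
{
	if (enable_cache)
		mount_opt |= OPT_CHANGE_INODE_CACHE;
	else
		mount_opt &= ~OPT_CHANGE_INODE_CACHE;
}

/* The live bit changes only once a transaction has committed. */
static void commit_transaction(void)
{
	if (mount_opt & OPT_CHANGE_INODE_CACHE)
		mount_opt |= OPT_INODE_MAP_CACHE;
	else
		mount_opt &= ~OPT_INODE_MAP_CACHE;
}

int main(void)
{
	remount(1);
	printf("before commit: cache %s\n",
	       mount_opt & OPT_INODE_MAP_CACHE ? "on" : "off");
	commit_transaction();
	printf("after commit:  cache %s\n",
	       mount_opt & OPT_INODE_MAP_CACHE ? "on" : "off");
	return 0;
}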
Cc: David Sterba Signed-off-by: Miao Xie Signed-off-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 4 ++++ fs/btrfs/super.c | 10 ++++++++-- fs/btrfs/transaction.c | 9 +++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3708fd7ee83..52c96db0ec0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2023,6 +2023,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f0e7bbe1482..4f142c98654 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2776,6 +2776,10 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } + /* Set the real inode map cache flag */ + if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) + btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f44cc6a0eb2..362aef44a17 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -327,7 +327,7 @@ enum { Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, - Opt_datasum, Opt_treelog, + Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_err, }; @@ -370,6 +370,7 @@ static match_table_t tokens = { {Opt_defrag, "autodefrag"}, {Opt_nodefrag, "noautodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_noinode_cache, "noinode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, {Opt_skip_balance, "skip_balance"}, @@ -627,7 +628,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) break; case Opt_inode_cache: btrfs_info(root->fs_info, "enabling inode map caching"); - btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + btrfs_set_opt(info->mount_opt, CHANGE_INODE_CACHE); + break; + case Opt_noinode_cache: + if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) + btrfs_info(root->fs_info, "disabling inode map caching"); + btrfs_clear_opt(info->mount_opt, CHANGE_INODE_CACHE); break; case Opt_clear_cache: btrfs_info(root->fs_info, "force clearing of disk cache"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5e2bfdaf8d1..34cd83184c4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1826,6 +1826,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * Since the transaction is done, we should set the inode map cache flag + * before any other comming transaction. + */ + if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) + btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + else + btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ -- cgit v1.2.3-70-g09d2 From 078025347c8ed43ff330e392476d8866ac1b297f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jan 2014 13:36:07 +0800 Subject: btrfs: Cleanup the btrfs_parse_options for remount. 
Since remount will pending the new mount options to the original mount options, which will make btrfs_parse_options check the old options then new options, causing some stupid output like "enabling XXX" following by "disable XXX". This patch will add extra check before every btrfs_info to skip the output from old options checking. Signed-off-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/super.c | 137 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 77 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 362aef44a17..c02f6335689 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -383,6 +383,20 @@ static match_table_t tokens = { {Opt_err, NULL}, }; +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. @@ -398,6 +412,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + bool compress = false; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) @@ -437,24 +452,28 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - btrfs_info(root->fs_info, "setting nodatasum"); - btrfs_set_opt(info->mount_opt, NODATASUM); + btrfs_set_and_info(root, NODATASUM, + "setting nodatasum"); break; case Opt_datasum: - if (btrfs_test_opt(root, NODATACOW)) - btrfs_info(root->fs_info, "setting datasum, datacow enabled"); - else - btrfs_info(root->fs_info, "setting datasum"); + if (btrfs_test_opt(root, NODATASUM)) { + if (btrfs_test_opt(root, NODATACOW)) + btrfs_info(root->fs_info, "setting datasum, datacow enabled"); + else + btrfs_info(root->fs_info, "setting datasum"); + } btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { + if (!btrfs_test_opt(root, NODATACOW)) { + if (!btrfs_test_opt(root, COMPRESS) || + !btrfs_test_opt(root, FORCE_COMPRESS)) { btrfs_info(root->fs_info, - "setting nodatacow, compression disabled"); - } else { - btrfs_info(root->fs_info, "setting nodatacow"); + "setting nodatacow, compression disabled"); + } else { + btrfs_info(root->fs_info, "setting nodatacow"); + } } btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); @@ -462,9 +481,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_datacow: - if (btrfs_test_opt(root, NODATACOW)) - btrfs_info(root->fs_info, "setting datacow"); - btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_and_info(root, NODATACOW, + "setting datacow"); break; case Opt_compress_force: case Opt_compress_force_type: @@ -472,6 +490,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: + compress = true; if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -498,37 +517,36 
@@ int btrfs_parse_options(struct btrfs_root *root, char *options) } if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - btrfs_info(root->fs_info, "force %s compression", - compress_type); - } else if (btrfs_test_opt(root, COMPRESS)) { - pr_info("btrfs: use %s compression\n", - compress_type); + btrfs_set_and_info(root, FORCE_COMPRESS, + "force %s compression", + compress_type); + } else if (compress) { + if (!btrfs_test_opt(root, COMPRESS)) + btrfs_info(root->fs_info, + "btrfs: use %s compression\n", + compress_type); } break; case Opt_ssd: - btrfs_info(root->fs_info, "use ssd allocation scheme"); - btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_and_info(root, SSD, + "use ssd allocation scheme"); break; case Opt_ssd_spread: - btrfs_info(root->fs_info, "use spread ssd allocation scheme"); - btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); + btrfs_set_and_info(root, SSD_SPREAD, + "use spread ssd allocation scheme"); break; case Opt_nossd: - btrfs_info(root->fs_info, "not using ssd allocation scheme"); - btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_and_info(root, NOSSD, + "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); break; case Opt_barrier: - if (btrfs_test_opt(root, NOBARRIER)) - btrfs_info(root->fs_info, "turning on barriers"); - btrfs_clear_opt(info->mount_opt, NOBARRIER); + btrfs_clear_and_info(root, NOBARRIER, + "turning on barriers"); break; case Opt_nobarrier: - btrfs_info(root->fs_info, "turning off barriers"); - btrfs_set_opt(info->mount_opt, NOBARRIER); + btrfs_set_and_info(root, NOBARRIER, + "turning off barriers"); break; case Opt_thread_pool: ret = match_int(&args[0], &intarg); @@ -580,22 +598,20 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - btrfs_info(root->fs_info, "disabling tree log"); - btrfs_set_opt(info->mount_opt, NOTREELOG); + btrfs_set_and_info(root, NOTREELOG, + "disabling tree log"); break; case Opt_treelog: - if (btrfs_test_opt(root, NOTREELOG)) - btrfs_info(root->fs_info, "enabling tree log"); - btrfs_clear_opt(info->mount_opt, NOTREELOG); + btrfs_clear_and_info(root, NOTREELOG, + "enabling tree log"); break; case Opt_flushoncommit: - btrfs_info(root->fs_info, "turning on flush-on-commit"); - btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + btrfs_set_and_info(root, FLUSHONCOMMIT, + "turning on flush-on-commit"); break; case Opt_noflushoncommit: - if (btrfs_test_opt(root, FLUSHONCOMMIT)) - btrfs_info(root->fs_info, "turning off flush-on-commit"); - btrfs_clear_opt(info->mount_opt, FLUSHONCOMMIT); + btrfs_clear_and_info(root, FLUSHONCOMMIT, + "turning off flush-on-commit"); break; case Opt_ratio: ret = match_int(&args[0], &intarg); @@ -611,33 +627,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } break; case Opt_discard: - btrfs_set_opt(info->mount_opt, DISCARD); + btrfs_set_and_info(root, DISCARD, + "turning on discard"); break; case Opt_nodiscard: - btrfs_clear_opt(info->mount_opt, DISCARD); + btrfs_clear_and_info(root, DISCARD, + "turning off discard"); break; case Opt_space_cache: - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); break; case Opt_rescan_uuid_tree: btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - btrfs_info(root->fs_info, "disabling disk space caching"); - btrfs_clear_opt(info->mount_opt, 
SPACE_CACHE); + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); break; case Opt_inode_cache: - btrfs_info(root->fs_info, "enabling inode map caching"); - btrfs_set_opt(info->mount_opt, CHANGE_INODE_CACHE); + btrfs_set_and_info(root, CHANGE_INODE_CACHE, + "enabling inode map caching"); break; case Opt_noinode_cache: - if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) - btrfs_info(root->fs_info, "disabling inode map caching"); - btrfs_clear_opt(info->mount_opt, CHANGE_INODE_CACHE); + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); break; case Opt_clear_cache: - btrfs_info(root->fs_info, "force clearing of disk cache"); - btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + btrfs_set_and_info(root, CLEAR_CACHE, + "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); @@ -649,13 +667,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - btrfs_info(root->fs_info, "enabling auto defrag"); - btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); + btrfs_set_and_info(root, AUTO_DEFRAG, + "enabling auto defrag"); break; case Opt_nodefrag: - if (btrfs_test_opt(root, AUTO_DEFRAG)) - btrfs_info(root->fs_info, "disabling auto defrag"); - btrfs_clear_opt(info->mount_opt, AUTO_DEFRAG); + btrfs_clear_and_info(root, AUTO_DEFRAG, + "disabling auto defrag"); break; case Opt_recovery: btrfs_info(root->fs_info, "enabling auto recovery"); -- cgit v1.2.3-70-g09d2 From 1a4319cc3c495d5b6b8e41f4d4c73b950d54c2be Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 13 Jan 2014 19:53:53 +0800 Subject: Btrfs: fix extent state leak on transaction abortion When transaction is aborted, we fail to commit transaction, instead we do cleanup work. After that when we umount btrfs, we get to free fs roots' log trees respectively, but that happens after we unpin extents, so those extents pinned by freeing log trees will remain in memory and lead to the leak. 
Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4f142c98654..47c2bc21caa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2065,6 +2065,12 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + btrfs_free_log_root_tree(NULL, fs_info); + btrfs_destroy_pinned_extent(fs_info->tree_root, + fs_info->pinned_extents); + } } int open_ctree(struct super_block *sb, @@ -3455,10 +3461,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log(NULL, root); - btrfs_free_log_root_tree(NULL, fs_info); - } __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); @@ -3569,8 +3573,6 @@ int close_ctree(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_error_commit_super(root); - btrfs_put_block_group_cache(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -3588,6 +3590,8 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); + btrfs_put_block_group_cache(fs_info); + btrfs_free_block_groups(fs_info); btrfs_stop_all_workers(fs_info); -- cgit v1.2.3-70-g09d2 From e4355f34ef9fc75a93875fd075137ef2ea378883 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Mon, 13 Jan 2014 19:35:01 +0000 Subject: Btrfs: faster file extent item search in clone ioctl When we are looking for file extent items that intersect the cloning range, for each one that falls completely outside the range, don't release the path and do another full tree search - just move on to the next slot and copy the file extent item into our buffer only if the item intersects the cloning range. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5ed5bd00108..829999dafd9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2905,12 +2905,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode, * note the key will change type as we walk through the * tree. 
*/ + path->leave_spinning = 1; ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) goto out; nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -2937,11 +2939,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u8 comp; u64 endoff; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); comp = btrfs_file_extent_compression(leaf, extent); @@ -2960,11 +2957,20 @@ static int btrfs_clone(struct inode *src, struct inode *inode, datal = btrfs_file_extent_ram_bytes(leaf, extent); } - btrfs_release_path(path); if (key.offset + datal <= off || - key.offset >= off + len - 1) - goto next; + key.offset >= off + len - 1) { + path->slots[0]++; + goto process_slot; + } + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(inode); @@ -3135,7 +3141,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, } ret = btrfs_end_transaction(trans, root); } -next: btrfs_release_path(path); key.offset++; } -- cgit v1.2.3-70-g09d2 From 2c21b4d733d6e50514e30ffd87110364ddda695b Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 14 Jan 2014 19:42:20 +0800 Subject: Btrfs: fix transaction abortion when remounting btrfs from RW to RO Steps to reproduce: # mkfs.btrfs -f /dev/sda8 # mount /dev/sda8 /mnt -o flushoncommit # dd if=/dev/zero of=/mnt/data bs=4k count=102400 & # mount /dev/sda8 /mnt -o remount, ro When remounting RW to RO, the logic is to firstly set flag to RO and then commit transaction, however with option flushoncommit enabled,we will do RO check within committing transaction, so we get a transaction abortion here. Actually,here check is wrong, we should check if FS_STATE_ERROR is set, fix it. Reported-by: Qu Wenruo Suggested-by: Miao Xie Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1ea19cea96d..7b61ea3141e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8477,7 +8477,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { int ret; - if (root->fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EROFS; ret = __start_delalloc_inodes(root, delay_iput); @@ -8503,7 +8503,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) struct list_head splice; int ret; - if (fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; INIT_LIST_HEAD(&splice); -- cgit v1.2.3-70-g09d2 From 23c671a58831a5aaca3b56b915c8394a274a96df Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 14 Jan 2014 20:31:52 +0800 Subject: Btrfs: flush the dirty pages of the ordered extent aggressively during logging csum The performance of fsync dropped down suddenly sometimes, the main reason of this problem was that we might only flush part dirty pages in a ordered extent, then got that ordered extent, wait for the csum calcucation. 
But if no task flushed the left part, we would wait until the flusher flushed them, sometimes we need wait for several seconds, it made the performance drop down suddenly. (On my box, it drop down from 56MB/s to 4-10MB/s) This patch improves the above problem by flushing left dirty pages aggressively. Test Environment: CPU: 2CPU * 2Cores Memory: 4GB Partition: 20GB(HDD) Test Command: # sysbench --num-threads=8 --test=fileio --file-num=1 \ > --file-total-size=8G --file-block-size=32768 \ > --file-io-mode=sync --file-fsync-freq=100 \ > --file-fsync-end=no --max-requests=10000 \ > --file-test-mode=rndwr run Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b561e7a4007..b142b6dc96c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3637,7 +3637,11 @@ again: * start over after this. */ - wait_event(ordered->wait, ordered->csum_bytes_left == 0); + if (ordered->csum_bytes_left) { + btrfs_start_ordered_extent(inode, ordered, 0); + wait_event(ordered->wait, + ordered->csum_bytes_left == 0); + } list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); -- cgit v1.2.3-70-g09d2 From ffcfaf81795471be3c07d6e3143bff31edca5d5a Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 15 Jan 2014 00:26:43 +0800 Subject: Btrfs: fix wrong search path initialization before searching tree root To search tree root without transaction protection, we should neither search commit root nor skip locking here, fix it. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 84aed2f30aa..aa60cbe7066 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2095,7 +2095,7 @@ static int send_subvol_begin(struct send_ctx *sctx) char *name = NULL; int namelen; - path = alloc_path_for_send(); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 26b47ff65bcdff8473b87680d8f876c66208087b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jan 2014 20:00:54 +0800 Subject: Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss It is better that the position of the lock is close to the data which is protected by it, because they may be in the same cache line, we will load less cache lines when we access them. So we rearrange the members' position of btrfs_space_info structure to make the lock be closer to the its data. 
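A minimal illustration of the layout idea, with made-up names (this is not the btrfs structure): keep the lock word in the same cache line as the hot counters it protects, and use offsetof() to see which 64-byte line each member lands in. In the kernel the same layout can be inspected with pahole; the point is just that the lock and the fields it serializes end up on one line.

#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE 64			/* assumed line size for the example */

struct space_counters {
	int lock;			/* stand-in for a spinlock, kept next to ... */
	unsigned long long total_bytes;	/* ... the counters it protects */
	unsigned long long bytes_used;
	unsigned long long bytes_reserved;
	char name[32];			/* read-mostly data can live further away */
};

#define LINE_OF(member) \
	((int)(offsetof(struct space_counters, member) / CACHE_LINE))

int main(void)
{
	printf("lock           -> cache line %d\n", LINE_OF(lock));
	printf("total_bytes    -> cache line %d\n", LINE_OF(total_bytes));
	printf("bytes_used     -> cache line %d\n", LINE_OF(bytes_used));
	printf("bytes_reserved -> cache line %d\n", LINE_OF(bytes_reserved));
	return 0;
}

All four members report line 0 here, so taking the lock pulls the counters it guards into the cache at the same time, which is the effect the reordering below aims for.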
Signed-off-by: Miao Xie Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 52c96db0ec0..84d4c052fcd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1105,7 +1105,7 @@ struct btrfs_qgroup_limit_item { } __attribute__ ((__packed__)); struct btrfs_space_info { - u64 flags; + spinlock_t lock; u64 total_bytes; /* total bytes in the space, this doesn't take mirrors into account */ @@ -1115,14 +1115,25 @@ struct btrfs_space_info { transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + u64 disk_used; /* total bytes used on disk */ u64 disk_total; /* total bytes on disk, takes mirrors into account */ + u64 flags; + /* * bytes_pinned is kept in line with what is actually pinned, as in * we've called update_block_group and dropped the bytes_used counter @@ -1135,21 +1146,11 @@ struct btrfs_space_info { */ struct percpu_counter total_bytes_pinned; - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - struct list_head list; + struct rw_semaphore groups_sem; /* for block groups in our same type */ struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - spinlock_t lock; - struct rw_semaphore groups_sem; wait_queue_head_t wait; struct kobject kobj; -- cgit v1.2.3-70-g09d2 From 920e4a58d27ea146b34674cf9565ab0373f9ca51 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jan 2014 20:00:55 +0800 Subject: Btrfs: cleanup the redundant code for the block group allocation and init Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 94 +++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index db1e32ffcfb..1efcc262be4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8364,6 +8364,41 @@ static void __link_block_group(struct btrfs_space_info *space_info, up_write(&space_info->groups_sem); } +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + 
cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + btrfs_init_free_space_ctl(cache); + + return cache; +} + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -8399,26 +8434,16 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); if (!cache) { ret = -ENOMEM; goto error; } - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - ret = -ENOMEM; - goto error; - } - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); if (need_clear) { /* @@ -8439,16 +8464,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); + cache->flags = btrfs_block_group_flags(&cache->item); key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - found_key.objectid); - btrfs_init_free_space_ctl(cache); /* * We need to exclude the super stripes now so that the space @@ -8462,8 +8481,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); goto error; } @@ -8594,38 +8612,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, root->fs_info->last_trans_log_full_commit = trans->transid; - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) return -ENOMEM; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return -ENOMEM; - } - - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - chunk_offset); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); - - btrfs_init_free_space_ctl(cache); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; ret = exclude_super_stripes(root, cache); @@ -8635,8 +8630,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * case. 
*/ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); return ret; } -- cgit v1.2.3-70-g09d2 From 215a63d139b1e04ce4b595eeca84671782eb5758 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jan 2014 20:00:56 +0800 Subject: Btrfs: cleanup the code of used_block_group in find_free_extent() used_block_group is just used for the space cluster which doesn't belong to the current block group, the other place needn't use it. Or the logic of code seems unclear. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1efcc262be4..b55a4fd13ec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6159,7 +6159,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; u64 max_extent_size = 0; int empty_cluster = 2 * 1024 * 1024; @@ -6220,7 +6219,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -6257,7 +6255,6 @@ search: u64 offset; int cached; - used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -6300,6 +6297,7 @@ have_block_group: * lets look there */ if (last_ptr) { + struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* * the refill lock keeps out other @@ -6310,10 +6308,8 @@ have_block_group: if (used_block_group != block_group && (!used_block_group || used_block_group->ro || - !block_group_bits(used_block_group, flags))) { - used_block_group = block_group; + !block_group_bits(used_block_group, flags))) goto refill_cluster; - } if (used_block_group != block_group) btrfs_get_block_group(used_block_group); @@ -6328,16 +6324,17 @@ have_block_group: spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, block_group, search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_put_block_group(block_group); + block_group = used_block_group; + } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) { + if (used_block_group != block_group) btrfs_put_block_group(used_block_group); - used_block_group = block_group; - } refill_cluster: - BUG_ON(used_block_group != block_group); /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -6456,25 +6453,25 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, used_block_group, + search_start = stripe_align(root, block_group, offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > - used_block_group->key.objectid + used_block_group->key.offset) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } if (offset < search_start) 
- btrfs_add_free_space(used_block_group, offset, + btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, + ret = btrfs_update_reserved_bytes(block_group, num_bytes, alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } @@ -6484,16 +6481,12 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.2.3-70-g09d2 From 89d4346a36a00ab1f9bd71f929564e9fc1c7c539 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jan 2014 20:00:57 +0800 Subject: Btrfs: fix wrong block group in trace during the free space allocation We allocate the free space from the former block group, not the current one, so should use the former one to output the trace information. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b55a4fd13ec..73b55d94b95 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6323,7 +6323,8 @@ have_block_group: /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, num_bytes); + used_block_group, + search_start, num_bytes); if (used_block_group != block_group) { btrfs_put_block_group(block_group); block_group = used_block_group; -- cgit v1.2.3-70-g09d2 From d024206133ce21936b3d5780359afc00247655b7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 15 Jan 2014 18:15:52 +0100 Subject: btrfs: restrict snapshotting to own subvolumes Currently, any user can snapshot any subvolume if the path is accessible and thus indirectly create and keep files he does not own under his direcotries. This is not possible with traditional directories. In security context, a user can snapshot root filesystem and pin any potentially buggy binaries, even if the updates are applied. All the snapshots are visible to the administrator, so it's possible to verify if there are suspicious snapshots. Another more practical problem is that any user can pin the space used by eg. root and cause ENOSPC. 
Original report: https://bugs.launchpad.net/ubuntu/+source/apparmor/+bug/484786 CC: stable@vger.kernel.org Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 829999dafd9..8d4457f329f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1574,6 +1574,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, btrfs_info(BTRFS_I(src_inode)->root->fs_info, "Snapshot src from another FS"); ret = -EINVAL; + } else if (!inode_owner_or_capable(src_inode)) { + /* + * Subvolume creation is not restricted, but snapshots + * are limited to own subvolumes only + */ + ret = -EPERM; } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, -- cgit v1.2.3-70-g09d2 From bd60ea0fe947029df4b7b7aa9d6557baf2a5a138 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 16 Jan 2014 15:50:22 +0100 Subject: btrfs: call permission checks earlier in ioctls and return EPERM The owner and capability checks in IOC_SUBVOL_SETFLAGS and SET_RECEIVED_SUBVOL should be called before any other checks are done. Also unify the error code to EPERM. Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d4457f329f..62c62b4fa55 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -192,6 +192,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int i_oldflags; umode_t mode; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (btrfs_root_readonly(root)) return -EROFS; @@ -202,9 +205,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - if (!inode_owner_or_capable(inode)) - return -EACCES; - ret = mnt_want_write_file(file); if (ret) return ret; @@ -1697,6 +1697,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1721,11 +1724,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_write; } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out_drop_write; - } - down_write(&root->fs_info->subvol_sem); /* nothing to do */ @@ -4403,6 +4401,9 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(file); if (ret < 0) return ret; @@ -4419,11 +4420,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); -- cgit v1.2.3-70-g09d2 From 66b4bbd4f5895e4b9da03885c00e42ba64f7038e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Jan 2014 18:56:06 +0100 Subject: btrfs: sysfs: don't show reserved incompat feature The COMPRESS_LZOv2 incompat featue is currently not implemented, the bit is only reserved, no point to list it in sysfs. 
Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ba94b277e98..1a893860d66 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -198,7 +198,6 @@ BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); -BTRFS_FEAT_ATTR_INCOMPAT(compress_lzov2, COMPRESS_LZOv2); BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); @@ -209,7 +208,6 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), - BTRFS_FEAT_ATTR_PTR(compress_lzov2), BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), -- cgit v1.2.3-70-g09d2 From c736c095de53c1a0a23909239d69bb56693df8ef Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Jan 2014 18:56:09 +0100 Subject: btrfs: sysfs: list the NO_HOLES feature Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1a893860d66..782374d8fd1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -212,6 +213,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), + BTRFS_FEAT_ATTR_PTR(no_holes), NULL }; -- cgit v1.2.3-70-g09d2 From c41570c9d29764f797fa35490d72b7395a0105c3 Mon Sep 17 00:00:00 2001 From: Justin Maggard Date: Tue, 21 Jan 2014 11:18:29 -0800 Subject: btrfs: fix defrag 32-bit integer overflow When defragging a very large file, the cluster variable can wrap its 32-bit signed int type and become negative, which eventually gets passed to btrfs_force_ra() as a very large unsigned long value. On 32-bit platforms, this eventually results in an Oops from the SLAB allocator. Change the cluster and max_cluster signed int variables to unsigned long to match the readahead functions. This also allows the min() comparison in btrfs_defrag_file() to work as intended. 
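A small demonstration of the integer behaviour the message relies on (not the btrfs code): once a signed 32-bit counter has wrapped negative, converting it to unsigned long - as happens implicitly when it is passed to a readahead helper such as btrfs_force_ra() - yields an enormous value.

#include <stdio.h>

int main(void)
{
	/* Pretend the 32-bit signed counter has already wrapped negative. */
	int cluster = -1;

	/*
	 * Passing it to a function that takes an unsigned long page count
	 * converts it to a huge value.
	 */
	unsigned long ra_pages = (unsigned long)cluster;

	printf("cluster  = %d\n", cluster);
	printf("ra_pages = %lu\n", ra_pages);	/* 18446744073709551615 on LP64 */
	return 0;
}

Making cluster and max_cluster unsigned long, as the patch below does, removes both the wrap and the surprising conversion.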
Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 62c62b4fa55..34772cbcc7a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1036,7 +1036,7 @@ out: static int cluster_pages_for_defrag(struct inode *inode, struct page **pages, unsigned long start_index, - int num_pages) + unsigned long num_pages) { unsigned long file_end; u64 isize = i_size_read(inode); @@ -1194,8 +1194,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; - int cluster = max_cluster; + unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; -- cgit v1.2.3-70-g09d2 From f74b86d85533a98ef7f573487af38f9dd514becb Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 21 Jan 2014 23:36:38 +0000 Subject: Btrfs: fix snprintf usage by send's gen_unique_name The buffer size argument passed to snprintf must account for the trailing null byte added by snprintf, and it returns a value >= then sizeof(buffer) when the string can't fit in the buffer. Since our buffer has a size of 64 characters, and the maximum orphan name we can generate is 63 characters wide, we must pass 64 as the buffer size to snprintf, and not 63. Signed-off-by: Filipe David Borba Manana Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index aa60cbe7066..fc1f0abb8fe 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1336,7 +1336,7 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { - len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu", + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); if (len >= sizeof(tmp)) { /* should really not happen */ -- cgit v1.2.3-70-g09d2 From 2365dd3ca02bbb6d3412404482e1d85752549953 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 22 Jan 2014 11:15:51 +0800 Subject: btrfs: undo sysfs when open_ctree() fails reproducer: mkfs.btrfs -f /dev/sdb &&\ mount /dev/sdb /btrfs &&\ btrfs dev add -f /dev/sdc /btrfs &&\ umount /btrfs &&\ wipefs -a /dev/sdc &&\ mount -o degraded /dev/sdb /btrfs //above mount fails so try with RO mount -o degraded,ro /dev/sdb /btrfs ------ sysfs: cannot create duplicate filename '/fs/btrfs/3f48c79e-5ed0-4e87-b189-86e749e503f4' :: dump_stack+0x49/0x5e warn_slowpath_common+0x87/0xb0 warn_slowpath_fmt+0x41/0x50 strlcat+0x69/0x80 sysfs_warn_dup+0x87/0xa0 sysfs_add_one+0x40/0x50 create_dir+0x76/0xc0 sysfs_create_dir_ns+0x7a/0xc0 kobject_add_internal+0xad/0x220 kobject_add_varg+0x38/0x60 kobject_init_and_add+0x53/0x70 mutex_lock+0x11/0x40 __free_pages+0x25/0x30 free_pages+0x41/0x50 selinux_sb_copy_data+0x14e/0x1e0 mount_fs+0x3e/0x1a0 vfs_kern_mount+0x71/0x120 do_mount+0x3f7/0x980 SyS_mount+0x8b/0xe0 system_call_fastpath+0x16/0x1b ------ further 'modprobe -r btrfs' fails as well Signed-off-by: Anand Jain Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 47c2bc21caa..7619147da38 
100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2745,13 +2745,13 @@ retry_root_backup: ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); - goto fail_block_groups; + goto fail_sysfs; } ret = btrfs_read_block_groups(extent_root); if (ret) { printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); - goto fail_block_groups; + goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); @@ -2760,13 +2760,13 @@ retry_root_backup: !(sb->s_flags & MS_RDONLY)) { printk(KERN_WARNING "BTRFS: " "too many missing devices, writeable mount is not allowed\n"); - goto fail_block_groups; + goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) - goto fail_block_groups; + goto fail_sysfs; fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, @@ -2948,6 +2948,9 @@ fail_cleaner: */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); +fail_sysfs: + btrfs_sysfs_remove_one(fs_info); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); -- cgit v1.2.3-70-g09d2 From 4db658ea0ca2312b5d168230476ec7729385aefe Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 28 Jan 2014 18:06:18 -0800 Subject: ceph: Fix up after semantic merge conflict The previous ceph-client merge resulted in ceph not even building, because there was a merge conflict that wasn't visible as an actual data conflict: commit 7221fe4c2ed7 ("ceph: add acl for cephfs") added support for POSIX ACL's into Ceph, but unluckily we also had the VFS tree change a lot of the POSIX ACL helper functions to be much more helpful to filesystems (see for example commits 2aeccbe957d0 "fs: add generic xattr_acl handlers", 5bf3258fd2ac "fs: make posix_acl_chmod more useful" and 37bc15392a23 "fs: make posix_acl_create more useful") The reason this conflict wasn't obvious was many-fold: because it was a semantic conflict rather than a data conflict, it wasn't visible in the git merge as a conflict. And because the VFS tree hadn't been in linux-next, people hadn't become aware of it that way. And because I was at jury duty this morning, I was using my laptop and as a result not doing constant "allmodconfig" builds. Anyway, this fixes the build and generally removes a fair chunk of the Ceph POSIX ACL support code, since the improved helpers seem to match really well for Ceph too. But I don't actually have any way to *test* the end result, and I was really hoping for some ACK's for this. Oh, well. Not compiling certainly doesn't make things easier to test, so I'm committing this without the acks after having waited for four hours... Plus it's what I would have done for the merge had I noticed the semantic conflict.. 
Reported-by: Dave Jones Cc: Sage Weil Cc: Guangliang Zhao Cc: Li Wang Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/ceph/acl.c | 103 +------------------------------------------------------- fs/ceph/inode.c | 3 +- fs/ceph/super.h | 3 -- fs/ceph/xattr.c | 5 +-- 4 files changed, 6 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 64fddbc1d17..f6911284c9b 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -213,7 +213,7 @@ int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) if (ret) goto out_release; } - ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (ret < 0) goto out; else if (ret > 0) @@ -229,104 +229,3 @@ out_release: out: return ret; } - -int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) -{ - struct posix_acl *acl; - int ret = 0; - - if (S_ISLNK(inode->i_mode)) { - ret = -EOPNOTSUPP; - goto out; - } - - if (!IS_POSIXACL(inode)) - goto out; - - acl = ceph_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) { - ret = PTR_ERR(acl); - goto out; - } - - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - goto out; - ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); -out: - return ret; -} - -static int ceph_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int ret = 0; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - acl = ceph_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return ret; -} - -static int ceph_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - int ret = 0; - struct posix_acl *acl = NULL; - - if (!inode_owner_or_capable(dentry->d_inode)) { - ret = -EPERM; - goto out; - } - - if (!IS_POSIXACL(dentry->d_inode)) { - ret = -EOPNOTSUPP; - goto out; - } - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) { - ret = PTR_ERR(acl); - goto out; - } - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out_release; - } - } - - ret = ceph_set_acl(dentry, dentry->d_inode, acl, type); - -out_release: - posix_acl_release(acl); -out: - return ret; -} - -const struct xattr_handler ceph_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = ceph_xattr_acl_get, - .set = ceph_xattr_acl_set, -}; - -const struct xattr_handler ceph_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = ceph_xattr_acl_get, - .set = ceph_xattr_acl_set, -}; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 6fc10a7d7c5..8b8b506636c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -1805,7 +1806,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) __mark_inode_dirty(inode, inode_dirty_flags); if (ia_valid & ATTR_MODE) { - err = ceph_acl_chmod(dentry, inode); + err = posix_acl_chmod(inode, attr->ia_mode); if (err) goto out_put; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c299f7d19bf..345933948b6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -736,15 +736,12 @@ extern void __init ceph_xattr_init(void); extern 
void ceph_xattr_exit(void); /* acl.c */ -extern const struct xattr_handler ceph_xattr_acl_access_handler; -extern const struct xattr_handler ceph_xattr_acl_default_handler; extern const struct xattr_handler *ceph_xattr_handlers[]; #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int); int ceph_init_acl(struct dentry *, struct inode *, struct inode *); -int ceph_acl_chmod(struct dentry *, struct inode *); void ceph_forget_all_cached_acls(struct inode *inode); #else diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index c7581f3733c..898b6565ad3 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -6,6 +6,7 @@ #include #include +#include #include #define XATTR_CEPH_PREFIX "ceph." @@ -17,8 +18,8 @@ */ const struct xattr_handler *ceph_xattr_handlers[] = { #ifdef CONFIG_CEPH_FS_POSIX_ACL - &ceph_xattr_acl_access_handler, - &ceph_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL, }; -- cgit v1.2.3-70-g09d2 From 13116dfd13c8c9d60ea04ece13419af2de8e2e37 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jan 2014 18:29:24 +0100 Subject: fanotify: Fix use after free in mask checking We cannot use the event structure returned from fsnotify_add_notify_event() because that event can be freed by the time that function returns. Use the mask argument passed into the event handler directly instead. This also fixes a possible problem when we could unnecessarily wait for permission response for a normal fanotify event which got merged with a permission event. We also disallow merging of permission event with any other event so that we know the permission event which we just created is the one on which we should wait for permission response. Reported-and-tested-by: Jiri Kosina Reported-and-tested-by: Dave Jones Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 58772623f02..cc78e2fbc8e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -16,12 +16,6 @@ static bool should_merge(struct fsnotify_event *old_fsn, { struct fanotify_event_info *old, *new; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - /* dont merge two permission events */ - if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) && - (new_fsn->mask & FAN_ALL_PERM_EVENTS)) - return false; -#endif pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); @@ -42,6 +36,16 @@ static struct fsnotify_event *fanotify_merge(struct list_head *list, pr_debug("%s: list=%p event=%p\n", __func__, list, event); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + /* + * Don't merge a permission event with any other event so that we know + * the event structure we have created in fanotify_handle_event() is the + * one we should check for permission response. + */ + if (event->mask & FAN_ALL_PERM_EVENTS) + return NULL; +#endif + list_for_each_entry_reverse(test_event, list, list) { if (should_merge(test_event, event)) { do_merge = true; @@ -195,13 +199,10 @@ static int fanotify_handle_event(struct fsnotify_group *group, fsnotify_destroy_event(group, fsn_event); if (IS_ERR(notify_fsn_event)) return PTR_ERR(notify_fsn_event); - /* We need to ask about a different events after a merge... 
*/ - event = FANOTIFY_E(notify_fsn_event); - fsn_event = notify_fsn_event; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (fsn_event->mask & FAN_ALL_PERM_EVENTS) + if (mask & FAN_ALL_PERM_EVENTS) ret = fanotify_get_response_from_access(group, event); #endif return ret; -- cgit v1.2.3-70-g09d2 From 83c0e1b442b488571f4fef4a91c2fe52eed6c705 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jan 2014 18:53:22 +0100 Subject: fsnotify: Do not return merged event from fsnotify_add_notify_event() The event returned from fsnotify_add_notify_event() cannot ever be used safely as the event may be freed by the time the function returns (after dropping notification_mutex). So change the prototype to just return whether the event was added or merged into some existing event. Reported-and-tested-by: Jiri Kosina Reported-and-tested-by: Dave Jones Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 18 +++++++----------- fs/notify/inotify/inotify_fsnotify.c | 19 +++++++------------ fs/notify/notification.c | 24 ++++++++++++------------ include/linux/fsnotify_backend.h | 8 ++++---- 4 files changed, 30 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index cc78e2fbc8e..c7e5e8f5474 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -28,8 +28,7 @@ static bool should_merge(struct fsnotify_event *old_fsn, } /* and the list better be locked by something too! */ -static struct fsnotify_event *fanotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) { struct fsnotify_event *test_event; bool do_merge = false; @@ -43,7 +42,7 @@ static struct fsnotify_event *fanotify_merge(struct list_head *list, * one we should check for permission response. */ if (event->mask & FAN_ALL_PERM_EVENTS) - return NULL; + return 0; #endif list_for_each_entry_reverse(test_event, list, list) { @@ -54,10 +53,10 @@ static struct fsnotify_event *fanotify_merge(struct list_head *list, } if (!do_merge) - return NULL; + return 0; test_event->mask |= event->mask; - return test_event; + return 1; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS @@ -153,7 +152,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, int ret = 0; struct fanotify_event_info *event; struct fsnotify_event *fsn_event; - struct fsnotify_event *notify_fsn_event; BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); @@ -192,13 +190,11 @@ static int fanotify_handle_event(struct fsnotify_group *group, event->response = 0; #endif - notify_fsn_event = fsnotify_add_notify_event(group, fsn_event, - fanotify_merge); - if (notify_fsn_event) { + ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + if (ret) { /* Our event wasn't used in the end. Free it. 
*/ fsnotify_destroy_event(group, fsn_event); - if (IS_ERR(notify_fsn_event)) - return PTR_ERR(notify_fsn_event); + ret = 0; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index aad1a35e9af..d5ee56348bb 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -53,15 +53,13 @@ static bool event_compare(struct fsnotify_event *old_fsn, return false; } -static struct fsnotify_event *inotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int inotify_merge(struct list_head *list, + struct fsnotify_event *event) { struct fsnotify_event *last_event; last_event = list_entry(list->prev, struct fsnotify_event, list); - if (!event_compare(last_event, event)) - return NULL; - return last_event; + return event_compare(last_event, event); } int inotify_handle_event(struct fsnotify_group *group, @@ -73,9 +71,8 @@ int inotify_handle_event(struct fsnotify_group *group, { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; - struct fsnotify_event *added_event; struct fsnotify_event *fsn_event; - int ret = 0; + int ret; int len = 0; int alloc_len = sizeof(struct inotify_event_info); @@ -110,18 +107,16 @@ int inotify_handle_event(struct fsnotify_group *group, if (len) strcpy(event->name, file_name); - added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge); - if (added_event) { + ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); - if (IS_ERR(added_event)) - ret = PTR_ERR(added_event); } if (inode_mark->mask & IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); - return ret; + return 0; } static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 952237b8e2d..18b3c4427dc 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -79,15 +79,15 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. If the event is successfully added to the - * group's notification queue, a reference is taken on event. + * event off the queue to deal with. The function returns 0 if the event was + * added to the queue, 1 if the event was merged with some other queued event. 
*/ -struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - struct fsnotify_event *(*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { - struct fsnotify_event *return_event = NULL; + int ret = 0; struct list_head *list = &group->notification_list; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -98,14 +98,14 @@ struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, /* Queue overflow event only if it isn't already queued */ if (list_empty(&group->overflow_event.list)) event = &group->overflow_event; - return_event = event; + ret = 1; } if (!list_empty(list) && merge) { - return_event = merge(list, event); - if (return_event) { + ret = merge(list, event); + if (ret) { mutex_unlock(&group->notification_mutex); - return return_event; + return ret; } } @@ -115,7 +115,7 @@ struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); - return return_event; + return ret; } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7d8d5e60859..3d286ff49ab 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -322,10 +322,10 @@ extern int fsnotify_fasync(int fd, struct file *file, int on); extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ -extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - struct fsnotify_event *(*merge)(struct list_head *, - struct fsnotify_event *)); +extern int fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)); /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ -- cgit v1.2.3-70-g09d2 From 85816794240b9659e66e4d9b0df7c6e814e5f603 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jan 2014 21:38:06 +0100 Subject: fanotify: Fix use after free for permission events Currently struct fanotify_event_info has been destroyed immediately after reporting its contents to userspace. However that is wrong for permission events because those need to stay around until userspace provides response which is filled back in fanotify_event_info. So change to code to free permission events only after we have got the response from userspace. Reported-and-tested-by: Jiri Kosina Reported-and-tested-by: Dave Jones Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 5 ++++- fs/notify/fanotify/fanotify.h | 7 +++++++ fs/notify/fanotify/fanotify_user.c | 7 ++++++- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c7e5e8f5474..0e792f5e314 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -192,14 +192,17 @@ static int fanotify_handle_event(struct fsnotify_group *group, ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); if (ret) { + BUG_ON(mask & FAN_ALL_PERM_EVENTS); /* Our event wasn't used in the end. Free it. 
*/ fsnotify_destroy_event(group, fsn_event); ret = 0; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (mask & FAN_ALL_PERM_EVENTS) + if (mask & FAN_ALL_PERM_EVENTS) { ret = fanotify_get_response_from_access(group, event); + fsnotify_destroy_event(group, fsn_event); + } #endif return ret; } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 0e90174a116..32a2f034fb9 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -4,6 +4,13 @@ extern struct kmem_cache *fanotify_event_cachep; +/* + * Lifetime of the structure differs for normal and permission events. In both + * cases the structure is allocated in fanotify_handle_event(). For normal + * events the structure is freed immediately after reporting it to userspace. + * For permission events we free it only after we receive response from + * userspace. + */ struct fanotify_event_info { struct fsnotify_event fse; /* diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1fd66abe574..b6175fa11bf 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -319,7 +319,12 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_destroy_event(group, kevent); + /* + * Permission events get destroyed after we + * receive response + */ + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; -- cgit v1.2.3-70-g09d2 From 9f03740a956d7ac6a1b8f8c455da6fa5cae11c22 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Wed, 22 Jan 2014 10:00:53 +0000 Subject: Btrfs: fix infinite path build loops in incremental send The send operation processes inodes by their ascending number, and assumes that any rename/move operation can be successfully performed (sent to the caller) once all previous inodes (those with a smaller inode number than the one we're currently processing) were processed. This is not true when an incremental send had to process an hierarchical change between 2 snapshots where the parent-children relationship between directory inodes was reversed - that is, parents became children and children became parents. This situation made the path building code go into an infinite loop, which kept allocating more and more memory that eventually lead to a krealloc warning being displayed in dmesg: WARNING: CPU: 1 PID: 5705 at mm/page_alloc.c:2477 __alloc_pages_nodemask+0x365/0xad0() Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) snd_hda_codec_hdmi snd_hda_codec_realtek joydev radeon snd_hda_intel snd_hda_codec snd_hwdep snd_seq_midi snd_pcm psmouse i915 snd_rawmidi serio_raw snd_seq_midi_event lpc_ich snd_seq snd_timer ttm snd_seq_device rfcomm drm_kms_helper parport_pc bnep bluetooth drm ppdev snd soundcore i2c_algo_bit snd_page_alloc binfmt_misc video lp parport r8169 mii hid_generic usbhid hid CPU: 1 PID: 5705 Comm: btrfs Tainted: G O 3.13.0-rc7-fdm-btrfs-next-18+ #3 Hardware name: To Be Filled By O.E.M. 
To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012 [ 5381.660441] 00000000000009ad ffff8806f6f2f4e8 ffffffff81777434 0000000000000007 [ 5381.660447] 0000000000000000 ffff8806f6f2f528 ffffffff8104a9ec ffff8807038f36f0 [ 5381.660452] 0000000000000000 0000000000000206 ffff8807038f2490 ffff8807038f36f0 [ 5381.660457] Call Trace: [ 5381.660464] [] dump_stack+0x4e/0x68 [ 5381.660471] [] warn_slowpath_common+0x8c/0xc0 [ 5381.660476] [] warn_slowpath_null+0x1a/0x20 [ 5381.660480] [] __alloc_pages_nodemask+0x365/0xad0 [ 5381.660487] [] ? local_clock+0x4f/0x60 [ 5381.660491] [] ? free_one_page+0x98/0x440 [ 5381.660495] [] ? local_clock+0x4f/0x60 [ 5381.660502] [] ? __get_free_pages+0x14/0x50 [ 5381.660508] [] ? trace_hardirqs_off_caller+0x28/0xd0 [ 5381.660515] [] alloc_pages_current+0x10f/0x1f0 [ 5381.660520] [] ? __get_free_pages+0x14/0x50 [ 5381.660524] [] __get_free_pages+0x14/0x50 [ 5381.660530] [] kmalloc_order_trace+0x3e/0x100 [ 5381.660536] [] __kmalloc_track_caller+0x220/0x230 [ 5381.660560] [] ? fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs] [ 5381.660564] [] ? retint_restore_args+0xe/0xe [ 5381.660569] [] krealloc+0x6f/0xb0 [ 5381.660586] [] fs_path_ensure_buf.part.12+0x6b/0x200 [btrfs] [ 5381.660601] [] fs_path_prepare_for_add+0x98/0xb0 [btrfs] [ 5381.660615] [] fs_path_add_path+0x2c/0x60 [btrfs] [ 5381.660628] [] get_cur_path+0x7c/0x1c0 [btrfs] Even without this loop, the incremental send couldn't succeed, because it would attempt to send a rename/move operation for the lower inode before the highest inode number was renamed/move. This issue is easy to trigger with the following steps: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ mkdir -p /mnt/btrfs/a/b/c/d $ mkdir /mnt/btrfs/a/b/c2 $ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1 $ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c2/d2 $ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2/d2/cc $ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2 $ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send The structure of the filesystem when the first snapshot is taken is: . (ino 256) |-- a (ino 257) |-- b (ino 258) |-- c (ino 259) | |-- d (ino 260) | |-- c2 (ino 261) And its structure when the second snapshot is taken is: . (ino 256) |-- a (ino 257) |-- b (ino 258) |-- c2 (ino 261) |-- d2 (ino 260) |-- cc (ino 259) Before the move/rename operation is performed for the inode 259, the move/rename for inode 260 must be performed, since 259 is now a child of 260. A test case for xfstests, with a more complex scenario, will follow soon. 
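The fix, whose diff follows, records such moves as "pending" entries keyed by the parent directory's inode number and replays them once that parent has itself been renamed. A minimal userspace sketch of the deferral idea (made-up names and a flat table; the patch itself uses rb-trees and also tracks generations and ref updates):

#include <stdio.h>

#define MAX_PENDING 16

struct pending_move {
	unsigned long long ino;		/* directory waiting to be moved */
	unsigned long long parent_ino;	/* parent it must wait for       */
	int used;
};

static struct pending_move pending[MAX_PENDING];

static void defer_move(unsigned long long ino, unsigned long long parent_ino)
{
	for (int i = 0; i < MAX_PENDING; i++) {
		if (!pending[i].used) {
			pending[i] = (struct pending_move){ ino, parent_ino, 1 };
			return;
		}
	}
}

static void do_move(unsigned long long ino)
{
	printf("rename/move ino %llu\n", ino);

	/* Replay moves that were waiting for this directory. */
	for (int i = 0; i < MAX_PENDING; i++) {
		if (pending[i].used && pending[i].parent_ino == ino) {
			pending[i].used = 0;
			do_move(pending[i].ino);
		}
	}
}

int main(void)
{
	/*
	 * Mirrors the reproducer above: ino 259 ("c" -> "cc") became a child
	 * of ino 260 ("d" -> "d2"), so its move is deferred until 260 moves.
	 */
	defer_move(259, 260);	/* 259 is processed first, but must wait */
	do_move(260);		/* moving 260 releases the pending move of 259 */
	return 0;
}

The sketch prints the moves in the order 260 then 259, which is exactly the ordering the incremental send stream needs to emit.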
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 539 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 518 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fc1f0abb8fe..c96e879bcb1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -121,6 +121,74 @@ struct send_ctx { int name_cache_size; char *read_buf; + + /* + * We process inodes by their increasing order, so if before an + * incremental send we reverse the parent/child relationship of + * directories such that a directory with a lower inode number was + * the parent of a directory with a higher inode number, and the one + * becoming the new parent got renamed too, we can't rename/move the + * directory with lower inode number when we finish processing it - we + * must process the directory with higher inode number first, then + * rename/move it and then rename/move the directory with lower inode + * number. Example follows. + * + * Tree state when the first send was performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * | + * | + * |-- c (ino 259) + * | |-- d (ino 260) + * | + * |-- c2 (ino 261) + * + * Tree state when the second (incremental) send is performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * |-- c2 (ino 261) + * |-- d2 (ino 260) + * |-- cc (ino 259) + * + * The sequence of steps that lead to the second state was: + * + * mv /a/b/c/d /a/b/c2/d2 + * mv /a/b/c /a/b/c2/d2/cc + * + * "c" has lower inode number, but we can't move it (2nd mv operation) + * before we move "d", which has higher inode number. + * + * So we just memorize which move/rename operations must be performed + * later when their respective parent is processed and moved/renamed. + */ + + /* Indexed by parent directory inode number. */ + struct rb_root pending_dir_moves; + + /* + * Reverse index, indexed by the inode number of a directory that + * is waiting for the move/rename of its immediate parent before its + * own move/rename can be performed. + */ + struct rb_root waiting_dir_moves; +}; + +struct pending_dir_move { + struct rb_node node; + struct list_head list; + u64 parent_ino; + u64 ino; + u64 gen; + struct list_head update_refs; +}; + +struct waiting_dir_move { + struct rb_node node; + u64 ino; }; struct name_cache_entry { @@ -144,6 +212,8 @@ struct name_cache_entry { char name[]; }; +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + static int need_send_hole(struct send_ctx *sctx) { return (sctx->parent_root && !sctx->cur_inode_new && @@ -1897,6 +1967,7 @@ static void name_cache_free(struct send_ctx *sctx) */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, + int skip_name_cache, u64 *parent_ino, u64 *parent_gen, struct fs_path *dest) @@ -1906,6 +1977,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; + if (skip_name_cache) + goto get_ref; /* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. If yes @@ -1950,11 +2023,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } +get_ref: /* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup. 
*/ - if (ino < sctx->send_progress) + if (ino < sctx->send_progress && !skip_name_cache) ret = get_first_ref(sctx->send_root, ino, parent_ino, parent_gen, dest); else @@ -1978,6 +2052,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out; ret = 1; } + if (skip_name_cache) + goto out; out_cache: /* @@ -2045,6 +2121,9 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + u64 start_ino = ino; + u64 start_gen = gen; + int skip_name_cache = 0; name = fs_path_alloc(); if (!name) { @@ -2052,19 +2131,32 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, goto out; } + if (is_waiting_for_move(sctx, ino)) + skip_name_cache = 1; + +again: dest->reversed = 1; fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); - ret = __get_cur_name_and_parent(sctx, ino, gen, + ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, &parent_inode, &parent_gen, name); if (ret < 0) goto out; if (ret) stop = 1; + if (!skip_name_cache && + is_waiting_for_move(sctx, parent_inode)) { + ino = start_ino; + gen = start_gen; + stop = 0; + skip_name_cache = 1; + goto again; + } + ret = fs_path_add_path(dest, name); if (ret < 0) goto out; @@ -2636,10 +2728,349 @@ out: return ret; } +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) + n = n->rb_left; + else if (ino > entry->ino) + n = n->rb_right; + else + return 1; + } + return 0; +} + +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node **p = &sctx->waiting_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct waiting_dir_move *entry, *dm; + + dm = kmalloc(sizeof(*dm), GFP_NOFS); + if (!dm) + return -ENOMEM; + dm->ino = ino; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct waiting_dir_move, node); + if (ino < entry->ino) { + p = &(*p)->rb_left; + } else if (ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(dm); + return -EEXIST; + } + } + + rb_link_node(&dm->node, parent, p); + rb_insert_color(&dm->node, &sctx->waiting_dir_moves); + return 0; +} + +#ifdef CONFIG_BTRFS_ASSERT + +static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) { + n = n->rb_left; + } else if (ino > entry->ino) { + n = n->rb_right; + } else { + rb_erase(&entry->node, &sctx->waiting_dir_moves); + kfree(entry); + return 0; + } + } + return -ENOENT; +} + +#endif + +static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) +{ + struct rb_node **p = &sctx->pending_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct pending_dir_move *entry, *pm; + struct recorded_ref *cur; + int exists = 0; + int ret; + + pm = kmalloc(sizeof(*pm), GFP_NOFS); + if (!pm) + return -ENOMEM; + pm->parent_ino = parent_ino; + pm->ino = sctx->cur_ino; + pm->gen = sctx->cur_inode_gen; + INIT_LIST_HEAD(&pm->list); + INIT_LIST_HEAD(&pm->update_refs); + RB_CLEAR_NODE(&pm->node); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) { + p = &(*p)->rb_left; + } else if (parent_ino > entry->parent_ino) { + p = &(*p)->rb_right; + } else { + 
exists = 1; + break; + } + } + + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + list_for_each_entry(cur, &sctx->new_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + + ret = add_waiting_dir_move(sctx, pm->ino); + if (ret) + goto out; + + if (exists) { + list_add_tail(&pm->list, &entry->list); + } else { + rb_link_node(&pm->node, parent, p); + rb_insert_color(&pm->node, &sctx->pending_dir_moves); + } + ret = 0; +out: + if (ret) { + __free_recorded_refs(&pm->update_refs); + kfree(pm); + } + return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, + u64 parent_ino) +{ + struct rb_node *n = sctx->pending_dir_moves.rb_node; + struct pending_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) + n = n->rb_left; + else if (parent_ino > entry->parent_ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ + struct fs_path *from_path = NULL; + struct fs_path *to_path = NULL; + u64 orig_progress = sctx->send_progress; + struct recorded_ref *cur; + int ret; + + from_path = fs_path_alloc(); + if (!from_path) + return -ENOMEM; + + sctx->send_progress = pm->ino; + ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + if (ret < 0) + goto out; + + to_path = fs_path_alloc(); + if (!to_path) { + ret = -ENOMEM; + goto out; + } + + sctx->send_progress = sctx->cur_ino + 1; + ASSERT(del_waiting_dir_move(sctx, pm->ino) == 0); + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); + if (ret < 0) + goto out; + + ret = send_rename(sctx, from_path, to_path); + if (ret < 0) + goto out; + + ret = send_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + + /* + * After rename/move, need to update the utimes of both new parent(s) + * and old parent(s). 
+ */ + list_for_each_entry(cur, &pm->update_refs, list) { + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } + +out: + fs_path_free(from_path); + fs_path_free(to_path); + sctx->send_progress = orig_progress; + + return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ + if (!list_empty(&m->list)) + list_del(&m->list); + if (!RB_EMPTY_NODE(&m->node)) + rb_erase(&m->node, &sctx->pending_dir_moves); + __free_recorded_refs(&m->update_refs); + kfree(m); +} + +static void tail_append_pending_moves(struct pending_dir_move *moves, + struct list_head *stack) +{ + if (list_empty(&moves->list)) { + list_add_tail(&moves->list, stack); + } else { + LIST_HEAD(list); + list_splice_init(&moves->list, &list); + list_add_tail(&moves->list, stack); + list_splice_tail(&list, stack); + } +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ + struct pending_dir_move *pm; + struct list_head stack; + u64 parent_ino = sctx->cur_ino; + int ret = 0; + + pm = get_pending_dir_moves(sctx, parent_ino); + if (!pm) + return 0; + + INIT_LIST_HEAD(&stack); + tail_append_pending_moves(pm, &stack); + + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + parent_ino = pm->ino; + ret = apply_dir_move(sctx, pm); + free_pending_move(sctx, pm); + if (ret) + goto out; + pm = get_pending_dir_moves(sctx, parent_ino); + if (pm) + tail_append_pending_moves(pm, &stack); + } + return 0; + +out: + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + free_pending_move(sctx, pm); + } + return ret; +} + +static int wait_for_parent_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref) +{ + int ret; + u64 ino = parent_ref->dir; + u64 parent_ino_before, parent_ino_after; + u64 new_gen, old_gen; + struct fs_path *path_before = NULL; + struct fs_path *path_after = NULL; + int len1, len2; + + if (parent_ref->dir <= sctx->cur_ino) + return 0; + + if (is_waiting_for_move(sctx, ino)) + return 1; + + ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, + NULL, NULL, NULL, NULL); + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; + + ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, + NULL, NULL, NULL, NULL); + if (ret < 0) + return ret; + + if (new_gen != old_gen) + return 0; + + path_before = fs_path_alloc(); + if (!path_before) + return -ENOMEM; + + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret == -ENOENT) { + ret = 0; + goto out; + } else if (ret < 0) { + goto out; + } + + path_after = fs_path_alloc(); + if (!path_after) { + ret = -ENOMEM; + goto out; + } + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + NULL, path_after); + if (ret == -ENOENT) { + ret = 0; + goto out; + } else if (ret < 0) { + goto out; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if ((parent_ino_before != parent_ino_after) && (len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { + ret = 1; + goto out; + } + ret = 0; + +out: + fs_path_free(path_before); + fs_path_free(path_after); + + return ret; +} + /* * This does all the move/link/unlink/rmdir magic. 
*/ -static int process_recorded_refs(struct send_ctx *sctx) +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) { int ret = 0; struct recorded_ref *cur; @@ -2788,11 +3219,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (ret < 0) - goto out; - ret = fs_path_copy(valid_path, cur->full_path); + if (wait_for_parent_move(sctx, cur)) { + ret = add_pending_dir_move(sctx, + cur->dir); + *pending_move = 1; + } else { + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); + } if (ret < 0) goto out; } else { @@ -3161,6 +3598,7 @@ static int process_all_refs(struct send_ctx *sctx, struct extent_buffer *eb; int slot; iterate_inode_ref_t cb; + int pending_move = 0; path = alloc_path_for_send(); if (!path) @@ -3204,7 +3642,9 @@ static int process_all_refs(struct send_ctx *sctx, } btrfs_release_path(path); - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, &pending_move); + /* Only applicable to an incremental send. */ + ASSERT(pending_move == 0); out: btrfs_free_path(path); @@ -4165,7 +4605,9 @@ out: return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, + int *pending_move, + int *refs_processed) { int ret = 0; @@ -4177,17 +4619,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) goto out; - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, pending_move); if (ret < 0) goto out; - /* - * We have processed the refs and thus need to advance send_progress. - * Now, calls to get_cur_xxx will take the updated refs of the current - * inode into account. - */ - sctx->send_progress = sctx->cur_ino + 1; - + *refs_processed = 1; out: return ret; } @@ -4203,11 +4639,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 right_gid; int need_chmod = 0; int need_chown = 0; + int pending_move = 0; + int refs_processed = 0; - ret = process_recorded_refs_if_needed(sctx, at_end); + ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, + &refs_processed); if (ret < 0) goto out; + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + * + * On the other hand, if our current inode is a directory and couldn't + * be moved/renamed because its parent was renamed/moved too and it has + * a higher inode number, we can only move/rename our current inode + * after we moved/renamed its parent. Therefore in this case operate on + * the old path (pre move/rename) of our current inode, and the + * move/rename will be performed later. + */ + if (refs_processed && !pending_move) + sctx->send_progress = sctx->cur_ino + 1; + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) @@ -4269,9 +4723,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } /* - * Need to send that every time, no matter if it actually changed - * between the two trees as we have done changes to the inode before. 
+ * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. */ + if (!is_waiting_for_move(sctx, sctx->cur_ino)) { + ret = apply_children_dir_moves(sctx); + if (ret) + goto out; + } + + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. + */ + sctx->send_progress = sctx->cur_ino + 1; ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; @@ -4839,6 +5305,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->clone_roots = vzalloc(sizeof(struct clone_root) * (arg->clone_sources_count + 1)); if (!sctx->clone_roots) { @@ -4947,6 +5416,34 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } out: + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { + struct rb_node *n; + struct pending_dir_move *pm; + + n = rb_first(&sctx->pending_dir_moves); + pm = rb_entry(n, struct pending_dir_move, node); + while (!list_empty(&pm->list)) { + struct pending_dir_move *pm2; + + pm2 = list_first_entry(&pm->list, + struct pending_dir_move, list); + free_pending_move(sctx, pm2); + } + free_pending_move(sctx, pm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { + struct rb_node *n; + struct waiting_dir_move *dm; + + n = rb_first(&sctx->waiting_dir_moves); + dm = rb_entry(n, struct waiting_dir_move, node); + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); + } + if (sort_clone_roots) { for (i = 0; i < sctx->clone_roots_cnt; i++) btrfs_root_dec_send_in_progress( -- cgit v1.2.3-70-g09d2 From 3c9665df0c5d3f471b07efc32181459386678ebd Mon Sep 17 00:00:00 2001 From: Gui Hecheng Date: Thu, 23 Jan 2014 13:41:09 +0800 Subject: btrfs: fix warning while merging two adjacent extents When we have two adjacent extents in relink_extent_backref, we try to merge them. When we use btrfs_search_slot to locate the slot for the current extent, we shouldn't set "ins_len = 1", because we will merge it into the previous extent rather than insert a new item. Otherwise, we may happen to create a new leaf in btrfs_search_slot and path->slot[0] will be 0. Then we try to fetch the previous item using "path->slots[0]--", and it will cause a warning as follows: [ 145.713385] WARNING: CPU: 3 PID: 1796 at fs/btrfs/extent_io.c:5043 map_private_extent_buffer+0xd4/0xe0 [ 145.713387] btrfs bad mapping eb start 5337088 len 4096, wanted 167772306 8 ... [ 145.713462] [] map_private_extent_buffer+0xd4/0xe0 [ 145.713476] [] ? btrfs_free_path+0x2a/0x40 [ 145.713485] [] btrfs_get_token_64+0x64/0xf0 [ 145.713498] [] relink_extent_backref+0x41c/0x820 [ 145.713508] [] btrfs_finish_ordered_io+0x239/0xa80 I encounter this warning when running defrag having mkfs.btrfs with option -M. At the same time there are read/writes & snapshots running at background. 
Signed-off-by: Gui Hecheng Reviewed-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7b61ea3141e..3b6598783be 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2314,7 +2314,7 @@ again: u64 extent_len; struct btrfs_key found_key; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) goto out_free_path; -- cgit v1.2.3-70-g09d2 From 538f72cdf03cad1c21c551ea542c8ce7d9fa2d81 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 23 Jan 2014 13:47:48 +0800 Subject: Btrfs: fix protection between walking backrefs and root deletion There is a race condition between resolving indirect ref and root deletion, and we should gurantee that root can not be destroyed to avoid accessing broken tree here. Here we fix it by holding @subvol_srcu, and we will release it as soon as we have held root node lock. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 15384968a84..10ae5700ab1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -301,23 +301,34 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int ret = 0; int root_level; int level = ref->level; + int index; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + root = btrfs_read_fs_root_no_name(fs_info, &root_key); if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); goto out; } root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) + if (root_level + 1 == level) { + srcu_read_unlock(&fs_info->subvol_srcu, index); goto out; + } path->lowest_level = level; ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + + /* root node has been locked, we can release @subvol_srcu safely here */ + srcu_read_unlock(&fs_info->subvol_srcu, index); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", ref->root_id, level, ref->count, ret, -- cgit v1.2.3-70-g09d2 From 95def2ede1a9dd12b164932eaf5fefb67aefc41c Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 23 Jan 2014 13:47:49 +0800 Subject: Btrfs: fix to catch all errors when resolving indirect ref We can only tolerate ENOENT here, for other errors, we should return directly. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 10ae5700ab1..55ffcf44b90 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -388,10 +388,16 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; err = __resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos); - if (err == -ENOMEM) - goto out; - if (err) + /* + * we can only tolerate ENOENT,otherwise,we should catch error + * and return directly. 
+ */ + if (err == -ENOENT) { continue; + } else if (err) { + ret = err; + goto out; + } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); -- cgit v1.2.3-70-g09d2 From 7fdd29d02e0ab595a857fe9c7b71e752ff665372 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Fri, 24 Jan 2014 17:42:09 +0000 Subject: Btrfs: make send's file extent item search more efficient Instead of looking for a file extent item, process it, release the path and do a btree search for the next file extent item, just process all file extent items in a leaf without intermediate btree searches. This way we save cpu and we're not blocking other tasks or affecting concurrency on the btree, because send's paths use the commit root and skip btree node/leaf locking. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c96e879bcb1..4d31f72bdf4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4573,17 +4573,25 @@ static int process_all_extents(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -4596,8 +4604,7 @@ static int process_all_extents(struct send_ctx *sctx) if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: -- cgit v1.2.3-70-g09d2 From bca1a290033d20981e11f81ae4207e4d0fa5b1e6 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sun, 26 Jan 2014 22:32:18 +0800 Subject: Btrfs: add a reschedule point in btrfs_find_all_roots() I can easily trigger the following warnings when enabling quota in my virtual machine(running Opensuse), Steps are firstly creating a subvolume full of fragment extents, and then create many snapshots (500 in my test case). [ 2362.808459] BUG: soft lockup - CPU#0 stuck for 22s! [btrfs-qgroup-re:1970] [ 2362.809023] task: e4af8450 ti: e371c000 task.ti: e371c000 [ 2362.809026] EIP: 0060:[] EFLAGS: 00000246 CPU: 0 [ 2362.809049] EIP is at __merge_refs+0x5e/0x100 [btrfs] [ 2362.809051] EAX: 00000000 EBX: cfadbcf0 ECX: 00000000 EDX: cfadbcb0 [ 2362.809052] ESI: dd8d3370 EDI: e371dde0 EBP: e371dd6c ESP: e371dd5c [ 2362.809054] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 [ 2362.809055] CR0: 80050033 CR2: ac454d50 CR3: 009a9000 CR4: 001407d0 [ 2362.809099] Stack: [ 2362.809100] 00000001 e371dde0 dfcc6890 f29f8000 e371de28 fa39016d 00000011 00000001 [ 2362.809105] 99bfc000 00000000 93928000 00000000 00000001 00000050 e371dda8 00000001 [ 2362.809109] f3a31000 f3413000 00000001 e371ddb8 000040a8 00000202 00000000 00000023 [ 2362.809113] Call Trace: [ 2362.809136] [] find_parent_nodes+0x34d/0x1280 [btrfs] [ 2362.809156] [] btrfs_find_all_roots+0xb2/0x110 [btrfs] [ 2362.809174] [] btrfs_qgroup_rescan_worker+0x358/0x7a0 [btrfs] [ 2362.809180] [] ? 
lock_timer_base.isra.39+0x1e/0x40 [ 2362.809199] [] worker_loop+0xff/0x470 [btrfs] [ 2362.809204] [] ? __wake_up_locked+0x1a/0x20 [ 2362.809221] [] ? btrfs_queue_worker+0x2b0/0x2b0 [btrfs] [ 2362.809225] [] kthread+0x9c/0xb0 [ 2362.809229] [] ret_from_kernel_thread+0x1b/0x30 [ 2362.809233] [] ? kthread_create_on_node+0x110/0x110 By adding a reschedule point at the end of btrfs_find_all_roots(), i no longer hit these warnings. Cc: Josef Bacik Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 55ffcf44b90..7966acd5dc7 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1118,6 +1118,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!node) break; bytenr = node->val; + cond_resched(); } ulist_free(tmp); -- cgit v1.2.3-70-g09d2 From bf54f412f0624786ac8a115110b5203430a9eebb Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 28 Jan 2014 01:38:06 +0000 Subject: Btrfs: fix send file hole detection leading to data corruption There was a case where file hole detection was incorrect and it would cause an incremental send to override a section of a file with zeroes. This happened in the case where between the last leaf we processed which contained a file extent item for our current inode and the leaf we're currently are at (and has a file extent item for our current inode) there are only leafs containing exclusively file extent items for our current inode, and none of them was updated since the previous send operation. The file hole detection code would incorrectly consider the file range covered by these leafs as a hole. A test case for xfstests follows soon. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/send.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4d31f72bdf4..85259cba784 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4489,6 +4489,21 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, extent_end = key->offset + btrfs_file_extent_num_bytes(path->nodes[0], fi); } + + if (path->slots[0] == 0 && + sctx->cur_inode_last_extent < key->offset) { + /* + * We might have skipped entire leafs that contained only + * file extent items for our current inode. These leafs have + * a generation number smaller (older) than the one in the + * current leaf and the leaf our last extent came from, and + * are located between these 2 leafs. + */ + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + if (sctx->cur_inode_last_extent < key->offset) ret = send_hole(sctx, key->offset); sctx->cur_inode_last_extent = extent_end; -- cgit v1.2.3-70-g09d2 From f05c474688762f186b16a26366755b6ef0bfed0c Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 28 Jan 2014 19:13:38 +0800 Subject: Btrfs: fix memory leaks on walking backrefs failure When walking backrefs, we may iterate every inode's extent and add/merge them into ulist, and the caller will free memory from ulist. However, if we fail to allocate inode's extents element memory or ulist_add() fail to allocate memory, we won't add allocated memory into ulist, and the caller won't free some allocated memory thus memory leaks happen. 
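The rule the patch enforces can be shown with a small self-contained C sketch (illustrative names, not the btrfs backref API): on an error, free whatever was allocated but not yet handed off, and clear the local pointer once ownership of the list has been transferred so the error path cannot free it twice.

#include <stdlib.h>
#include <stdio.h>

struct elem {
    long val;
    struct elem *next;
};

static void free_elem_list(struct elem *e)
{
    struct elem *next;

    for (; e; e = next) {
        next = e->next;
        free(e);
    }
}

/* Builds a list of n elements; fails partway through when fail_at >= 0. */
static int build_list(int n, int fail_at, struct elem **result)
{
    struct elem *head = NULL;
    int i;

    for (i = 0; i < n; i++) {
        struct elem *e;

        if (i == fail_at)
            goto err;               /* simulated allocation failure */
        e = malloc(sizeof(*e));
        if (!e)
            goto err;
        e->val = i;
        e->next = head;
        head = e;
    }
    *result = head;
    head = NULL;                    /* ownership transferred to the caller */
    return 0;
err:
    free_elem_list(head);           /* nothing leaks on the error path */
    return -1;
}

int main(void)
{
    struct elem *list = NULL;

    if (build_list(5, 3, &list) < 0)
        printf("failed, partial list was freed\n");
    else
        free_elem_list(list);
    return 0;
}

This mirrors the new free_inode_elem_list() helper and the 'eie = NULL' assignments the patch adds after the list has been linked into the ref.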
Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7966acd5dc7..aded3ef3d3d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -66,6 +66,16 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, return 0; } +static void free_inode_elem_list(struct extent_inode_elem *eie) +{ + struct extent_inode_elem *eie_next; + + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } +} + static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, struct extent_inode_elem **eie) @@ -275,6 +285,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, old = old->next; old->next = eie; } + eie = NULL; } next: ret = btrfs_next_old_item(root, path, time_seq); @@ -282,6 +293,8 @@ next: if (ret > 0) ret = 0; + else if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -845,6 +858,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; + struct extent_inode_elem *eie = NULL; INIT_LIST_HEAD(&prefs); INIT_LIST_HEAD(&prefs_delayed); @@ -958,7 +972,6 @@ again: goto out; } if (ref->count && ref->parent) { - struct extent_inode_elem *eie = NULL; if (extent_item_pos && !ref->inode_list) { u32 bsz; struct extent_buffer *eb; @@ -993,6 +1006,7 @@ again: eie = eie->next; eie->next = ref->inode_list; } + eie = NULL; } list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); @@ -1011,7 +1025,8 @@ out: list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); } - + if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -1019,7 +1034,6 @@ static void free_leaf_list(struct ulist *blocks) { struct ulist_node *node = NULL; struct extent_inode_elem *eie; - struct extent_inode_elem *eie_next; struct ulist_iterator uiter; ULIST_ITER_INIT(&uiter); @@ -1027,10 +1041,7 @@ static void free_leaf_list(struct ulist *blocks) if (!node->aux) continue; eie = (struct extent_inode_elem *)(uintptr_t)node->aux; - for (; eie; eie = eie_next) { - eie_next = eie->next; - kfree(eie); - } + free_inode_elem_list(eie); node->aux = 0; } -- cgit v1.2.3-70-g09d2 From 4c7a6f74ceeafd738b55d1c57349327f7ea8e895 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 29 Jan 2014 00:25:34 +0800 Subject: Btrfs: rework ulist with list+rb_tree We are really suffering from now ulist's implementation, some developers gave their try, and i just gave some of my ideas for things: 1. use list+rb_tree instead of arrary+rb_tree 2. add cur_list to iterator rather than ulist structure. 3. add seqnum into every node when they are added, this is used to do selfcheck when iterating node. I noticed Zach Brown's comments before, long term is to kick off ulist implementation, however, for now, we need at least avoid arrary from ulist. 
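The layout being introduced, one allocation per node that is linked both into a list (stable iteration order, cheap teardown) and into a search structure (uniqueness check), can be sketched in user-space C. A plain unbalanced binary search tree stands in for the kernel rb-tree here, and all names are illustrative:

#include <stdlib.h>
#include <stdio.h>

struct unode {
    unsigned long long val;
    struct unode *left, *right;   /* search tree, for uniqueness checks */
    struct unode *next;           /* singly linked list, insertion order */
};

struct uset {
    struct unode *root;           /* tree root */
    struct unode *head, *tail;    /* list ends */
    unsigned long nnodes;
};

/* Returns 1 if val was added, 0 if it was already present, -1 on ENOMEM. */
static int uset_add(struct uset *s, unsigned long long val)
{
    struct unode **p = &s->root;
    struct unode *n;

    while (*p) {
        if (val == (*p)->val)
            return 0;
        p = val < (*p)->val ? &(*p)->left : &(*p)->right;
    }

    n = calloc(1, sizeof(*n));
    if (!n)
        return -1;
    n->val = val;
    *p = n;                       /* link into the tree */
    if (s->tail)
        s->tail->next = n;        /* append to the list */
    else
        s->head = n;
    s->tail = n;
    s->nnodes++;
    return 1;
}

int main(void)
{
    struct uset s = { 0 };
    unsigned long long input[] = { 7, 3, 7, 11, 3, 5 };
    struct unode *n;

    for (size_t i = 0; i < sizeof(input) / sizeof(input[0]); i++)
        uset_add(&s, input[i]);

    /* Iteration walks the list, so the order is stable no matter how many
     * nodes were inserted and the tree shape never matters here. */
    for (n = s.head; n; n = n->next)
        printf("%llu\n", n->val);

    /* Every node is reachable from the list, so teardown only walks it. */
    while (s.head) {
        n = s.head;
        s.head = n->next;
        free(n);
    }
    return 0;
}

Because each node is individually allocated and always on the list, teardown only needs the list walk, which is what ulist_fini() does after this change.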
Cc: Liu Bo Cc: Josef Bacik Cc: Zach Brown Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ulist.c | 105 +++++++++++++++++++++++-------------------------------- fs/btrfs/ulist.h | 38 ++++++-------------- 2 files changed, 55 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 35f5de9dd49..8dd0e8dfdaf 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -7,6 +7,7 @@ #include #include #include "ulist.h" +#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 @@ -14,10 +15,6 @@ * enumerating it. * It is possible to store an auxiliary value along with the key. * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - * * A sample usage for ulists is the enumeration of directed graphs without * visiting a node twice. The pseudo-code could look like this: * @@ -50,10 +47,9 @@ */ void ulist_init(struct ulist *ulist) { - ulist->nnodes = 0; - ulist->nodes = ulist->int_nodes; - ulist->nodes_alloced = ULIST_SIZE; + INIT_LIST_HEAD(&ulist->nodes); ulist->root = RB_ROOT; + ulist->nnodes = 0; } EXPORT_SYMBOL(ulist_init); @@ -66,14 +62,14 @@ EXPORT_SYMBOL(ulist_init); */ void ulist_fini(struct ulist *ulist) { - /* - * The first ULIST_SIZE elements are stored inline in struct ulist. - * Only if more elements are alocated they need to be freed. - */ - if (ulist->nodes_alloced > ULIST_SIZE) - kfree(ulist->nodes); - ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ + struct ulist_node *node; + struct ulist_node *next; + + list_for_each_entry_safe(node, next, &ulist->nodes, list) { + kfree(node); + } ulist->root = RB_ROOT; + INIT_LIST_HEAD(&ulist->nodes); } EXPORT_SYMBOL(ulist_fini); @@ -192,57 +188,29 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask) { - int ret = 0; - struct ulist_node *node = NULL; + int ret; + struct ulist_node *node; + node = ulist_rbtree_search(ulist, val); if (node) { if (old_aux) *old_aux = node->aux; return 0; } + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; - if (ulist->nnodes >= ulist->nodes_alloced) { - u64 new_alloced = ulist->nodes_alloced + 128; - struct ulist_node *new_nodes; - void *old = NULL; - int i; - - /* - * if nodes_alloced == ULIST_SIZE no memory has been allocated - * yet, so pass NULL to krealloc - */ - if (ulist->nodes_alloced > ULIST_SIZE) - old = ulist->nodes; + node->val = val; + node->aux = aux; +#ifdef CONFIG_BTRFS_DEBUG + node->seqnum = ulist->nnodes; +#endif - new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, - gfp_mask); - if (!new_nodes) - return -ENOMEM; - - if (!old) - memcpy(new_nodes, ulist->int_nodes, - sizeof(ulist->int_nodes)); - - ulist->nodes = new_nodes; - ulist->nodes_alloced = new_alloced; - - /* - * krealloc actually uses memcpy, which does not copy rb_node - * pointers, so we have to do it ourselves. Otherwise we may - * be bitten by crashes. 
- */ - ulist->root = RB_ROOT; - for (i = 0; i < ulist->nnodes; i++) { - ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]); - if (ret < 0) - return ret; - } - } - ulist->nodes[ulist->nnodes].val = val; - ulist->nodes[ulist->nnodes].aux = aux; - ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]); - BUG_ON(ret); - ++ulist->nnodes; + ret = ulist_rbtree_insert(ulist, node); + ASSERT(!ret); + list_add_tail(&node->list, &ulist->nodes); + ulist->nnodes++; return 1; } @@ -266,11 +234,26 @@ EXPORT_SYMBOL(ulist_add); */ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) { - if (ulist->nnodes == 0) + struct ulist_node *node; + + if (list_empty(&ulist->nodes)) return NULL; - if (uiter->i < 0 || uiter->i >= ulist->nnodes) + if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes) return NULL; - - return &ulist->nodes[uiter->i++]; + if (uiter->cur_list) { + uiter->cur_list = uiter->cur_list->next; + } else { + uiter->cur_list = ulist->nodes.next; +#ifdef CONFIG_BTRFS_DEBUG + uiter->i = 0; +#endif + } + node = list_entry(uiter->cur_list, struct ulist_node, list); +#ifdef CONFIG_BTRFS_DEBUG + ASSERT(node->seqnum == uiter->i); + ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); + uiter->i++; +#endif + return node; } EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index fb36731074b..2be7102d807 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -17,18 +17,12 @@ * enumerating it. * It is possible to store an auxiliary value along with the key. * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. */ - -/* - * number of elements statically allocated inside struct ulist - */ -#define ULIST_SIZE 16 - struct ulist_iterator { +#ifdef CONFIG_BTRFS_DEBUG int i; +#endif + struct list_head *cur_list; /* hint to start search */ }; /* @@ -37,6 +31,12 @@ struct ulist_iterator { struct ulist_node { u64 val; /* value to store */ u64 aux; /* auxiliary value saved along with the val */ + +#ifdef CONFIG_BTRFS_DEBUG + int seqnum; /* sequence number this node is added */ +#endif + + struct list_head list; /* used to link node */ struct rb_node rb_node; /* used to speed up search */ }; @@ -46,24 +46,8 @@ struct ulist { */ unsigned long nnodes; - /* - * number of nodes we already have room for - */ - unsigned long nodes_alloced; - - /* - * pointer to the array storing the elements. The first ULIST_SIZE - * elements are stored inline. In this case the it points to int_nodes. - * After exceeding ULIST_SIZE, dynamic memory is allocated. - */ - struct ulist_node *nodes; - + struct list_head nodes; struct rb_root root; - - /* - * inline storage space for the first ULIST_SIZE entries - */ - struct ulist_node int_nodes[ULIST_SIZE]; }; void ulist_init(struct ulist *ulist); @@ -77,6 +61,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); -#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) +#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) #endif -- cgit v1.2.3-70-g09d2 From 49fc647a2c558862145357f3a25892248042f6fe Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 29 Jan 2014 00:25:35 +0800 Subject: Btrfs: do not export ulist functions There are not any users that use ulist except Btrfs,don't export them. 
Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ulist.c | 10 +--------- fs/btrfs/ulist.h | 1 - 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 8dd0e8dfdaf..840a38b2778 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,7 +5,6 @@ */ #include -#include #include "ulist.h" #include "ctree.h" @@ -51,7 +50,6 @@ void ulist_init(struct ulist *ulist) ulist->root = RB_ROOT; ulist->nnodes = 0; } -EXPORT_SYMBOL(ulist_init); /** * ulist_fini - free up additionally allocated memory for the ulist @@ -60,7 +58,7 @@ EXPORT_SYMBOL(ulist_init); * This is useful in cases where the base 'struct ulist' has been statically * allocated. */ -void ulist_fini(struct ulist *ulist) +static void ulist_fini(struct ulist *ulist) { struct ulist_node *node; struct ulist_node *next; @@ -71,7 +69,6 @@ void ulist_fini(struct ulist *ulist) ulist->root = RB_ROOT; INIT_LIST_HEAD(&ulist->nodes); } -EXPORT_SYMBOL(ulist_fini); /** * ulist_reinit - prepare a ulist for reuse @@ -85,7 +82,6 @@ void ulist_reinit(struct ulist *ulist) ulist_fini(ulist); ulist_init(ulist); } -EXPORT_SYMBOL(ulist_reinit); /** * ulist_alloc - dynamically allocate a ulist @@ -104,7 +100,6 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -EXPORT_SYMBOL(ulist_alloc); /** * ulist_free - free dynamically allocated ulist @@ -119,7 +114,6 @@ void ulist_free(struct ulist *ulist) ulist_fini(ulist); kfree(ulist); } -EXPORT_SYMBOL(ulist_free); static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) { @@ -214,7 +208,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, return 1; } -EXPORT_SYMBOL(ulist_add); /** * ulist_next - iterate ulist @@ -256,4 +249,3 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) #endif return node; } -EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 2be7102d807..7f78cbf5cf4 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -51,7 +51,6 @@ struct ulist { }; void ulist_init(struct ulist *ulist); -void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); -- cgit v1.2.3-70-g09d2 From 23c6bf6a91e96c17a452e07b12b38ed66504e799 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sat, 11 Jan 2014 21:28:54 +0000 Subject: Btrfs: fix btrfs_search_slot_for_read backwards iteration If the current path's leaf slot is 0, we do search for the previous leaf (via btrfs_prev_leaf) and set the new path's leaf slot to a value corresponding to the number of items - 1 of the former leaf. Fix this by using the slot set by btrfs_prev_leaf, decrementing it by 1 if it's equal to the leaf's number of items. Use of btrfs_search_slot_for_read() for backward iteration is used in particular by the send feature, which could miss items when the input leaf has less items than its previous leaf. This could be reproduced by running btrfs/007 from xfstests in a loop. 
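A toy model of the off-by-one-leaf mistake, with plain arrays standing in for leaves and all values made up:

#include <stdio.h>

/* The slot for the previous leaf must come from that leaf's own item count,
 * not from the item count of the leaf we were standing on. */
struct leaf {
    int nritems;
    int items[8];
};

static struct leaf leaves[] = {
    { 4, { 10, 20, 30, 40 } },   /* previous leaf */
    { 2, { 50, 60 } },           /* leaf we are on, slot 0 reached */
};

int main(void)
{
    int cur = 1, prev = cur - 1;

    /* Buggy: slot derived from the current leaf's item count misses 30, 40. */
    int bad_slot  = leaves[cur].nritems - 1;   /* 1 */
    /* Fixed: the previous leaf's own last slot. */
    int good_slot = leaves[prev].nritems - 1;  /* 3 */

    printf("buggy start item %d, correct start item %d\n",
           leaves[prev].items[bad_slot], leaves[prev].items[good_slot]);
    return 0;
}

Starting the backward walk from the wrong slot silently skips the tail items of the previous leaf, which is how send could miss items.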
Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 30f5b11d7dd..cbd3a7d6fa6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3142,7 +3142,9 @@ again: if (ret < 0) return ret; if (!ret) { - p->slots[0] = btrfs_header_nritems(leaf) - 1; + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; return 0; } if (!return_any) -- cgit v1.2.3-70-g09d2 From 514ac8ad8793a097c0c9d89202c642479d6dfa34 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jan 2014 21:07:00 -0800 Subject: Btrfs: don't use ram_bytes for uncompressed inline items If we truncate an uncompressed inline item, ram_bytes isn't updated to reflect the new size. The fixe uses the size directly from the item header when reading uncompressed inlines, and also fixes truncate to update the size as it goes. Reported-by: Jens Axboe Signed-off-by: Chris Mason CC: stable@vger.kernel.org --- fs/btrfs/ctree.h | 35 ++++++++++++++++++++++++++--------- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 15 +++++++++++---- fs/btrfs/print-tree.c | 2 +- fs/btrfs/send.c | 11 +++++++---- fs/btrfs/tree-log.c | 8 +++++--- 6 files changed, 52 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84d4c052fcd..fceddbdfdd3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2990,15 +2990,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); -/* this returns the number of file bytes represented by the inline item. - * If an item is compressed, this is the uncompressed size - */ -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - struct btrfs_file_extent_item *e) -{ - return btrfs_file_extent_ram_bytes(eb, e); -} - /* * this returns the number of bytes used by the item on disk, minus the * size of any extent headers. If a file is compressed on disk, this is @@ -3012,6 +3003,32 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* this returns the number of file bytes represented by the inline item. 
+ * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + int slot, + struct btrfs_file_extent_item *fi) +{ + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + /* + * return the space used on disk if this item isn't + * compressed or encoded + */ + if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 && + btrfs_token_file_extent_encryption(eb, fi, &token) == 0 && + btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) { + return btrfs_file_extent_inline_item_len(eb, + btrfs_item_nr(slot)); + } + + /* otherwise use the ram bytes field */ + return btrfs_token_file_extent_ram_bytes(eb, fi, &token); +} + + /* btrfs_dev_stats_item */ static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, struct btrfs_dev_stats_item *ptr, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3dfd8db0e24..0165b8672f0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -772,7 +772,8 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); } else { WARN_ON(1); extent_end = search_start; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b6598783be..ad961a598c9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1281,7 +1281,8 @@ next_slot: nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); extent_end = ALIGN(extent_end, root->sectorsize); } else { BUG_ON(1); @@ -4023,7 +4024,7 @@ search_again: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, - fi); + path->slots[0], fi); } item_end--; } @@ -4093,6 +4094,12 @@ search_again: inode_sub_bytes(inode, item_end + 1 - new_size); } + + /* + * update the ram bytes to properly reflect + * the new size of our item + */ + btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(root, path, size, 1); @@ -6162,7 +6169,7 @@ again: btrfs_file_extent_num_bytes(leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, root->sectorsize); } next: @@ -6231,7 +6238,7 @@ next: goto out; } - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 4eed002b7cf..6efd70d3b64 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -249,7 +249,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) BTRFS_FILE_EXTENT_INLINE) { printk(KERN_INFO "\t\tinline extent data " "size %u\n", - btrfs_file_extent_inline_len(l, fi)); + btrfs_file_extent_inline_len(l, i, fi)); break; } printk(KERN_INFO "\t\textent data disk bytenr %llu " diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 85259cba784..730dce39585 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1377,7 +1377,7 @@ static int read_symlink(struct btrfs_root *root, 
BUG_ON(compression); off = btrfs_file_extent_inline_start(ei); - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei); ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); @@ -4207,7 +4207,8 @@ static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], ei); /* * it is possible the inline item won't cover the whole page, * but there may be items after this page. Make @@ -4448,7 +4449,8 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], fi); if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_inline_len(path->nodes[0], fi); + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); extent_end = ALIGN(key.offset + size, sctx->send_root->sectorsize); } else { @@ -4482,7 +4484,8 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], fi); if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_inline_len(path->nodes[0], fi); + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); extent_end = ALIGN(key->offset + size, sctx->send_root->sectorsize); } else { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b142b6dc96c..39d83da03e0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -570,7 +570,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (btrfs_file_extent_disk_bytenr(eb, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, item); + size = btrfs_file_extent_inline_len(eb, slot, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, root->sectorsize); } else { @@ -3367,7 +3367,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, extent); + len = btrfs_file_extent_inline_len(src, + src_path->slots[0], + extent); *last_extent = ALIGN(key.offset + len, log->sectorsize); } else { @@ -3431,7 +3433,7 @@ fill_holes: extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, extent); + len = btrfs_file_extent_inline_len(src, i, extent); extent_end = ALIGN(key.offset + len, log->sectorsize); } else { len = btrfs_file_extent_num_bytes(src, extent); -- cgit v1.2.3-70-g09d2 From 90d3e592e99b8e374ead2b45148abf506493a959 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 9 Jan 2014 17:28:00 -0800 Subject: Btrfs: setup inode location during btrfs_init_inode_locked We have a race during inode init because the BTRFS_I(inode)->location is setup after the inode hash table lock is dropped. btrfs_find_actor uses the location field, so our search might not find an existing inode in the hash table if we race with the inode init code. This commit changes things to setup the location field sooner. Also the find actor now uses only the location objectid to match inodes. 
For inode hashing, we just need a unique and stable test, it doesn't have to reflect the inode numbers we show to userland. Signed-off-by: Chris Mason CC: stable@vger.kernel.org --- fs/btrfs/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ad961a598c9..fb74a536add 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -61,7 +61,7 @@ #include "props.h" struct btrfs_iget_args { - u64 ino; + struct btrfs_key *location; struct btrfs_root *root; }; @@ -4977,7 +4977,9 @@ again: static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; + inode->i_ino = args->location->objectid; + memcpy(&BTRFS_I(inode)->location, args->location, + sizeof(*args->location)); BTRFS_I(inode)->root = args->root; return 0; } @@ -4985,19 +4987,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == btrfs_ino(inode) && + return args->location->objectid == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, + struct btrfs_key *location, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; - unsigned long hashval = btrfs_inode_hash(objectid, root); + unsigned long hashval = btrfs_inode_hash(location->objectid, root); - args.ino = objectid; + args.location = location; args.root = root; inode = iget5_locked(s, hashval, btrfs_find_actor, @@ -5014,13 +5016,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, { struct inode *inode; - inode = btrfs_iget_locked(s, location->objectid, root); + inode = btrfs_iget_locked(s, location, root); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); -- cgit v1.2.3-70-g09d2 From cf93da7bcf450cb4595055d491a0519cb39e68ed Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 29 Jan 2014 07:02:40 -0800 Subject: Btrfs: fix spin_unlock in check_ref_cleanup Our goto out should have gone a little farther. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 73b55d94b95..9c9ecc93ae2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5893,7 +5893,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); if (!head) - goto out; + goto out_delayed_unlock; spin_lock(&head->lock); if (rb_first(&head->ref_root)) @@ -5942,6 +5942,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, return ret; out: spin_unlock(&head->lock); + +out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } -- cgit v1.2.3-70-g09d2 From f9c96fcc501a43dbc292b17fc0ded4b54e63b79d Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 29 Jan 2014 11:34:38 -0500 Subject: NFSv4.1 free slot before resending I/O to MDS Fix a dynamic session slot leak where a slot is preallocated and I/O is resent through the MDS. 
Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4filelayout.c | 10 ++++++++-- fs/nfs/nfs4proc.c | 3 ++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5609edc742a..a5b27c2d968 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -270,6 +270,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser extern int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); +extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 03fd8be8c0c..20a56fa271b 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -335,8 +335,11 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && - task->tk_status == 0) + task->tk_status == 0) { + if (rdata->res.seq_res.sr_slot != NULL) + nfs41_sequence_done(task, &rdata->res.seq_res); return; + } /* Note this may cause RPC to be resent */ rdata->header->mds_ops->rpc_call_done(task, data); @@ -442,8 +445,11 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data) struct nfs_write_data *wdata = data; if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && - task->tk_status == 0) + task->tk_status == 0) { + if (wdata->res.seq_res.sr_slot != NULL) + nfs41_sequence_done(task, &wdata->res.seq_res); return; + } /* Note this may cause RPC to be resent */ wdata->header->mds_ops->rpc_call_done(task, data); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ed10d0d4f86..ae00c3ed733 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -592,7 +592,7 @@ out_unlock: nfs41_server_notify_highest_slotid_update(session->clp); } -static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot; @@ -692,6 +692,7 @@ out_retry: rpc_delay(task, NFS4_POLL_RETRY_MAX); return 0; } +EXPORT_SYMBOL_GPL(nfs41_sequence_done); static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) -- cgit v1.2.3-70-g09d2 From cab92c19821a814ecf5a5279e2699bf28e66caee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 29 Jan 2014 12:12:15 -0500 Subject: NFSv4: Fix a slot leak in nfs40_sequence_done The check for whether or not we sent an RPC call in nfs40_sequence_done is insufficient to decide whether or not we are holding a session slot, and thus should not be used to decide when to free that slot. This patch replaces the RPC_WAS_SENT() test with the correct test for whether or not slot == NULL. 
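The pattern adopted here, testing the resource handle itself rather than a correlated condition, can be sketched in plain C (hypothetical names, not the NFS client code):

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>

struct slot {
    int id;
};

struct request {
    bool was_sent;       /* proxy condition: can be true or false either way */
    struct slot *slot;   /* the resource that actually needs releasing */
};

static void release_slot_if_held(struct request *req)
{
    if (req->slot == NULL)      /* the authoritative test */
        return;
    printf("freeing slot %d\n", req->slot->id);
    free(req->slot);
    req->slot = NULL;
}

int main(void)
{
    struct slot *s = malloc(sizeof(*s));
    struct request req = { .was_sent = false, .slot = s };

    if (s)
        s->id = 1;
    /* A slot can be preallocated even though the request was never sent,
     * so gating the release on was_sent would leak it. */
    release_slot_if_held(&req);
    return 0;
}

A slot can be held even when the RPC was never transmitted, so the was-sent flag is only a proxy; checking the slot pointer cannot leak.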
Cc: Chuck Lever Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ae00c3ed733..493e9cce1f1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -539,7 +539,7 @@ static int nfs40_sequence_done(struct rpc_task *task, struct nfs4_slot *slot = res->sr_slot; struct nfs4_slot_table *tbl; - if (!RPC_WAS_SENT(task)) + if (slot == NULL) goto out; tbl = slot->table; -- cgit v1.2.3-70-g09d2 From a13ce7c629366880c1072d4d113d3dab6c53510d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 29 Jan 2014 12:24:03 -0500 Subject: NFSv4.1: Clean up nfs41_sequence_done Move the test for res->sr_slot == NULL out of the nfs41_sequence_free_slot helper and into the main function for efficiency. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 493e9cce1f1..42da6af7758 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -559,15 +559,10 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot_table *tbl; + struct nfs4_slot *slot = res->sr_slot; bool send_new_highest_used_slotid = false; - if (!res->sr_slot) { - /* just wake up the next guy waiting since - * we may have not consumed a slot after all */ - dprintk("%s: No slot\n", __func__); - return; - } - tbl = res->sr_slot->table; + tbl = slot->table; session = tbl->session; spin_lock(&tbl->slot_tbl_lock); @@ -577,11 +572,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) if (tbl->highest_used_slotid > tbl->target_highest_slotid) send_new_highest_used_slotid = true; - if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { + if (nfs41_wake_and_assign_slot(tbl, slot)) { send_new_highest_used_slotid = false; goto out_unlock; } - nfs4_free_slot(tbl, res->sr_slot); + nfs4_free_slot(tbl, slot); if (tbl->highest_used_slotid != NFS4_NO_SLOT) send_new_highest_used_slotid = false; @@ -595,16 +590,17 @@ out_unlock: int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; - struct nfs4_slot *slot; + struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; bool interrupted = false; int ret = 1; + if (slot == NULL) + goto out_noaction; /* don't increment the sequence number if the task wasn't sent */ if (!RPC_WAS_SENT(task)) goto out; - slot = res->sr_slot; session = slot->table->session; if (slot->interrupted) { @@ -679,6 +675,7 @@ out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); nfs41_sequence_free_slot(res); +out_noaction: return ret; retry_nowait: if (rpc_restart_call_prepare(task)) { -- cgit v1.2.3-70-g09d2 From 905e7dafbe1c69dd69197a9e8ba2e4bf518c9926 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 29 Jan 2014 12:26:57 -0500 Subject: NFSv4.1: Cleanup It is now completely safe to call nfs41_sequence_free_slot with a NULL slot. 
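The convention this cleanup leans on, a teardown path that quietly does nothing when no resource is held (in the style of free(NULL) and kfree(NULL)), in a minimal sketch with made-up names:

#include <stdlib.h>

struct res {
    int data;
};

static void res_put(struct res *r)
{
    if (r == NULL)      /* NULL-safe: callers may pass an unset pointer */
        return;
    free(r);
}

int main(void)
{
    struct res *maybe = NULL;

    if (rand() & 1)
        maybe = calloc(1, sizeof(*maybe));

    /* No "if (maybe)" guard needed at the call site. */
    res_put(maybe);
    return 0;
}

Moving the NULL check into the callee puts the guard in one place and lets every caller drop its own, which is what the filelayout callers do in the diff that follows.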
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 20a56fa271b..12c8132ad40 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -336,8 +336,7 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && task->tk_status == 0) { - if (rdata->res.seq_res.sr_slot != NULL) - nfs41_sequence_done(task, &rdata->res.seq_res); + nfs41_sequence_done(task, &rdata->res.seq_res); return; } @@ -446,8 +445,7 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data) if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && task->tk_status == 0) { - if (wdata->res.seq_res.sr_slot != NULL) - nfs41_sequence_done(task, &wdata->res.seq_res); + nfs41_sequence_done(task, &wdata->res.seq_res); return; } -- cgit v1.2.3-70-g09d2 From 72466d0b92e04a7e0e5abf74c86eb352225346e4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 29 Jan 2014 06:22:25 -0800 Subject: ceph: fix posix ACL hooks The merge of commit 7221fe4c2ed7 ("ceph: add acl for cephfs") raced with upstream changes in the generic POSIX ACL code (eg commit 2aeccbe957d0 "fs: add generic xattr_acl handlers" and others). Some of the fallout was fixed in commit 4db658ea0ca ("ceph: Fix up after semantic merge conflict"), but it was incomplete: the set_acl inode_operation wasn't getting set, and the prototype needed to be adjusted a bit (it doesn't take a dentry anymore). Signed-off-by: Sage Weil Signed-off-by: Ilya Dryomov Signed-off-by: Linus Torvalds --- fs/ceph/acl.c | 9 ++++----- fs/ceph/dir.c | 1 + fs/ceph/inode.c | 2 ++ fs/ceph/super.h | 3 +++ 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index f6911284c9b..66d377a12f7 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -107,14 +107,14 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) return acl; } -static int ceph_set_acl(struct dentry *dentry, struct inode *inode, - struct posix_acl *acl, int type) +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int ret = 0, size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + struct dentry *dentry = d_find_alias(inode); if (acl) { ret = posix_acl_valid(acl); @@ -208,8 +208,7 @@ int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) if (IS_POSIXACL(dir) && acl) { if (S_ISDIR(inode->i_mode)) { - ret = ceph_set_acl(dentry, inode, acl, - ACL_TYPE_DEFAULT); + ret = ceph_set_acl(inode, acl, ACL_TYPE_DEFAULT); if (ret) goto out_release; } @@ -217,7 +216,7 @@ int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) if (ret < 0) goto out; else if (ret > 0) - ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); + ret = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); else cache_no_acl(inode); } else { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 619616d585b..6da4df84ba3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1303,6 +1303,7 @@ const struct inode_operations ceph_dir_iops = { .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8b8b506636c..32d519d8a2e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c 
@@ -97,6 +97,7 @@ const struct inode_operations ceph_file_iops = { .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; @@ -1616,6 +1617,7 @@ static const struct inode_operations ceph_symlink_iops = { .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 345933948b6..aa260590f61 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -718,6 +718,7 @@ extern void ceph_queue_writeback(struct inode *inode); extern int ceph_do_getattr(struct inode *inode, int mask); extern int ceph_permission(struct inode *inode, int mask); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); +extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); @@ -741,12 +742,14 @@ extern const struct xattr_handler *ceph_xattr_handlers[]; #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); int ceph_init_acl(struct dentry *, struct inode *, struct inode *); void ceph_forget_all_cached_acls(struct inode *inode); #else #define ceph_get_acl NULL +#define ceph_set_acl NULL static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) -- cgit v1.2.3-70-g09d2 From dfd948e32af2e7b28bcd7a490c0a30d4b8df2a36 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 29 Jan 2014 14:05:44 -0800 Subject: fs/compat: fix parameter handling for compat readv/writev syscalls We got a report that the pwritev syscall does not work correctly in compat mode on s390. It turned out that with commit 72ec35163f9f ("switch compat readv/writev variants to COMPAT_SYSCALL_DEFINE") we lost the zero extension of a couple of syscall parameters because some of the parameter types haven't been converted from unsigned long to compat_ulong_t. This is needed for architectures where the ABI requires that the caller of a function performs zero and/or sign extension to 64 bit of all parameters. Signed-off-by: Heiko Carstens Cc: Al Viro Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Hendrik Brueckner Cc: Martin Schwidefsky Cc: [v3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/read_write.c | 16 ++++++++-------- include/linux/compat.h | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 1193ffd0356..edc5746a902 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -964,9 +964,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1001,9 +1001,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_preadv64(fd, vec, vlen, pos); @@ -1031,9 +1031,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1068,9 +1068,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_pwritev64(fd, vec, vlen, pos); diff --git a/include/linux/compat.h b/include/linux/compat.h index eb8a49d75ab..c5642b83e02 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -327,16 +327,16 @@ asmlinkage long compat_sys_keyctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5); asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32); -asmlinkage ssize_t compat_sys_readv(unsigned long fd, - const struct compat_iovec __user *vec, unsigned long vlen); -asmlinkage ssize_t compat_sys_writev(unsigned long fd, - const struct compat_iovec __user *vec, unsigned long vlen); -asmlinkage ssize_t compat_sys_preadv(unsigned long fd, +asmlinkage ssize_t compat_sys_readv(compat_ulong_t fd, + const struct compat_iovec __user *vec, compat_ulong_t vlen); +asmlinkage ssize_t compat_sys_writev(compat_ulong_t fd, + const struct compat_iovec __user *vec, compat_ulong_t vlen); +asmlinkage ssize_t compat_sys_preadv(compat_ulong_t fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high); -asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, + compat_ulong_t vlen, u32 pos_low, u32 pos_high); +asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high); + compat_ulong_t vlen, u32 pos_low, u32 pos_high); asmlinkage long comat_sys_lseek(unsigned int, compat_off_t, unsigned int); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, -- cgit v1.2.3-70-g09d2 From d8d14bd09cddbaf0168d61af638455a26bd027ff Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 29 Jan 2014 14:05:46 -0800 Subject: fs/compat: fix lookup_dcookie() parameter handling Commit 
d5dc77bfeeab ("consolidate compat lookup_dcookie()") converted all architectures to the new compat_sys_lookup_dcookie() syscall. The "len" parameter of the new compat syscall must have the type compat_size_t in order to enforce zero extension for architectures where the ABI requires that the caller of a function performs zero and/or sign extension to 64 bit of all parameters. Signed-off-by: Heiko Carstens Cc: Al Viro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Hendrik Brueckner Cc: Martin Schwidefsky Cc: [v3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcookies.c | 2 +- include/linux/compat.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcookies.c b/fs/dcookies.c index ab5954b5026..ac44a69fbea 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -204,7 +204,7 @@ out: } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) { #ifdef __BIG_ENDIAN return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); diff --git a/include/linux/compat.h b/include/linux/compat.h index c5642b83e02..19f6003291d 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -422,7 +422,7 @@ extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data); -asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, size_t); +asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t); /* * epoll (fs/eventpoll.c) compat bits follow ... */ -- cgit v1.2.3-70-g09d2 From 32d35d44d03c502a2fe2cd9ba9b70d5e220da439 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Thu, 30 Jan 2014 11:33:19 +0100 Subject: ceph: remove duplicate declaration of ceph_setattr Signed-off-by: Peter Rosin Signed-off-by: Sage Weil --- fs/ceph/super.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index aa260590f61..19793b56d0a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -718,7 +718,6 @@ extern void ceph_queue_writeback(struct inode *inode); extern int ceph_do_getattr(struct inode *inode, int mask); extern int ceph_permission(struct inode *inode, int mask); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); -extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -- cgit v1.2.3-70-g09d2 From 5f13ee9c1ce87b3c99928ab33ef43a2c0d3fd220 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jan 2014 06:01:52 -0800 Subject: nfs: fix xattr inode op pointers when disabled Chris Mason reported a NULL pointer dereference in generic_getxattr() that was due to sb->s_xattr being NULL. The reason is that the nfs #ifdef's for ACL support were misplaced, and the nfs3 inode operations had the xattr operation pointers set up, even though xattrs were not actually supported. As a result, the xattr code was being called without the infrastructure having been set up. Move the #ifdef's appropriately.
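As background for the oops described above, here is a rough, self-contained model of the failing path; it is illustrative only, not the actual VFS code, and the name resolve_handler is invented for the sketch. generic_getxattr() resolves the attribute name against the superblock's xattr handler table, so wiring it into an inode_operations table on a filesystem that never registers handlers means walking a NULL pointer:

	#include <stddef.h>
	#include <string.h>

	struct xattr_handler { const char *prefix; };

	/* 'handlers' plays the role of sb->s_xattr; it is NULL when the
	 * filesystem registered no xattr handlers at all. */
	static const struct xattr_handler *
	resolve_handler(const struct xattr_handler **handlers, const char *name)
	{
		if (handlers == NULL)			/* without this check ... */
			return NULL;
		for (; *handlers != NULL; handlers++)	/* ... this walk dereferences NULL */
			if (strncmp(name, (*handlers)->prefix, strlen((*handlers)->prefix)) == 0)
				return *handlers;
		return NULL;
	}

The patch takes the other route: rather than teaching the generic code about a missing table, it keeps the generic_*xattr operations out of the nfs3 inode operations unless CONFIG_NFS_V3_ACL, which is what registers the handlers, is enabled.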
Reported-and-tested-by: Chris Mason Acked-by: Al Viro viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds --- fs/nfs/nfs3proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d2255d70542..aa9bc973f36 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -924,11 +924,11 @@ static const struct inode_operations nfs3_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL .listxattr = generic_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, -#ifdef CONFIG_NFS_V3_ACL .get_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif @@ -938,11 +938,11 @@ static const struct inode_operations nfs3_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL .listxattr = generic_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, -#ifdef CONFIG_NFS_V3_ACL .get_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif -- cgit v1.2.3-70-g09d2 From 758582361976e9640a1cb9516c0af10cdf8e4753 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jan 2014 08:56:36 -0800 Subject: ceph: simplify ceph_{get,init}_acl - ->get_acl only gets called after we checked for a cached ACL, so no need to call get_cached_acl again. - no need to check IS_POSIXACL in ->get_acl, without that it should never get set as all the callers that set it already have the check. - you should be able to use the full posix_acl_create in CEPH Signed-off-by: Christoph Hellwig Signed-off-by: Sage Weil --- fs/ceph/acl.c | 56 ++++++++++++++++---------------------------------------- 1 file changed, 16 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 66d377a12f7..9ab312e4963 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -66,13 +66,6 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; - if (!IS_POSIXACL(inode)) - return NULL; - - acl = ceph_get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -190,41 +183,24 @@ out: int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; - int ret = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (IS_POSIXACL(dir)) { - acl = ceph_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) { - ret = PTR_ERR(acl); - goto out; - } - } + struct posix_acl *default_acl, *acl; + int error; - if (!acl) - inode->i_mode &= ~current_umask(); - } + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; - if (IS_POSIXACL(dir) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = ceph_set_acl(inode, acl, ACL_TYPE_DEFAULT); - if (ret) - goto out_release; - } - ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (ret < 0) - goto out; - else if (ret > 0) - ret = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); - else - cache_no_acl(inode); - } else { + if (!default_acl && !acl) cache_no_acl(inode); - } -out_release: - posix_acl_release(acl); -out: - return ret; + if (default_acl) { + error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return error; } -- 
cgit v1.2.3-70-g09d2 From a1800acaf7d1c2bf6d68b9a8f4ab8560cc66555a Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Mon, 27 Jan 2014 15:31:09 -0600 Subject: nfs: initialize the ACL support bits to zero. Avoid returning incorrect acl mask attributes when the server doesn't support ACLs. Signed-off-by: Malahal Naineni Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8c21d69a9dc..72f3bf1754e 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3449,7 +3449,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint { __be32 *p; - *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; + *res = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { -- cgit v1.2.3-70-g09d2 From a9a315d41407cd1079eb815f4adae897cc08b0d2 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Fri, 31 Jan 2014 14:27:16 +0000 Subject: cifs: Fix check for regular file in couldbe_mf_symlink() MF Symlinks are regular files containing content in a specified format. The function couldbe_mf_symlink() checks the mode for a set S_IFREG bit as a test to confirm that it is a regular file. This bit is also set for other filetypes and simply checking for this bit being set may return false positives. We ensure that we are actually checking for a regular file by using the S_ISREG macro to test instead. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Reported-by: Neil Brown Reported-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/link.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 52f41f9f7de..264ece71bdb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -185,7 +185,7 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) bool couldbe_mf_symlink(const struct cifs_fattr *fattr) { - if (!(fattr->cf_mode & S_IFREG)) + if (!S_ISREG(fattr->cf_mode)) /* it's not a symlink */ return false; -- cgit v1.2.3-70-g09d2 From 77516dc92a14bb3853df959c0cb3ef51e659f5af Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 31 Jan 2014 07:25:25 -0800 Subject: ceph: fix missing dput in ceph_set_acl Add matching dput() for d_find_alias(). Move d_find_alias() down a bit at Julia's suggestion. 
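For readers who do not have the dcache API in their head: d_find_alias() returns a referenced dentry for the inode (or NULL when it has no alias), so every successful call must be paired with a dput(). A minimal sketch of the usual pattern, independent of the ceph specifics:

	struct dentry *dentry = d_find_alias(inode);

	if (dentry) {
		/* ... operate on the dentry ... */
		dput(dentry);	/* drop the reference taken by d_find_alias() */
	}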
[ Introduced by commit 72466d0b92e0: "ceph: fix posix ACL hooks" ] Reported-by: Fengguang Wu Reported-by: Julia Lawall Signed-off-by: Sage Weil Reviewed-by: Ilya Dryomov Signed-off-by: Linus Torvalds --- fs/ceph/acl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 9ab312e4963..4c2d452c4bf 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -107,7 +107,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) char *value = NULL; struct iattr newattrs; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; - struct dentry *dentry = d_find_alias(inode); + struct dentry *dentry; if (acl) { ret = posix_acl_valid(acl); @@ -151,12 +151,13 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out_free; } + dentry = d_find_alias(inode); if (new_mode != old_mode) { newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; ret = ceph_setattr(dentry, &newattrs); if (ret) - goto out_free; + goto out_dput; } if (value) @@ -170,11 +171,13 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) newattrs.ia_valid = ATTR_MODE; ceph_setattr(dentry, &newattrs); } - goto out_free; + goto out_dput; } ceph_set_cached_acl(inode, type, acl); +out_dput: + dput(dentry); out_free: kfree(value); out: -- cgit v1.2.3-70-g09d2 From 9115eac2c788c17b57c9256cb322fa7371972ddf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 27 Jan 2014 13:33:28 -0500 Subject: vfs: unexport the getname() symbol Leaving getname() exported when putname() isn't is a bad idea. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index bcb838e2e52..445d9bbc21e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -209,7 +209,6 @@ getname(const char __user * filename) { return getname_flags(filename, 0, NULL); } -EXPORT_SYMBOL(getname); #ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) -- cgit v1.2.3-70-g09d2 From 807612db2f9940b9fa6deaef054eb16d51bd3e00 Mon Sep 17 00:00:00 2001 From: Andrew Ruder Date: Thu, 30 Jan 2014 09:26:54 -0600 Subject: fs/super.c: sync ro remount after blocking writers Move sync_filesystem() after sb_prepare_remount_readonly(). If writers sneak in anywhere from sync_filesystem() to sb_prepare_remount_readonly() it can cause inodes to be dirtied and writeback to occur well after sys_mount() has completed successfully. This was spotted by corrupted ubifs filesystems on reboot, but it appears that it can cause issues with any filesystem using writeback.
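The intended ordering, condensed from the diff below into a sketch of do_remount_sb() (error handling and the force/busy cases omitted): first shut out new writers, then flush whatever is already dirty, and only then call into the filesystem:

	shrink_dcache_sb(sb);

	if (remount_ro)
		retval = sb_prepare_remount_readonly(sb);	/* block new writers */

	sync_filesystem(sb);	/* nothing can dirty more inodes behind our back now */

	if (sb->s_op->remount_fs)
		retval = sb->s_op->remount_fs(sb, &flags, data);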
Cc: Artem Bityutskiy Cc: Christoph Hellwig Cc: Alexander Viro CC: Richard Weinberger Co-authored-by: Richard Weinberger Signed-off-by: Andrew Ruder Signed-off-by: Al Viro --- fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index cecd780e0f4..80d5cf2ca76 100644 --- a/fs/super.c +++ b/fs/super.c @@ -703,7 +703,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - sync_filesystem(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); @@ -720,6 +719,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } } + sync_filesystem(sb); + if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); if (retval) { -- cgit v1.2.3-70-g09d2 From b168fff72109a3627686578e31e745f778832f98 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jan 2014 23:59:19 -0800 Subject: hfsplus: use xattr handlers for removexattr hfsplus was already using the handlers for get and set operations, and with the removal of can_set_xattr we now allow operations that wouldn't otherwise be allowed. With this we can also centralize the special-casing of the osx. attrs that don't have prefixes on disk in the osx xattr handlers. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/hfsplus/dir.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hfsplus/xattr.c | 60 ++++++++++++++++-------------------------------------- fs/hfsplus/xattr.h | 2 -- 4 files changed, 19 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 9ee62985e73..bdec66522de 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -529,7 +529,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, - .removexattr = hfsplus_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, .set_acl = hfsplus_set_posix_acl, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4551cbd6bd4..fa929f325f8 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -331,7 +331,7 @@ static const struct inode_operations hfsplus_file_inode_operations = { .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, - .removexattr = hfsplus_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, .set_acl = hfsplus_set_posix_acl, diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 0b4a5c9b93c..4e27edc082a 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -11,6 +11,8 @@ #include "xattr.h" #include "acl.h" +static int hfsplus_removexattr(struct inode *inode, const char *name); + const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, @@ -274,14 +276,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) - name += XATTR_MAC_OSX_PREFIX_LEN; - - if (value == NULL) { - value = ""; - size = 0; - } + if (value == NULL) + return hfsplus_removexattr(inode, name); err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { @@ -399,16 +395,11 @@ end_setxattr: return err; } -static inline int is_osx_xattr(const char *xattr_name) -{ - return
!is_known_namespace(xattr_name); -} - static int name_len(const char *xattr_name, int xattr_name_len) { int len = xattr_name_len + 1; - if (is_osx_xattr(xattr_name)) + if (!is_known_namespace(xattr_name)) len += XATTR_MAC_OSX_PREFIX_LEN; return len; @@ -419,7 +410,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) int len = name_len; int offset = 0; - if (is_osx_xattr(xattr_name)) { + if (!is_known_namespace(xattr_name)) { strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); offset += XATTR_MAC_OSX_PREFIX_LEN; len += XATTR_MAC_OSX_PREFIX_LEN; @@ -497,18 +488,6 @@ ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) { - /* skip "osx." prefix */ - name += XATTR_MAC_OSX_PREFIX_LEN; - /* - * Don't allow retrieving properly prefixed attributes - * by prepending them with "osx." - */ - if (is_known_namespace(name)) - return -EOPNOTSUPP; - } - if (!strcmp_xattr_finder_info(name)) return hfsplus_getxattr_finder_info(inode, value, size); @@ -743,28 +722,18 @@ end_listxattr: return res; } -int hfsplus_removexattr(struct dentry *dentry, const char *name) +static int hfsplus_removexattr(struct inode *inode, const char *name) { int err = 0; - struct inode *inode = dentry->d_inode; struct hfs_find_data cat_fd; u16 flags; u16 cat_entry_type; int is_xattr_acl_deleted = 0; int is_all_xattrs_deleted = 0; - if ((!S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode)) || - HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) - name += XATTR_MAC_OSX_PREFIX_LEN; - if (!strcmp_xattr_finder_info(name)) return -EOPNOTSUPP; @@ -838,8 +807,12 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, if (len > HFSPLUS_ATTR_MAX_STRLEN) return -EOPNOTSUPP; - strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); - strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "osx." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; return hfsplus_getxattr(dentry, xattr_name, buffer, size); } @@ -857,12 +830,13 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, if (len > HFSPLUS_ATTR_MAX_STRLEN) return -EOPNOTSUPP; + /* + * Don't allow setting properly prefixed attributes + * by prepending them with "osx." 
+ */ if (is_known_namespace(name)) return -EOPNOTSUPP; - strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); - strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); } diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 9e214490c31..288530cf80b 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -40,8 +40,6 @@ static inline ssize_t hfsplus_getxattr(struct dentry *dentry, ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); -int hfsplus_removexattr(struct dentry *dentry, const char *name); - int hfsplus_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); -- cgit v1.2.3-70-g09d2 From d22e6338db7f613dd4f6095c190682fcc519e4b7 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Fri, 31 Jan 2014 15:41:58 -0500 Subject: Fix mountpoint reference leakage in linkat Recent changes to retry on ESTALE in linkat (commit 442e31ca5a49e398351b2954b51f578353fdf210) introduced a mountpoint reference leak and a small memory leak in case a filesystem link operation returns ESTALE which is pretty normal for distributed filesystems like lustre, nfs and so on. Free old_path in such a case. [AV: there was another missing path_put() nearby - on the previous goto retry] Signed-off-by: Oleg Drokin Signed-off-by: Al Viro --- fs/namei.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 445d9bbc21e..d580df2e680 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3926,10 +3926,13 @@ out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); - if (!error) + if (!error) { + path_put(&old_path); goto retry; + } } if (retry_estale(error, how)) { + path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } -- cgit v1.2.3-70-g09d2 From 1bda2ac071cdfad217856126859bc0dc88ee6f83 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Tue, 28 Jan 2014 20:26:44 +0000 Subject: afs: proc cells and rootcell are writeable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both proc files are writeable and used for configuring cells. But the correct mode flag for these writeable files is missing. Without this patch both proc files are read only. [ It turns out they aren't really read-only, since root can write to them even if the write bit isn't set due to CAP_DAC_OVERRIDE ] Signed-off-by: Pali Rohár Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/proc.c b/fs/afs/proc.c index bddc5120ed4..24a905b076f 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -130,8 +130,8 @@ int afs_proc_init(void) if (!proc_afs) goto error_dir; - if (!proc_create("cells", 0, proc_afs, &afs_proc_cells_fops) || - !proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops)) + if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops)) goto error_tree; _leave(" = 0"); -- cgit v1.2.3-70-g09d2 From 2cbe5c76fc5e38e9af4b709593146e4b8272b69e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 29 Jan 2014 00:10:44 +0100 Subject: hpfs: remember free space Previously, hpfs scanned all bitmaps each time the user asked for free space using statfs. This patch changes it so that hpfs scans the bitmaps only once, remembers the free space and on next invocation of statfs it returns the value instantly.
New versions of wine are hammering on the statfs syscall very heavily, making some games unplayable when they're stored on hpfs, with load times in minutes. This should be backported to the stable kernels because it fixes user-visible problem (excessive level load times in wine). Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/hpfs/alloc.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/hpfs/hpfs_fn.h | 2 +- fs/hpfs/super.c | 29 ++++++++++++++++++------ 3 files changed, 87 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index cdb84a83806..58b5106186d 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -8,6 +8,58 @@ #include "hpfs_fn.h" +static void hpfs_claim_alloc(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free != (unsigned)-1) { + if (unlikely(!sbi->sb_n_free)) { + hpfs_error(s, "free count underflow, allocating sector %08x", sec); + sbi->sb_n_free = -1; + return; + } + sbi->sb_n_free--; + } +} + +static void hpfs_claim_free(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free != (unsigned)-1) { + if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) { + hpfs_error(s, "free count overflow, freeing sector %08x", sec); + sbi->sb_n_free = -1; + return; + } + sbi->sb_n_free++; + } +} + +static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes != (unsigned)-1) { + if (unlikely(!sbi->sb_n_free_dnodes)) { + hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec); + sbi->sb_n_free_dnodes = -1; + return; + } + sbi->sb_n_free_dnodes--; + } +} + +static void hpfs_claim_dirband_free(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes != (unsigned)-1) { + if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) { + hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec); + sbi->sb_n_free_dnodes = -1; + return; + } + sbi->sb_n_free_dnodes++; + } +} + /* * Check if a sector is allocated in bitmap * This is really slow. Turned on only if chk==2 @@ -203,9 +255,15 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa } sec = 0; ret: + if (sec) { + i = 0; + do + hpfs_claim_alloc(s, sec + i); + while (unlikely(++i < n)); + } if (sec && f_p) { for (i = 0; i < forward; i++) { - if (!hpfs_alloc_if_possible(s, sec + i + 1)) { + if (!hpfs_alloc_if_possible(s, sec + n + i)) { hpfs_error(s, "Prealloc doesn't work! 
Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); sec = 0; break; @@ -228,6 +286,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near) nr >>= 2; sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); if (!sec) return 0; + hpfs_claim_dirband_alloc(s, sec); return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; } @@ -242,6 +301,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec) bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); + hpfs_claim_alloc(s, sec); return 1; } hpfs_brelse4(&qbh); @@ -275,6 +335,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) return; } bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); + hpfs_claim_free(s, sec); if (!--n) { hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); @@ -359,6 +420,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno) bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); + hpfs_claim_dirband_free(s, dno); } } @@ -366,7 +428,7 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, dnode_secno *dno, struct quad_buffer_head *qbh) { struct dnode *d; - if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { + if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) { if (!(*dno = alloc_in_dirband(s, near))) if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; } else { diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 6797bf80f6e..3ba49c080e4 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -312,7 +312,7 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) __printf(2, 3) void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); -unsigned hpfs_count_one_bitmap(struct super_block *, secno); +unsigned hpfs_get_free_dnodes(struct super_block *); /* * local time (HPFS) to GMT (Unix) diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index b8d01ef6f53..4534ff688b7 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -121,7 +121,7 @@ static void hpfs_put_super(struct super_block *s) call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); } -unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) +static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) { struct quad_buffer_head qbh; unsigned long *bits; @@ -129,7 +129,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) bits = hpfs_map_4sectors(s, secno, &qbh, 0); if (!bits) - return 0; + return (unsigned)-1; count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); hpfs_brelse4(&qbh); return count; @@ -144,30 +144,45 @@ static unsigned count_bitmaps(struct super_block *s) hpfs_prefetch_bitmap(s, n); } for (n = 0; n < n_bands; n++) { + unsigned c; hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); - count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + if (c != (unsigned)-1) + count += c; } return count; } +unsigned hpfs_get_free_dnodes(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes == (unsigned)-1) { + unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap); + if (c == (unsigned)-1) + return 0; + sbi->sb_n_free_dnodes = c; + } + return sbi->sb_n_free_dnodes; +} + static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); u64 id = 
huge_encode_dev(s->s_bdev->bd_dev); + hpfs_lock(s); - /*if (sbi->sb_n_free == -1) {*/ + if (sbi->sb_n_free == (unsigned)-1) sbi->sb_n_free = count_bitmaps(s); - sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap); - /*}*/ + buf->f_type = s->s_magic; buf->f_bsize = 512; buf->f_blocks = sbi->sb_fs_size; buf->f_bfree = sbi->sb_n_free; buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; - buf->f_ffree = sbi->sb_n_free_dnodes; + buf->f_ffree = hpfs_get_free_dnodes(s); buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = 254; -- cgit v1.2.3-70-g09d2 From 1c0b8a7a62c3d3ebf53a8e40cc6da22f5e192d63 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 29 Jan 2014 00:11:33 +0100 Subject: hpfs: optimize quad buffer loading HPFS needs to load 4 consecutive 512-byte sectors when accessing the directory nodes or bitmaps. We can't switch to 2048-byte block size because files are allocated in the units of 512-byte sectors. Previously, the driver would allocate a 2048-byte area using kmalloc, copy the data from four buffers to this area and eventually copy them back if they were modified. In the current implementation of the buffer cache, buffers are allocated in the pagecache. That means that 4 consecutive 512-byte buffers are stored in consecutive areas in the kernel address space. So, we don't need to allocate extra memory and copy the content of the buffers there. This patch optimizes the code to avoid copying the buffers. It checks if the four buffers are stored in contiguous memory - if they are not, it falls back to allocating a 2048-byte area and copying data there. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/buffer.c | 96 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index 4d0a1afa058..139ef1684d0 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -86,7 +86,6 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, int ahead) { - struct buffer_head *bh; char *data; hpfs_lock_assert(s); @@ -100,34 +99,32 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe hpfs_prefetch_sectors(s, secno, 4 + ahead); + if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0; + if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1; + if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2; + if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3; + + if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && + likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && + likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { + return qbh->data = qbh->bh[0]->b_data; + } + qbh->data = data = kmalloc(2048, GFP_NOFS); if (!data) { printk("HPFS: hpfs_map_4sectors: out of memory\n"); - goto bail; + goto bail4; } - qbh->bh[0] = bh = sb_bread(s, secno); - if (!bh) - goto bail0; - memcpy(data, bh->b_data, 512); - - qbh->bh[1] = bh = sb_bread(s, secno + 1); - if (!bh) - goto bail1; - memcpy(data + 512, bh->b_data, 512); - - qbh->bh[2] = bh = sb_bread(s, secno + 2); - if (!bh) - goto bail2; - memcpy(data + 2 * 512, bh->b_data, 512); - - qbh->bh[3] = bh = sb_bread(s, secno + 3); - if (!bh) - goto bail3; - memcpy(data + 3 * 512, bh->b_data, 512); + memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512); + memcpy(data + 1 * 512, qbh->bh[1]->b_data, 
512); + memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512); + memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512); return data; + bail4: + brelse(qbh->bh[3]); bail3: brelse(qbh->bh[2]); bail2: @@ -135,9 +132,6 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe bail1: brelse(qbh->bh[0]); bail0: - kfree(data); - printk("HPFS: hpfs_map_4sectors: read error\n"); - bail: return NULL; } @@ -155,44 +149,54 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno, return NULL; } - /*return hpfs_map_4sectors(s, secno, qbh, 0);*/ + if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0; + if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1; + if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2; + if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3; + + if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && + likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && + likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { + return qbh->data = qbh->bh[0]->b_data; + } + if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { printk("HPFS: hpfs_get_4sectors: out of memory\n"); - return NULL; + goto bail4; } - if (!(hpfs_get_sector(s, secno, &qbh->bh[0]))) goto bail0; - if (!(hpfs_get_sector(s, secno + 1, &qbh->bh[1]))) goto bail1; - if (!(hpfs_get_sector(s, secno + 2, &qbh->bh[2]))) goto bail2; - if (!(hpfs_get_sector(s, secno + 3, &qbh->bh[3]))) goto bail3; - memcpy(qbh->data, qbh->bh[0]->b_data, 512); - memcpy(qbh->data + 512, qbh->bh[1]->b_data, 512); - memcpy(qbh->data + 2*512, qbh->bh[2]->b_data, 512); - memcpy(qbh->data + 3*512, qbh->bh[3]->b_data, 512); return qbh->data; - bail3: brelse(qbh->bh[2]); - bail2: brelse(qbh->bh[1]); - bail1: brelse(qbh->bh[0]); - bail0: +bail4: + brelse(qbh->bh[3]); +bail3: + brelse(qbh->bh[2]); +bail2: + brelse(qbh->bh[1]); +bail1: + brelse(qbh->bh[0]); +bail0: return NULL; } void hpfs_brelse4(struct quad_buffer_head *qbh) { - brelse(qbh->bh[3]); - brelse(qbh->bh[2]); - brelse(qbh->bh[1]); + if (unlikely(qbh->data != qbh->bh[0]->b_data)) + kfree(qbh->data); brelse(qbh->bh[0]); - kfree(qbh->data); + brelse(qbh->bh[1]); + brelse(qbh->bh[2]); + brelse(qbh->bh[3]); } void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) { - memcpy(qbh->bh[0]->b_data, qbh->data, 512); - memcpy(qbh->bh[1]->b_data, qbh->data + 512, 512); - memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); - memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + if (unlikely(qbh->data != qbh->bh[0]->b_data)) { + memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512); + memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512); + memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); + memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + } mark_buffer_dirty(qbh->bh[0]); mark_buffer_dirty(qbh->bh[1]); mark_buffer_dirty(qbh->bh[2]); -- cgit v1.2.3-70-g09d2
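A closing note on the last patch: the optimization pays off because sb_bread() on four consecutive 512-byte sectors normally hands back buffers that sit back-to-back in the same page of the buffer cache, so their b_data pointers are exactly 512 bytes apart. A condensed sketch of that contiguity test, taken from the hunk above with the fallback path omitted:

	if (qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512 &&
	    qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512 &&
	    qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)
		return qbh->data = qbh->bh[0]->b_data;	/* use the buffers in place */

	/* otherwise fall back to the kmalloc'ed 2048-byte bounce buffer */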