From b1b5d7f9b7ac6a8e3452ac43a53eebf69fdf5a77 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@austin.ibm.com>
Date: Tue, 30 Aug 2005 14:28:56 -0500
Subject: JFS: jfs_delete_inode should always call clear_inode.

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
---
 fs/jfs/inode.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 767c7ecb429..37da3e33e75 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -128,21 +128,21 @@ void jfs_delete_inode(struct inode *inode)
 {
 	jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
 
-	if (is_bad_inode(inode) ||
-	    (JFS_IP(inode)->fileset != cpu_to_le32(FILESYSTEM_I)))
-			return;
+	if (!is_bad_inode(inode) &&
+	    (JFS_IP(inode)->fileset == cpu_to_le32(FILESYSTEM_I))) {
 
-	if (test_cflag(COMMIT_Freewmap, inode))
-		jfs_free_zero_link(inode);
+		if (test_cflag(COMMIT_Freewmap, inode))
+			jfs_free_zero_link(inode);
 
-	diFree(inode);
+		diFree(inode);
 
-	/*
-	 * Free the inode from the quota allocation.
-	 */
-	DQUOT_INIT(inode);
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+		/*
+		 * Free the inode from the quota allocation.
+		 */
+		DQUOT_INIT(inode);
+		DQUOT_FREE_INODE(inode);
+		DQUOT_DROP(inode);
+	}
 
 	clear_inode(inode);
 }
-- 
cgit v1.2.3-70-g09d2


From 4f4b401bfaa97edbea41a1fcab794148e7ac0421 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@austin.ibm.com>
Date: Thu, 1 Sep 2005 09:02:43 -0500
Subject: JFS: allow extended attributes to be set within a existing
 transaction

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
---
 fs/jfs/acl.c       | 24 +++++++++++++++------
 fs/jfs/jfs_acl.h   | 12 +++++++++--
 fs/jfs/jfs_xattr.h |  4 ++--
 fs/jfs/namei.c     | 63 ++++++++++++++++++++++++++++++++++++------------------
 fs/jfs/xattr.c     | 58 ++++++++++++++++++++++++++++++++++---------------
 5 files changed, 112 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index e892dab40c2..461e4934ca7 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -23,6 +23,7 @@
 #include <linux/quotaops.h>
 #include <linux/posix_acl_xattr.h>
 #include "jfs_incore.h"
+#include "jfs_txnmgr.h"
 #include "jfs_xattr.h"
 #include "jfs_acl.h"
 
@@ -75,7 +76,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
-static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
+		       struct posix_acl *acl)
 {
 	char *ea_name;
 	struct jfs_inode_info *ji = JFS_IP(inode);
@@ -110,7 +112,7 @@ static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		if (rc < 0)
 			goto out;
 	}
-	rc = __jfs_setxattr(inode, ea_name, value, size, 0);
+	rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0);
 out:
 	kfree(value);
 
@@ -143,7 +145,7 @@ int jfs_permission(struct inode *inode, int mask, struct nameidata *nd)
 	return generic_permission(inode, mask, jfs_check_acl);
 }
 
-int jfs_init_acl(struct inode *inode, struct inode *dir)
+int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
 {
 	struct posix_acl *acl = NULL;
 	struct posix_acl *clone;
@@ -159,7 +161,7 @@ int jfs_init_acl(struct inode *inode, struct inode *dir)
 
 	if (acl) {
 		if (S_ISDIR(inode->i_mode)) {
-			rc = jfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+			rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl);
 			if (rc)
 				goto cleanup;
 		}
@@ -173,7 +175,8 @@ int jfs_init_acl(struct inode *inode, struct inode *dir)
 		if (rc >= 0) {
 			inode->i_mode = mode;
 			if (rc > 0)
-				rc = jfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+				rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS,
+						 clone);
 		}
 		posix_acl_release(clone);
 cleanup:
@@ -202,8 +205,15 @@ static int jfs_acl_chmod(struct inode *inode)
 		return -ENOMEM;
 
 	rc = posix_acl_chmod_masq(clone, inode->i_mode);
-	if (!rc)
-		rc = jfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+	if (!rc) {
+		tid_t tid = txBegin(inode->i_sb, 0);
+		down(&JFS_IP(inode)->commit_sem);
+		rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, clone);
+		if (!rc)
+			rc = txCommit(tid, 1, &inode, 0);
+		txEnd(tid);
+		up(&JFS_IP(inode)->commit_sem);
+	}
 
 	posix_acl_release(clone);
 	return rc;
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index a3acd3eec05..a76293767c7 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -21,8 +21,16 @@
 #ifdef CONFIG_JFS_POSIX_ACL
 
 int jfs_permission(struct inode *, int, struct nameidata *);
-int jfs_init_acl(struct inode *, struct inode *);
+int jfs_init_acl(tid_t, struct inode *, struct inode *);
 int jfs_setattr(struct dentry *, struct iattr *);
 
-#endif		/* CONFIG_JFS_POSIX_ACL */
+#else
+
+static inline int jfs_init_acl(tid_t tid, struct inode *inode,
+			       struct inode *dir)
+{
+	return 0;
+}
+
+#endif
 #endif		/* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index a1052f3f0be..116a73ce307 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -52,8 +52,8 @@ struct jfs_ea_list {
 #define	END_EALIST(ealist) \
 	((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist)))
 
-extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t,
-			  int);
+extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *,
+			  size_t, int);
 extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
 			int);
 extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 49ccde3937f..f23f9c2aa52 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -38,6 +38,24 @@ struct dentry_operations jfs_ci_dentry_operations;
 
 static s64 commitZeroLink(tid_t, struct inode *);
 
+/*
+ * NAME:	free_ea_wmap(inode)
+ *
+ * FUNCTION:	free uncommitted extended attributes from working map 
+ *
+ */
+static inline void free_ea_wmap(struct inode *inode)
+{
+	dxd_t *ea = &JFS_IP(inode)->ea;
+
+	if (ea->flag & DXD_EXTENT) {
+		/* free EA pages from cache */
+		invalidate_dxd_metapages(inode, *ea);
+		dbFree(inode, addressDXD(ea), lengthDXD(ea));
+	}
+	ea->flag = 0;
+}
+
 /*
  * NAME:	jfs_create(dip, dentry, mode)
  *
@@ -89,8 +107,13 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	down(&JFS_IP(dip)->commit_sem);
 	down(&JFS_IP(ip)->commit_sem);
 
+	rc = jfs_init_acl(tid, ip, dip);
+	if (rc)
+		goto out3;
+
 	if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) {
 		jfs_err("jfs_create: dtSearch returned %d", rc);
+		txAbort(tid, 0);
 		goto out3;
 	}
 
@@ -139,6 +162,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	up(&JFS_IP(dip)->commit_sem);
 	up(&JFS_IP(ip)->commit_sem);
 	if (rc) {
+		free_ea_wmap(ip);
 		ip->i_nlink = 0;
 		iput(ip);
 	} else
@@ -147,11 +171,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
       out2:
 	free_UCSname(&dname);
 
-#ifdef CONFIG_JFS_POSIX_ACL
-	if (rc == 0)
-		jfs_init_acl(ip, dip);
-#endif
-
       out1:
 
 	jfs_info("jfs_create: rc:%d", rc);
@@ -216,8 +235,13 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	down(&JFS_IP(dip)->commit_sem);
 	down(&JFS_IP(ip)->commit_sem);
 
+	rc = jfs_init_acl(tid, ip, dip);
+	if (rc)
+		goto out3;
+
 	if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) {
 		jfs_err("jfs_mkdir: dtSearch returned %d", rc);
+		txAbort(tid, 0);
 		goto out3;
 	}
 
@@ -267,6 +291,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	up(&JFS_IP(dip)->commit_sem);
 	up(&JFS_IP(ip)->commit_sem);
 	if (rc) {
+		free_ea_wmap(ip);
 		ip->i_nlink = 0;
 		iput(ip);
 	} else
@@ -275,10 +300,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
       out2:
 	free_UCSname(&dname);
 
-#ifdef CONFIG_JFS_POSIX_ACL
-	if (rc == 0)
-		jfs_init_acl(ip, dip);
-#endif
 
       out1:
 
@@ -1000,6 +1021,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	up(&JFS_IP(dip)->commit_sem);
 	up(&JFS_IP(ip)->commit_sem);
 	if (rc) {
+		free_ea_wmap(ip);
 		ip->i_nlink = 0;
 		iput(ip);
 	} else
@@ -1008,11 +1030,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
       out2:
 	free_UCSname(&dname);
 
-#ifdef CONFIG_JFS_POSIX_ACL
-	if (rc == 0)
-		jfs_init_acl(ip, dip);
-#endif
-
       out1:
 	jfs_info("jfs_symlink: rc:%d", rc);
 	return rc;
@@ -1328,17 +1345,25 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	down(&JFS_IP(dir)->commit_sem);
 	down(&JFS_IP(ip)->commit_sem);
 
-	if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE)))
+	rc = jfs_init_acl(tid, ip, dir);
+	if (rc)
 		goto out3;
 
+	if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
 	tblk = tid_to_tblock(tid);
 	tblk->xflag |= COMMIT_CREATE;
 	tblk->ino = ip->i_ino;
 	tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
 
 	ino = ip->i_ino;
-	if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack)))
+	if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) {
+		txAbort(tid, 0);
 		goto out3;
+	}
 
 	ip->i_op = &jfs_file_inode_operations;
 	jfs_ip->dev = new_encode_dev(rdev);
@@ -1360,6 +1385,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	up(&JFS_IP(ip)->commit_sem);
 	up(&JFS_IP(dir)->commit_sem);
 	if (rc) {
+		free_ea_wmap(ip);
 		ip->i_nlink = 0;
 		iput(ip);
 	} else
@@ -1368,11 +1394,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
       out1:
 	free_UCSname(&dname);
 
-#ifdef CONFIG_JFS_POSIX_ACL
-	if (rc == 0)
-		jfs_init_acl(ip, dir);
-#endif
-
       out:
 	jfs_info("jfs_mknod: returning %d", rc);
 	return rc;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 554ec739e49..35674b2a0e6 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -633,12 +633,12 @@ static void ea_release(struct inode *inode, struct ea_buffer *ea_buf)
 	}
 }
 
-static int ea_put(struct inode *inode, struct ea_buffer *ea_buf, int new_size)
+static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
+		  int new_size)
 {
 	struct jfs_inode_info *ji = JFS_IP(inode);
 	unsigned long old_blocks, new_blocks;
 	int rc = 0;
-	tid_t tid;
 
 	if (new_size == 0) {
 		ea_release(inode, ea_buf);
@@ -664,9 +664,6 @@ static int ea_put(struct inode *inode, struct ea_buffer *ea_buf, int new_size)
 	if (rc)
 		return rc;
 
-	tid = txBegin(inode->i_sb, 0);
-	down(&ji->commit_sem);
-
 	old_blocks = new_blocks = 0;
 
 	if (ji->ea.flag & DXD_EXTENT) {
@@ -695,11 +692,8 @@ static int ea_put(struct inode *inode, struct ea_buffer *ea_buf, int new_size)
 		DQUOT_FREE_BLOCK(inode, old_blocks);
 
 	inode->i_ctime = CURRENT_TIME;
-	rc = txCommit(tid, 1, &inode, 0);
-	txEnd(tid);
-	up(&ji->commit_sem);
 
-	return rc;
+	return 0;
 }
 
 /*
@@ -810,8 +804,8 @@ static int can_set_xattr(struct inode *inode, const char *name,
 	return permission(inode, MAY_WRITE, NULL);
 }
 
-int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
-		   size_t value_len, int flags)
+int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
+		   const void *value, size_t value_len, int flags)
 {
 	struct jfs_ea_list *ealist;
 	struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL;
@@ -825,9 +819,6 @@ int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
 	int rc;
 	int length;
 
-	if ((rc = can_set_xattr(inode, name, value, value_len)))
-		return rc;
-
 	if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
 		os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
 				  GFP_KERNEL);
@@ -939,7 +930,7 @@ int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
 
 	ealist->size = cpu_to_le32(new_size);
 
-	rc = ea_put(inode, &ea_buf, new_size);
+	rc = ea_put(tid, inode, &ea_buf, new_size);
 
 	goto out;
       release:
@@ -955,12 +946,29 @@ int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
 int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		 size_t value_len, int flags)
 {
+	struct inode *inode = dentry->d_inode;
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	int rc;
+	tid_t tid;
+
+	if ((rc = can_set_xattr(inode, name, value, value_len)))
+		return rc;
+
 	if (value == NULL) {	/* empty EA, do not remove */
 		value = "";
 		value_len = 0;
 	}
 
-	return __jfs_setxattr(dentry->d_inode, name, value, value_len, flags);
+	tid = txBegin(inode->i_sb, 0);
+	down(&ji->commit_sem);
+	rc = __jfs_setxattr(tid, dentry->d_inode, name, value, value_len,
+			    flags);
+	if (!rc)
+		rc = txCommit(tid, 1, &inode, 0);
+	txEnd(tid);
+	up(&ji->commit_sem);
+
+	return rc;
 }
 
 static int can_get_xattr(struct inode *inode, const char *name)
@@ -1122,5 +1130,21 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
 
 int jfs_removexattr(struct dentry *dentry, const char *name)
 {
-	return __jfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
+	struct inode *inode = dentry->d_inode;
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	int rc;
+	tid_t tid;
+
+	if ((rc = can_set_xattr(inode, name, NULL, 0)))
+		return rc;
+
+	tid = txBegin(inode->i_sb, 0);
+	down(&ji->commit_sem);
+	rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
+	if (!rc)
+		rc = txCommit(tid, 1, &inode, 0);
+	txEnd(tid);
+	up(&ji->commit_sem);
+
+	return rc;
 }
-- 
cgit v1.2.3-70-g09d2


From 1d15b10f95d4c4295a0f2288c7be7b6a005490da Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@austin.ibm.com>
Date: Thu, 1 Sep 2005 09:05:39 -0500
Subject: JFS: Implement jfs_init_security

This atomically initializes the security xattr when an object is created

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
---
 fs/jfs/jfs_xattr.h | 10 ++++++++++
 fs/jfs/namei.c     | 22 ++++++++++++++++++++++
 fs/jfs/xattr.c     | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)

(limited to 'fs')

diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 116a73ce307..25e9990bccd 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -61,4 +61,14 @@ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
 extern int jfs_removexattr(struct dentry *, const char *);
 
+#ifdef CONFIG_JFS_SECURITY
+extern int jfs_init_security(tid_t, struct inode *, struct inode *);
+#else
+static inline int jfs_init_security(tid_t tid, struct inode *inode,
+				    struct inode *dir)
+{
+	return 0;
+}
+#endif
+
 #endif	/* H_JFS_XATTR */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index f23f9c2aa52..1abe7343f92 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -111,6 +111,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	if (rc)
 		goto out3;
 
+	rc = jfs_init_security(tid, ip, dip);
+	if (rc) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
 	if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) {
 		jfs_err("jfs_create: dtSearch returned %d", rc);
 		txAbort(tid, 0);
@@ -239,6 +245,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	if (rc)
 		goto out3;
 
+	rc = jfs_init_security(tid, ip, dip);
+	if (rc) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
 	if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) {
 		jfs_err("jfs_mkdir: dtSearch returned %d", rc);
 		txAbort(tid, 0);
@@ -906,6 +918,10 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	down(&JFS_IP(dip)->commit_sem);
 	down(&JFS_IP(ip)->commit_sem);
 
+	rc = jfs_init_security(tid, ip, dip);
+	if (rc)
+		goto out3;
+
 	tblk = tid_to_tblock(tid);
 	tblk->xflag |= COMMIT_CREATE;
 	tblk->ino = ip->i_ino;
@@ -1349,6 +1365,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (rc)
 		goto out3;
 
+	rc = jfs_init_security(tid, ip, dir);
+	if (rc) {
+		txAbort(tid, 0);
+		goto out3;
+	}
+
 	if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) {
 		txAbort(tid, 0);
 		goto out3;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 35674b2a0e6..23aa5066b5a 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/quotaops.h>
+#include <linux/security.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_dmap.h"
@@ -1148,3 +1149,38 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
 
 	return rc;
 }
+
+#ifdef CONFIG_JFS_SECURITY
+int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir)
+{
+	int rc;
+	size_t len;
+	void *value;
+	char *suffix;
+	char *name;
+
+	rc = security_inode_init_security(inode, dir, &suffix, &value, &len);
+	if (rc) {
+		if (rc == -EOPNOTSUPP)
+			return 0;
+		return rc;
+	}
+	name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix),
+		       GFP_NOFS);
+	if (!name) {
+		rc = -ENOMEM;
+		goto kmalloc_failed;
+	}
+	strcpy(name, XATTR_SECURITY_PREFIX);
+	strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
+
+	rc = __jfs_setxattr(tid, inode, name, value, len, 0);
+
+	kfree(name);
+kmalloc_failed:
+	kfree(suffix);
+	kfree(value);
+
+	return rc;
+}
+#endif
-- 
cgit v1.2.3-70-g09d2


From f016bad6be720496b5582a59738bca00a26f876c Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Thu, 8 Sep 2005 15:30:05 +1000
Subject: [XFS] Cleanup some -Wundef flag warnings in the endian macros (thanks
 Christoph).

SGI-PV: 942400
SGI-Modid: xfs-linux-melb:xfs-kern:23771a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_arch.h       | 22 ++++++++++++++--------
 fs/xfs/xfs_bmap_btree.c |  8 ++++----
 fs/xfs/xfs_bmap_btree.h | 12 ++++++------
 fs/xfs/xfs_dir_leaf.h   |  6 +++---
 fs/xfs/xfs_inode_item.c |  4 ++--
 fs/xfs/xfs_log_priv.h   | 10 +++-------
 6 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index ae35189b3d7..5ab0dd885b1 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -40,22 +40,28 @@
 
 #include <asm/byteorder.h>
 
-#ifdef __LITTLE_ENDIAN
-# define __BYTE_ORDER	__LITTLE_ENDIAN
-#endif
 #ifdef __BIG_ENDIAN
-# define __BYTE_ORDER	__BIG_ENDIAN
+#define	XFS_NATIVE_HOST	1
+#else
+#undef XFS_NATIVE_HOST
+#endif
+
+#else /* __KERNEL__ */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define	XFS_NATIVE_HOST	1
+#else
+#undef XFS_NATIVE_HOST
 #endif
 
 #endif	/* __KERNEL__ */
 
 /* do we need conversion? */
-
 #define ARCH_NOCONVERT 1
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-# define ARCH_CONVERT	0
-#else
+#ifdef XFS_NATIVE_HOST
 # define ARCH_CONVERT	ARCH_NOCONVERT
+#else
+# define ARCH_CONVERT	0
 #endif
 
 /* generic swapping macros */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 09c413576ba..09a77b17565 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -2017,7 +2017,7 @@ xfs_bmbt_get_state(
 				ext_flag);
 }
 
-#if __BYTE_ORDER != __BIG_ENDIAN
+#ifndef XFS_NATIVE_HOST
 /* Endian flipping versions of the bmbt extraction functions */
 void
 xfs_bmbt_disk_get_all(
@@ -2087,7 +2087,7 @@ xfs_bmbt_disk_get_state(
 	return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r),
 				ext_flag);
 }
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 
 /*
@@ -2531,7 +2531,7 @@ xfs_bmbt_set_allf(
 #endif	/* XFS_BIG_BLKNOS */
 }
 
-#if __BYTE_ORDER != __BIG_ENDIAN
+#ifndef XFS_NATIVE_HOST
 /*
  * Set all the fields in a bmap extent record from the uncompressed form.
  */
@@ -2617,7 +2617,7 @@ xfs_bmbt_disk_set_allf(
 	}
 #endif	/* XFS_BIG_BLKNOS */
 }
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 /*
  * Set the blockcount field in a bmap extent record.
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0a40cf126c2..2cf4fe45cbc 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -62,7 +62,7 @@ typedef struct xfs_bmdr_block
  *  l1:0-20 are blockcount.
  */
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#ifndef XFS_NATIVE_HOST
 
 #define BMBT_TOTAL_BITLEN	128	/* 128 bits, 16 bytes */
 #define BMBT_EXNTFLAG_BITOFF	0
@@ -87,7 +87,7 @@ typedef struct xfs_bmdr_block
 #define BMBT_BLOCKCOUNT_BITOFF	64 /* Start of second 64 bit container */
 #define BMBT_BLOCKCOUNT_BITLEN	21
 
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 
 #define BMBT_USE_64	1
@@ -505,7 +505,7 @@ xfs_exntst_t
 xfs_bmbt_get_state(
 	xfs_bmbt_rec_t	*r);
 
-#if __BYTE_ORDER != __BIG_ENDIAN
+#ifndef XFS_NATIVE_HOST
 void
 xfs_bmbt_disk_get_all(
 	xfs_bmbt_rec_t	*r,
@@ -538,7 +538,7 @@ xfs_bmbt_disk_get_startoff(
 	xfs_bmbt_get_blockcount(r)
 #define xfs_bmbt_disk_get_startoff(r) \
 	xfs_bmbt_get_startoff(r)
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 int
 xfs_bmbt_increment(
@@ -623,7 +623,7 @@ xfs_bmbt_set_state(
 	xfs_bmbt_rec_t	*r,
 	xfs_exntst_t	v);
 
-#if __BYTE_ORDER != __BIG_ENDIAN
+#ifndef XFS_NATIVE_HOST
 void
 xfs_bmbt_disk_set_all(
 	xfs_bmbt_rec_t	*r,
@@ -641,7 +641,7 @@ xfs_bmbt_disk_set_allf(
 	xfs_bmbt_set_all(r, s)
 #define xfs_bmbt_disk_set_allf(r, o, b, c, v) \
 	xfs_bmbt_set_allf(r, o, b, c, v)
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 void
 xfs_bmbt_to_bmdr(
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
index dd423ce1bc8..480bffc1f29 100644
--- a/fs/xfs/xfs_dir_leaf.h
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -127,13 +127,13 @@ typedef union {
 	 * Watch the order here (endian-ness dependent).
 	 */
 	struct {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#ifndef XFS_NATIVE_HOST
 		xfs_dahash_t	h;	/* hash value */
 		__uint32_t	be;	/* block and entry */
-#else	/* __BYTE_ORDER == __BIG_ENDIAN */
+#else
 		__uint32_t	be;	/* block and entry */
 		xfs_dahash_t	h;	/* hash value */
-#endif	/* __BYTE_ORDER == __BIG_ENDIAN */
+#endif /* XFS_NATIVE_HOST */
 	} s;
 } xfs_dircook_t;
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 276ec70eb7f..50e2cadf909 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -341,7 +341,7 @@ xfs_inode_item_format(
 			nrecs = ip->i_df.if_bytes /
 				(uint)sizeof(xfs_bmbt_rec_t);
 			ASSERT(nrecs > 0);
-#if __BYTE_ORDER == __BIG_ENDIAN
+#ifdef XFS_NATIVE_HOST
 			if (nrecs == ip->i_d.di_nextents) {
 				/*
 				 * There are no delayed allocation
@@ -473,7 +473,7 @@ xfs_inode_item_format(
 #endif
 			ASSERT(nrecs > 0);
 			ASSERT(nrecs == ip->i_d.di_anextents);
-#if __BYTE_ORDER == __BIG_ENDIAN
+#ifdef XFS_NATIVE_HOST
 			/*
 			 * There are not delayed allocation extents
 			 * for attributes, so just point at the array.
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index eb7fdc6ebc3..a884cea82fc 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -112,7 +112,7 @@ struct xfs_mount;
  * this has endian issues, of course.
  */
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#ifndef XFS_NATIVE_HOST
 #define GET_CLIENT_ID(i,arch) \
     ((i) & 0xff)
 #else
@@ -414,14 +414,10 @@ typedef struct xlog_op_header {
 #define XLOG_FMT_IRIX_BE  3
 
 /* our fmt */
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define XLOG_FMT XLOG_FMT_LINUX_LE
-#else
-#if __BYTE_ORDER == __BIG_ENDIAN
+#ifdef XFS_NATIVE_HOST
 #define XLOG_FMT XLOG_FMT_LINUX_BE
 #else
-#error unknown byte order
-#endif
+#define XLOG_FMT XLOG_FMT_LINUX_LE
 #endif
 
 typedef struct xlog_rec_header {
-- 
cgit v1.2.3-70-g09d2


From 20ba02879bc78cdf1ed89a1c6a92ee55d31ee103 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@bruce>
Date: Thu, 8 Sep 2005 15:34:58 +1000
Subject: [XFS] Remove special Kconfig XFS menu, make XFS options "inline".

Signed-off-by: Eric Sandeen <sandeen@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/Kconfig | 45 +++++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index c92306f0fdc..8e8f32dabe5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,5 +1,3 @@
-menu "XFS support"
-
 config XFS_FS
 	tristate "XFS filesystem support"
 	select EXPORTFS if NFSD!=n
@@ -22,27 +20,11 @@ config XFS_FS
 
 config XFS_EXPORT
 	bool
-	default y if XFS_FS && EXPORTFS
-
-config XFS_RT
-	bool "Realtime support (EXPERIMENTAL)"
-	depends on XFS_FS && EXPERIMENTAL
-	help
-	  If you say Y here you will be able to mount and use XFS filesystems
-	  which contain a realtime subvolume. The realtime subvolume is a
-	  separate area of disk space where only file data is stored. The
-	  realtime subvolume is designed to provide very deterministic
-	  data rates suitable for media streaming applications.
-
-	  See the xfs man page in section 5 for a bit more information.
-
-	  This feature is unsupported at this time, is not yet fully
-	  functional, and may cause serious problems.
-
-	  If unsure, say N.
+	depends on XFS_FS && EXPORTFS
+	default y
 
 config XFS_QUOTA
-	bool "Quota support"
+	tristate "XFS Quota support"
 	depends on XFS_FS
 	help
 	  If you say Y here, you will be able to set limits for disk usage on
@@ -59,7 +41,7 @@ config XFS_QUOTA
 	  they are completely independent subsystems.
 
 config XFS_SECURITY
-	bool "Security Label support"
+	bool "XFS Security Label support"
 	depends on XFS_FS
 	help
 	  Security labels support alternative access control models
@@ -71,7 +53,7 @@ config XFS_SECURITY
 	  extended attributes for inode security labels, say N.
 
 config XFS_POSIX_ACL
-	bool "POSIX ACL support"
+	bool "XFS POSIX ACL support"
 	depends on XFS_FS
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -82,4 +64,19 @@ config XFS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N.
 
-endmenu
+config XFS_RT
+	bool "XFS Realtime support (EXPERIMENTAL)"
+	depends on XFS_FS && EXPERIMENTAL
+	help
+	  If you say Y here you will be able to mount and use XFS filesystems
+	  which contain a realtime subvolume. The realtime subvolume is a
+	  separate area of disk space where only file data is stored. The
+	  realtime subvolume is designed to provide very deterministic
+	  data rates suitable for media streaming applications.
+
+	  See the xfs man page in section 5 for a bit more information.
+
+	  This feature is unsupported at this time, is not yet fully
+	  functional, and may cause serious problems.
+
+	  If unsure, say N.
-- 
cgit v1.2.3-70-g09d2


From eccdfcd6f8265300380fa14a83aeb14e69830323 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@bruce>
Date: Thu, 8 Sep 2005 15:38:52 +1000
Subject: [XFS] Fix modular XFS builds (Makefile botch).

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/Makefile-linux-2.6       | 2 +-
 fs/xfs/quota/Makefile-linux-2.6 | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index fbfcbe5a7cd..8e18ff15724 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -55,7 +55,7 @@ ifeq ($(CONFIG_XFS_TRACE),y)
 endif
 
 obj-$(CONFIG_XFS_FS)		+= xfs.o
-obj-$(CONFIG_XFS_QUOTA)		+= quota/
+xfs-$(CONFIG_XFS_QUOTA)		+= quota/
 
 xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
diff --git a/fs/xfs/quota/Makefile-linux-2.6 b/fs/xfs/quota/Makefile-linux-2.6
index 8b7b676718b..93e60e83935 100644
--- a/fs/xfs/quota/Makefile-linux-2.6
+++ b/fs/xfs/quota/Makefile-linux-2.6
@@ -41,13 +41,13 @@ ifeq ($(CONFIG_XFS_TRACE),y)
 	EXTRA_CFLAGS += -DXFS_VNODE_TRACE
 endif
 
-obj-$(CONFIG_XFS_QUOTA)		+= xfs_quota.o
-
-xfs_quota-y			+= xfs_dquot.o \
+xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
 				   xfs_dquot_item.o \
 				   xfs_trans_dquot.o \
 				   xfs_qm_syscalls.o \
 				   xfs_qm_bhv.o \
 				   xfs_qm.o
 
-xfs_quota-$(CONFIG_PROC_FS)	+= xfs_qm_stats.o
+ifeq ($(CONFIG_XFS_QUOTA),y)
+xfs-$(CONFIG_PROC_FS)		+= xfs_qm_stats.o
+endif
-- 
cgit v1.2.3-70-g09d2


From e7a1033b946f4f2622f2b338ab107f559aad542c Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 16:12:28 +0100
Subject: NTFS: Support more clean journal ($LogFile) states.

      - Support journals ($LogFile) which have been modified by chkdsk.  This
        means users can boot into Windows after we marked the volume dirty.
        The Windows boot will run chkdsk and then reboot.  The user can then
        immediately boot into Linux rather than having to do a full Windows
        boot first before rebooting into Linux and we will recognize such a
        journal and empty it as it is clean by definition.
      - Support journals ($LogFile) with only one restart page as well as
        journals with two different restart pages.  We sanity check both and
        either use the only sane one or the more recent one of the two in the
        case that both are valid.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  13 +++
 fs/ntfs/Makefile  |   2 +-
 fs/ntfs/logfile.c | 251 +++++++++++++++++++++++++++++-------------------------
 fs/ntfs/logfile.h |   8 +-
 fs/ntfs/super.c   |  16 +++-
 5 files changed, 167 insertions(+), 123 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 9eecc9939df..cceec3e9e2d 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -22,6 +22,19 @@ ToDo/Notes:
 	- Enable the code for setting the NT4 compatibility flag when we start
 	  making NTFS 1.2 specific modifications.
 
+2.1.24-WIP
+
+	- Support journals ($LogFile) which have been modified by chkdsk.  This
+	  means users can boot into Windows after we marked the volume dirty.
+	  The Windows boot will run chkdsk and then reboot.  The user can then
+	  immediately boot into Linux rather than having to do a full Windows
+	  boot first before rebooting into Linux and we will recognize such a
+	  journal and empty it as it is clean by definition.
+	- Support journals ($LogFile) with only one restart page as well as
+	  journals with two different restart pages.  We sanity check both and
+	  either use the only sane one or the more recent one of the two in the
+	  case that both are valid.
+
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
 
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index f083f27d8b6..ce970dacf90 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
 	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
 	     unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.23\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.24-WIP\"
 
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 8edb8e20fb0..0173e95500d 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -121,7 +121,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 	 */
 	if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
 		ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
-				"chkdsk but a chkdsk LSN is specified.");
+				"by chkdsk but a chkdsk LSN is specified.");
 		return FALSE;
 	}
 	ntfs_debug("Done.");
@@ -312,10 +312,12 @@ err_out:
  * @vi:		$LogFile inode to which the restart page belongs
  * @rp:		restart page to check
  * @pos:	position in @vi at which the restart page resides
- * @wrp:	copy of the multi sector transfer deprotected restart page
+ * @wrp:	[OUT] copy of the multi sector transfer deprotected restart page
+ * @lsn:	[OUT] set to the current logfile lsn on success
  *
- * Check the restart page @rp for consistency and return TRUE if it is
- * consistent and FALSE otherwise.
+ * Check the restart page @rp for consistency and return 0 if it is consistent
+ * and -errno otherwise.  The restart page may have been modified by chkdsk in
+ * which case its magic is CHKD instead of RSTR.
  *
  * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
  * require the full restart page.
@@ -323,25 +325,33 @@ err_out:
  * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
  * copy of the complete multi sector transfer deprotected page.  On failure,
  * *@wrp is undefined.
+ *
+ * Simillarly, if @lsn is not NULL, on succes *@lsn will be set to the current
+ * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
+ *
+ * The following error codes are defined:
+ *	-EINVAL	- The restart page is inconsistent.
+ *	-ENOMEM	- Not enough memory to load the restart page.
+ *	-EIO	- Failed to reading from $LogFile.
  */
-static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
-		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp)
+static int ntfs_check_and_load_restart_page(struct inode *vi,
+		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
+		LSN *lsn)
 {
 	RESTART_AREA *ra;
 	RESTART_PAGE_HEADER *trp;
-	int size;
-	BOOL ret;
+	int size, err;
 
 	ntfs_debug("Entering.");
 	/* Check the restart page header for consistency. */
 	if (!ntfs_check_restart_page_header(vi, rp, pos)) {
 		/* Error output already done inside the function. */
-		return FALSE;
+		return -EINVAL;
 	}
 	/* Check the restart area for consistency. */
 	if (!ntfs_check_restart_area(vi, rp)) {
 		/* Error output already done inside the function. */
-		return FALSE;
+		return -EINVAL;
 	}
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
 	/*
@@ -352,7 +362,7 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 	if (!trp) {
 		ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
 				"restart page buffer.");
-		return FALSE;
+		return -ENOMEM;
 	}
 	/*
 	 * Read the whole of the restart page into the buffer.  If it fits
@@ -379,6 +389,9 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 			if (IS_ERR(page)) {
 				ntfs_error(vi->i_sb, "Error mapping $LogFile "
 						"page (index %lu).", idx);
+				err = PTR_ERR(page);
+				if (err != -EIO && err != -ENOMEM)
+					err = -EIO;
 				goto err_out;
 			}
 			size = min_t(int, to_read, PAGE_CACHE_SIZE);
@@ -392,29 +405,57 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 	/* Perform the multi sector transfer deprotection on the buffer. */
 	if (post_read_mst_fixup((NTFS_RECORD*)trp,
 			le32_to_cpu(rp->system_page_size))) {
-		ntfs_error(vi->i_sb, "Multi sector transfer error detected in "
-				"$LogFile restart page.");
-		goto err_out;
+		/*
+		 * A multi sector tranfer error was detected.  We only need to
+		 * abort if the restart page contents exceed the multi sector
+		 * transfer fixup of the first sector.
+		 */
+		if (le16_to_cpu(rp->restart_area_offset) +
+				le16_to_cpu(ra->restart_area_length) >
+				NTFS_BLOCK_SIZE - sizeof(u16)) {
+			ntfs_error(vi->i_sb, "Multi sector transfer error "
+					"detected in $LogFile restart page.");
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	/*
+	 * If the restart page is modified by chkdsk or there are no active
+	 * logfile clients, the logfile is consistent.  Otherwise, need to
+	 * check the log client records for consistency, too.
+	 */
+	err = 0;
+	if (ntfs_is_rstr_record(rp->magic) &&
+			ra->client_in_use_list != LOGFILE_NO_CLIENT) {
+		if (!ntfs_check_log_client_array(vi, trp)) {
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	if (lsn) {
+		if (ntfs_is_rstr_record(rp->magic))
+			*lsn = sle64_to_cpu(ra->current_lsn);
+		else /* if (ntfs_is_chkd_record(rp->magic)) */
+			*lsn = sle64_to_cpu(rp->chkdsk_lsn);
 	}
-	/* Check the log client records for consistency. */
-	ret = ntfs_check_log_client_array(vi, trp);
-	if (ret && wrp)
-		*wrp = trp;
-	else
-		ntfs_free(trp);
 	ntfs_debug("Done.");
-	return ret;
+	if (wrp)
+		*wrp = trp;
+	else {
 err_out:
-	ntfs_free(trp);
-	return FALSE;
+		ntfs_free(trp);
+	}
+	return err;
 }
 
 /**
  * ntfs_check_logfile - check the journal for consistency
  * @log_vi:	struct inode of loaded journal $LogFile to check
+ * @rp:		[OUT] on success this is a copy of the current restart page
  *
  * Check the $LogFile journal for consistency and return TRUE if it is
- * consistent and FALSE if not.
+ * consistent and FALSE if not.  On success, the current restart page is
+ * returned in *@rp.  Caller must call ntfs_free(*@rp) when finished with it.
  *
  * At present we only check the two restart pages and ignore the log record
  * pages.
@@ -424,19 +465,18 @@ err_out:
  * if the $LogFile was created on a system with a different page size to ours
  * yet and mst deprotection would fail if our page size is smaller.
  */
-BOOL ntfs_check_logfile(struct inode *log_vi)
+BOOL ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 {
-	s64 size, pos, rstr1_pos, rstr2_pos;
+	s64 size, pos;
+	LSN rstr1_lsn, rstr2_lsn;
 	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
 	struct address_space *mapping = log_vi->i_mapping;
 	struct page *page = NULL;
 	u8 *kaddr = NULL;
 	RESTART_PAGE_HEADER *rstr1_ph = NULL;
 	RESTART_PAGE_HEADER *rstr2_ph = NULL;
-	int log_page_size, log_page_mask, ofs;
+	int log_page_size, log_page_mask, err;
 	BOOL logfile_is_empty = TRUE;
-	BOOL rstr1_found = FALSE;
-	BOOL rstr2_found = FALSE;
 	u8 log_page_bits;
 
 	ntfs_debug("Entering.");
@@ -491,7 +531,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 			if (IS_ERR(page)) {
 				ntfs_error(vol->sb, "Error mapping $LogFile "
 						"page (index %lu).", idx);
-				return FALSE;
+				goto err_out;
 			}
 		}
 		kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
@@ -510,99 +550,95 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 		 */
 		if (ntfs_is_rcrd_recordp((le32*)kaddr))
 			break;
-		/*
-		 * A modified by chkdsk restart page means we cannot handle
-		 * this log file.
-		 */
-		if (ntfs_is_chkd_recordp((le32*)kaddr)) {
-			ntfs_error(vol->sb, "$LogFile has been modified by "
-					"chkdsk.  Mount this volume in "
-					"Windows.");
-			goto err_out;
-		}
-		/* If not a restart page, continue. */
-		if (!ntfs_is_rstr_recordp((le32*)kaddr)) {
-			/* Skip to the minimum page size for the next one. */
+		/* If not a (modified by chkdsk) restart page, continue. */
+		if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
+				!ntfs_is_chkd_recordp((le32*)kaddr)) {
 			if (!pos)
 				pos = NTFS_BLOCK_SIZE >> 1;
 			continue;
 		}
-		/* We now know we have a restart page. */
-		if (!pos) {
-			rstr1_found = TRUE;
-			rstr1_pos = pos;
-		} else {
-			if (rstr2_found) {
-				ntfs_error(vol->sb, "Found more than two "
-						"restart pages in $LogFile.");
-				goto err_out;
-			}
-			rstr2_found = TRUE;
-			rstr2_pos = pos;
-		}
 		/*
-		 * Check the restart page for consistency and get a copy of the
-		 * complete multi sector transfer deprotected restart page.
+		 * Check the (modified by chkdsk) restart page for consistency
+		 * and get a copy of the complete multi sector transfer
+		 * deprotected restart page.
 		 */
-		if (!ntfs_check_and_load_restart_page(log_vi,
+		err = ntfs_check_and_load_restart_page(log_vi,
 				(RESTART_PAGE_HEADER*)kaddr, pos,
-				!pos ? &rstr1_ph : &rstr2_ph)) {
-			/* Error output already done inside the function. */
-			goto err_out;
+				!rstr1_ph ? &rstr1_ph : &rstr2_ph,
+				!rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
+		if (!err) {
+			/*
+			 * If we have now found the first (modified by chkdsk)
+			 * restart page, continue looking for the second one.
+			 */
+			if (!pos) {
+				pos = NTFS_BLOCK_SIZE >> 1;
+				continue;
+			}
+			/*
+			 * We have now found the second (modified by chkdsk)
+			 * restart page, so we can stop looking.
+			 */
+			break;
 		}
 		/*
-		 * We have a valid restart page.  The next one must be after
-		 * a whole system page size as specified by the valid restart
-		 * page.
+		 * Error output already done inside the function.  Note, we do
+		 * not abort if the restart page was invalid as we might still
+		 * find a valid one further in the file.
 		 */
+		if (err != -EINVAL) {
+			ntfs_unmap_page(page);
+			goto err_out;
+		}
+		/* Continue looking. */
 		if (!pos)
-			pos = le32_to_cpu(rstr1_ph->system_page_size) >> 1;
+			pos = NTFS_BLOCK_SIZE >> 1;
 	}
-	if (page) {
+	if (page)
 		ntfs_unmap_page(page);
-		page = NULL;
-	}
 	if (logfile_is_empty) {
 		NVolSetLogFileEmpty(vol);
 is_empty:
 		ntfs_debug("Done.  ($LogFile is empty.)");
 		return TRUE;
 	}
-	if (!rstr1_found || !rstr2_found) {
-		ntfs_error(vol->sb, "Did not find two restart pages in "
-				"$LogFile.");
-		goto err_out;
+	if (!rstr1_ph) {
+		BUG_ON(rstr2_ph);
+		ntfs_error(vol->sb, "Did not find any restart pages in "
+				"$LogFile and it was not empty.");
+		return FALSE;
+	}
+	/* If both restart pages were found, use the more recent one. */
+	if (rstr2_ph) {
+		/*
+		 * If the second restart area is more recent, switch to it.
+		 * Otherwise just throw it away.
+		 */
+		if (rstr2_lsn > rstr1_lsn) {
+			ntfs_free(rstr1_ph);
+			rstr1_ph = rstr2_ph;
+			/* rstr1_lsn = rstr2_lsn; */
+		} else
+			ntfs_free(rstr2_ph);
+		rstr2_ph = NULL;
 	}
-	/*
-	 * The two restart areas must be identical except for the update
-	 * sequence number.
-	 */
-	ofs = le16_to_cpu(rstr1_ph->usa_ofs);
-	if (memcmp(rstr1_ph, rstr2_ph, ofs) || (ofs += sizeof(u16),
-			memcmp((u8*)rstr1_ph + ofs, (u8*)rstr2_ph + ofs,
-			le32_to_cpu(rstr1_ph->system_page_size) - ofs))) {
-		ntfs_error(vol->sb, "The two restart pages in $LogFile do not "
-				"match.");
-		goto err_out;
-	}
-	ntfs_free(rstr1_ph);
-	ntfs_free(rstr2_ph);
 	/* All consistency checks passed. */
+	if (rp)
+		*rp = rstr1_ph;
+	else
+		ntfs_free(rstr1_ph);
 	ntfs_debug("Done.");
 	return TRUE;
 err_out:
-	if (page)
-		ntfs_unmap_page(page);
 	if (rstr1_ph)
 		ntfs_free(rstr1_ph);
-	if (rstr2_ph)
-		ntfs_free(rstr2_ph);
 	return FALSE;
 }
 
 /**
  * ntfs_is_logfile_clean - check in the journal if the volume is clean
  * @log_vi:	struct inode of loaded journal $LogFile to check
+ * @rp:		copy of the current restart page
  *
  * Analyze the $LogFile journal and return TRUE if it indicates the volume was
  * shutdown cleanly and FALSE if not.
@@ -619,11 +655,9 @@ err_out:
  * is empty this function requires that NVolLogFileEmpty() is true otherwise an
  * empty volume will be reported as dirty.
  */
-BOOL ntfs_is_logfile_clean(struct inode *log_vi)
+BOOL ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
 {
 	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-	struct page *page;
-	RESTART_PAGE_HEADER *rp;
 	RESTART_AREA *ra;
 
 	ntfs_debug("Entering.");
@@ -632,24 +666,15 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi)
 		ntfs_debug("Done.  ($LogFile is empty.)");
 		return TRUE;
 	}
-	/*
-	 * Read the first restart page.  It will be possibly incomplete and
-	 * will not be multi sector transfer deprotected but we only need the
-	 * first NTFS_BLOCK_SIZE bytes so it does not matter.
-	 */
-	page = ntfs_map_page(log_vi->i_mapping, 0);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Error mapping $LogFile page (index 0).");
+	BUG_ON(!rp);
+	if (!ntfs_is_rstr_record(rp->magic) &&
+			!ntfs_is_chkd_record(rp->magic)) {
+		ntfs_error(vol->sb, "Restart page buffer is invalid.  This is "
+				"probably a bug in that the $LogFile should "
+				"have been consistency checked before calling "
+				"this function.");
 		return FALSE;
 	}
-	rp = (RESTART_PAGE_HEADER*)page_address(page);
-	if (!ntfs_is_rstr_record(rp->magic)) {
-		ntfs_error(vol->sb, "No restart page found at offset zero in "
-				"$LogFile.  This is probably a bug in that "
-				"the $LogFile should have been consistency "
-				"checked before calling this function.");
-		goto err_out;
-	}
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
 	/*
 	 * If the $LogFile has active clients, i.e. it is open, and we do not
@@ -659,15 +684,11 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi)
 	if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
 			!(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
 		ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
-		goto err_out;
+		return FALSE;
 	}
-	ntfs_unmap_page(page);
 	/* $LogFile indicates a clean shutdown. */
 	ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
 	return TRUE;
-err_out:
-	ntfs_unmap_page(page);
-	return FALSE;
 }
 
 /**
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 4ee4378de06..42388f95ea6 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -2,7 +2,7 @@
  * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
  *	       the Linux-NTFS project.
  *
- * Copyright (c) 2000-2004 Anton Altaparmakov
+ * Copyright (c) 2000-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -296,9 +296,11 @@ typedef struct {
 /* sizeof() = 160 (0xa0) bytes */
 } __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
 
-extern BOOL ntfs_check_logfile(struct inode *log_vi);
+extern BOOL ntfs_check_logfile(struct inode *log_vi,
+		RESTART_PAGE_HEADER **rp);
 
-extern BOOL ntfs_is_logfile_clean(struct inode *log_vi);
+extern BOOL ntfs_is_logfile_clean(struct inode *log_vi,
+		const RESTART_PAGE_HEADER *rp);
 
 extern BOOL ntfs_empty_logfile(struct inode *log_vi);
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 41aa8eb6755..bf8569d503a 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1133,7 +1133,8 @@ mft_unmap_out:
  *
  * Return TRUE on success or FALSE on error.
  */
-static BOOL load_and_check_logfile(ntfs_volume *vol)
+static BOOL load_and_check_logfile(ntfs_volume *vol,
+		RESTART_PAGE_HEADER **rp)
 {
 	struct inode *tmp_ino;
 
@@ -1145,7 +1146,7 @@ static BOOL load_and_check_logfile(ntfs_volume *vol)
 		/* Caller will display error message. */
 		return FALSE;
 	}
-	if (!ntfs_check_logfile(tmp_ino)) {
+	if (!ntfs_check_logfile(tmp_ino, rp)) {
 		iput(tmp_ino);
 		/* ntfs_check_logfile() will have displayed error output. */
 		return FALSE;
@@ -1687,6 +1688,7 @@ static BOOL load_system_files(ntfs_volume *vol)
 	struct super_block *sb = vol->sb;
 	MFT_RECORD *m;
 	VOLUME_INFORMATION *vi;
+	RESTART_PAGE_HEADER *rp;
 	ntfs_attr_search_ctx *ctx;
 #ifdef NTFS_RW
 	int err;
@@ -1841,8 +1843,9 @@ get_ctx_vol_failed:
 	 * Get the inode for the logfile, check it and determine if the volume
 	 * was shutdown cleanly.
 	 */
-	if (!load_and_check_logfile(vol) ||
-			!ntfs_is_logfile_clean(vol->logfile_ino)) {
+	rp = NULL;
+	if (!load_and_check_logfile(vol, &rp) ||
+			!ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
 		static const char *es1a = "Failed to load $LogFile";
 		static const char *es1b = "$LogFile is not clean";
 		static const char *es2 = ".  Mount in Windows.";
@@ -1857,6 +1860,10 @@ get_ctx_vol_failed:
 						"continue nor on_errors="
 						"remount-ro was specified%s",
 						es1, es2);
+				if (vol->logfile_ino) {
+					BUG_ON(!rp);
+					ntfs_free(rp);
+				}
 				goto iput_logfile_err_out;
 			}
 			sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
@@ -1867,6 +1874,7 @@ get_ctx_vol_failed:
 		/* This will prevent a read-write remount. */
 		NVolSetErrors(vol);
 	}
+	ntfs_free(rp);
 #endif /* NTFS_RW */
 	/* Get the root directory inode so we can do path lookups. */
 	vol->root_ino = ntfs_iget(sb, FILE_root);
-- 
cgit v1.2.3-70-g09d2


From 06d0e3cf3d527f927681773c6ffbe697ccc5db7f Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 16:28:25 +0100
Subject: NTFS: Allow highmem kmalloc() in ntfs_malloc_nofs() and add _nofail()
 version.

- Modify fs/ntfs/malloc.h::ntfs_malloc_nofs() to do the kmalloc() based
  allocations with __GFP_HIGHMEM, analogous to how the vmalloc() based
  allocations are done.
- Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
  ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
  hence cannot fail.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  6 ++++++
 fs/ntfs/malloc.h  | 48 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 48 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index cceec3e9e2d..c3bf17be0e1 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -34,6 +34,12 @@ ToDo/Notes:
 	  journals with two different restart pages.  We sanity check both and
 	  either use the only sane one or the more recent one of the two in the
 	  case that both are valid.
+	- Modify fs/ntfs/malloc.h::ntfs_malloc_nofs() to do the kmalloc() based
+	  allocations with __GFP_HIGHMEM, analogous to how the vmalloc() based
+	  allocations are done.
+	- Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
+	  ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
+	  hence cannot fail.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index fac5944df6d..9994e019a3c 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -27,27 +27,63 @@
 #include <linux/highmem.h>
 
 /**
- * ntfs_malloc_nofs - allocate memory in multiples of pages
- * @size	number of bytes to allocate
+ * __ntfs_malloc - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ * @gfp_mask:	extra flags for the allocator
+ *
+ * Internal function.  You probably want ntfs_malloc_nofs()...
  *
  * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
  * returns a pointer to the allocated memory.
  *
  * If there was insufficient memory to complete the request, return NULL.
+ * Depending on @gfp_mask the allocation may be guaranteed to succeed.
  */
-static inline void *ntfs_malloc_nofs(unsigned long size)
+static inline void *__ntfs_malloc(unsigned long size,
+		unsigned int __nocast gfp_mask)
 {
 	if (likely(size <= PAGE_SIZE)) {
 		BUG_ON(!size);
 		/* kmalloc() has per-CPU caches so is faster for now. */
-		return kmalloc(PAGE_SIZE, GFP_NOFS);
-		/* return (void *)__get_free_page(GFP_NOFS | __GFP_HIGHMEM); */
+		return kmalloc(PAGE_SIZE, gfp_mask);
+		/* return (void *)__get_free_page(gfp_mask); */
 	}
 	if (likely(size >> PAGE_SHIFT < num_physpages))
-		return __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
+		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
 	return NULL;
 }
 
+/**
+ * ntfs_malloc_nofs - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs(unsigned long size)
+{
+	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
+}
+
+/**
+ * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
+{
+	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
+}
+
 static inline void ntfs_free(void *addr)
 {
 	if (likely(((unsigned long)addr < VMALLOC_START) ||
-- 
cgit v1.2.3-70-g09d2


From 9529d461d0992959026264b8fc002ac01d226708 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 16:33:12 +0100
Subject: NTFS: Use ntfs_malloc_nofs_nofail() in
 runlist.c::ntfs_runlists_merge()       in the two critical regions.  This
 means we no longer need to       panic() when the allocation fails as it now
 cannot fail.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  3 +++
 fs/ntfs/runlist.c | 68 ++++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 53 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index c3bf17be0e1..c5bbedff5ae 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -40,6 +40,9 @@ ToDo/Notes:
 	- Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
 	  ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
 	  hence cannot fail.
+	- Use ntfs_malloc_nofs_nofail() in the two critical regions in
+	  fs/ntfs/runlist.c::ntfs_runlists_merge().  This means we no longer
+	  need to panic() if the allocation fails as it now cannot fail.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 758855b0414..3bb4a57d1fa 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -35,7 +35,7 @@ static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
 		int size)
 {
 	if (likely((dst != src) && (size > 0)))
-		memmove(base + dst, base + src, size * sizeof (*base));
+		memmove(base + dst, base + src, size * sizeof(*base));
 }
 
 /**
@@ -94,6 +94,51 @@ static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
 	return new_rl;
 }
 
+/**
+ * ntfs_rl_realloc_nofail - Reallocate memory for runlists
+ * @rl:		original runlist
+ * @old_size:	number of runlist elements in the original runlist @rl
+ * @new_size:	number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required.  To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B.  If the new allocation doesn't require a different number of pages in
+ *       memory, the function will return the original pointer.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * On error, return -errno. The following error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
+		int old_size, int new_size)
+{
+	runlist_element *new_rl;
+
+	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+	if (old_size == new_size)
+		return rl;
+
+	new_rl = ntfs_malloc_nofs_nofail(new_size);
+	BUG_ON(!new_rl);
+
+	if (likely(rl != NULL)) {
+		if (unlikely(old_size > new_size))
+			old_size = new_size;
+		memcpy(new_rl, rl, old_size);
+		ntfs_free(rl);
+	}
+	return new_rl;
+}
+
 /**
  * ntfs_are_rl_mergeable - test if two runlists can be joined together
  * @dst:	original runlist
@@ -621,11 +666,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
 				/* Add an unmapped runlist element. */
 				if (!slots) {
-					/* FIXME/TODO: We need to have the
-					 * extra memory already! (AIA) */
-					drl = ntfs_rl_realloc(drl, ds, ds + 2);
-					if (!drl)
-						goto critical_error;
+					drl = ntfs_rl_realloc_nofail(drl, ds,
+							ds + 2);
 					slots = 2;
 				}
 				ds++;
@@ -640,13 +682,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			drl[ds].length = marker_vcn - drl[ds].vcn;
 			/* Finally add the ENOENT terminator. */
 			ds++;
-			if (!slots) {
-				/* FIXME/TODO: We need to have the extra
-				 * memory already! (AIA) */
-				drl = ntfs_rl_realloc(drl, ds, ds + 1);
-				if (!drl)
-					goto critical_error;
-			}
+			if (!slots)
+				drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
 			drl[ds].vcn = marker_vcn;
 			drl[ds].lcn = LCN_ENOENT;
 			drl[ds].length = (s64)0;
@@ -659,11 +696,6 @@ finished:
 	ntfs_debug("Merged runlist:");
 	ntfs_debug_dump_runlist(drl);
 	return drl;
-
-critical_error:
-	/* Critical error! We cannot afford to fail here. */
-	ntfs_error(NULL, "Critical error! Not enough memory.");
-	panic("NTFS: Cannot continue.");
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 84d6ebe63f50b6efd8be252b58a207132157c60f Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 16:46:55 +0100
Subject: NTFS: Fix two nasty runlist merging bugs that had gone unnoticed so
 far.       Thanks to Stefano Picerno for the bug report.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog | 2 ++
 fs/ntfs/runlist.c | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index c5bbedff5ae..4bc8f91c5fc 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -43,6 +43,8 @@ ToDo/Notes:
 	- Use ntfs_malloc_nofs_nofail() in the two critical regions in
 	  fs/ntfs/runlist.c::ntfs_runlists_merge().  This means we no longer
 	  need to panic() if the allocation fails as it now cannot fail.
+	- Fix two nasty runlist merging bugs that had gone unnoticed so far.
+	  Thanks to Stefano Picerno for the bug report.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 3bb4a57d1fa..d26a1be530c 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -542,6 +542,7 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			/* Scan to the end of the source runlist. */
 			for (dend = 0; likely(drl[dend].length); dend++)
 				;
+			dend++;
 			drl = ntfs_rl_realloc(drl, dend, dend + 1);
 			if (IS_ERR(drl))
 				return drl;
@@ -611,8 +612,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 		 ((drl[dins].vcn + drl[dins].length) <=      /* End of hole   */
 		  (srl[send - 1].vcn + srl[send - 1].length)));
 
-	/* Or we'll lose an end marker */
-	if (start && finish && (drl[dins].length == 0))
+	/* Or we will lose an end marker. */
+	if (finish && !drl[dins].length)
 		ss++;
 	if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
 		finish = FALSE;
-- 
cgit v1.2.3-70-g09d2


From 8bb735216a0675e247bbe8b8b92c09d6884d1a17 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 16:48:28 +0100
Subject: NTFS: Remove two bogus BUG_ON()s from fs/ntfs/mft.c.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog | 1 +
 fs/ntfs/mft.c     | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 4bc8f91c5fc..f4c27f7c1b6 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -45,6 +45,7 @@ ToDo/Notes:
 	  need to panic() if the allocation fails as it now cannot fail.
 	- Fix two nasty runlist merging bugs that had gone unnoticed so far.
 	  Thanks to Stefano Picerno for the bug report.
+	- Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 317f7c679fd..e27651db453 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -511,7 +511,6 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 		} while (bh);
 		tail->b_this_page = head;
 		attach_page_buffers(page, head);
-		BUG_ON(!page_has_buffers(page));
 	}
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
@@ -692,7 +691,6 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	 */
 	if (!NInoTestClearDirty(ni))
 		goto done;
-	BUG_ON(!page_has_buffers(page));
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
 	rl = NULL;
-- 
cgit v1.2.3-70-g09d2


From 2b0ada2b8e086c267dd116a39ad41ff0a717b665 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 16:52:31 +0100
Subject: NTFS: Fix handling of valid but empty mapping pairs array in      
 fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog | 2 ++
 fs/ntfs/runlist.c | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index f4c27f7c1b6..8fe38c80116 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -46,6 +46,8 @@ ToDo/Notes:
 	- Fix two nasty runlist merging bugs that had gone unnoticed so far.
 	  Thanks to Stefano Picerno for the bug report.
 	- Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
+	- Fix handling of valid but empty mapping pairs array in
+	  fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index d26a1be530c..e4c4716939d 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -760,6 +760,9 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
 		ntfs_error(vol->sb, "Corrupt attribute.");
 		return ERR_PTR(-EIO);
 	}
+	/* If the mapping pairs array is valid but empty, nothing to do. */
+	if (!vcn && !*buf)
+		return old_rl;
 	/* Current position in runlist array. */
 	rlpos = 0;
 	/* Allocate first page and set current runlist size to one page. */
-- 
cgit v1.2.3-70-g09d2


From f94ad38e68e1623660fdbb063d0c580ba6661c29 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 17:04:11 +0100
Subject: NTFS: Report unrepresentable inodes during ntfs_readdir() as
 KERN_WARNING       messages and include the inode number.  Thanks to Yura
 Pakhuchiy for       pointing this out.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog | 3 +++
 fs/ntfs/dir.c     | 3 ++-
 fs/ntfs/unistr.c  | 3 ++-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 8fe38c80116..45f806fc45f 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -48,6 +48,9 @@ ToDo/Notes:
 	- Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
 	- Fix handling of valid but empty mapping pairs array in
 	  fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
+	- Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
+	  messages and include the inode number.  Thanks to Yura Pakhuchiy for
+	  pointing this out.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 46779471c54..795c3d1930f 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1051,7 +1051,8 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
 			ie->key.file_name.file_name_length, &name,
 			NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
 	if (name_len <= 0) {
-		ntfs_debug("Skipping unrepresentable file.");
+		ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
+				(long long)MREF_LE(ie->data.dir.indexed_file));
 		return 0;
 	}
 	if (ie->key.file_name.file_attributes &
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index 19c42e231b4..a389a5a16c8 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -372,7 +372,8 @@ retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
 	return -EINVAL;
 conversion_err:
 	ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
-			"converted to character set %s.", nls->charset);
+			"converted to character set %s.  You might want to "
+			"try to use the mount option nls=utf8.", nls->charset);
 	if (ns != *outs)
 		kfree(ns);
 	if (wc != -ENAMETOOLONG)
-- 
cgit v1.2.3-70-g09d2


From 3ffc5a443824fcf426d8d35dc632acc4dd9fb6d1 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 20:23:06 +0100
Subject: NTFS: Change ntfs_rl_truncate_nolock() to throw away the runlist if
 the new       length is zero.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  2 ++
 fs/ntfs/runlist.c | 14 +++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 45f806fc45f..79d9899b3b0 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -51,6 +51,8 @@ ToDo/Notes:
 	- Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
 	  messages and include the inode number.  Thanks to Yura Pakhuchiy for
 	  pointing this out.
+	- Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
+	  length is zero.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index e4c4716939d..539fa2b7f36 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1455,6 +1455,7 @@ err_out:
 
 /**
  * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
+ * @vol:	ntfs volume (needed for error output)
  * @runlist:	runlist to truncate
  * @new_length:	the new length of the runlist in VCNs
  *
@@ -1462,12 +1463,16 @@ err_out:
  * holding the runlist elements to a length of @new_length VCNs.
  *
  * If @new_length lies within the runlist, the runlist elements with VCNs of
- * @new_length and above are discarded.
+ * @new_length and above are discarded.  As a special case if @new_length is
+ * zero, the runlist is discarded and set to NULL.
  *
  * If @new_length lies beyond the runlist, a sparse runlist element is added to
  * the end of the runlist @runlist or if the last runlist element is a sparse
  * one already, this is extended.
  *
+ * Note, no checking is done for unmapped runlist elements.  It is assumed that
+ * the caller has mapped any elements that need to be mapped already.
+ *
  * Return 0 on success and -errno on error.
  *
  * Locking: The caller must hold @runlist->lock for writing.
@@ -1482,6 +1487,13 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 	BUG_ON(!runlist);
 	BUG_ON(new_length < 0);
 	rl = runlist->rl;
+	if (!new_length) {
+		ntfs_debug("Freeing runlist.");
+		runlist->rl = NULL;
+		if (rl)
+			ntfs_free(rl);
+		return 0;
+	}
 	if (unlikely(!rl)) {
 		/*
 		 * Create a runlist consisting of a sparse runlist element of
-- 
cgit v1.2.3-70-g09d2


From 6e48321a40610f7213e3ac75ba234f6f8b3ed5f5 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 20:26:34 +0100
Subject: NTFS: Add ntfs_rl_punch_nolock() which punches a caller specified
 hole into a runlist.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |   2 +
 fs/ntfs/runlist.c | 284 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ntfs/runlist.h |   3 +
 3 files changed, 289 insertions(+)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 79d9899b3b0..39dca6dced1 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -53,6 +53,8 @@ ToDo/Notes:
 	  pointing this out.
 	- Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
 	  length is zero.
+	- Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
+	  specified hole into a runlist.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 539fa2b7f36..f5b2ac92908 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1601,4 +1601,288 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 	return 0;
 }
 
+/**
+ * ntfs_rl_punch_nolock - punch a hole into a runlist
+ * @vol:	ntfs volume (needed for error output)
+ * @runlist:	runlist to punch a hole into
+ * @start:	starting VCN of the hole to be created
+ * @length:	size of the hole to be created in units of clusters
+ *
+ * Punch a hole into the runlist @runlist starting at VCN @start and of size
+ * @length clusters.
+ *
+ * Return 0 on success and -errno on error, in which case @runlist has not been
+ * modified.
+ *
+ * If @start and/or @start + @length are outside the runlist return error code
+ * -ENOENT.
+ *
+ * If the runlist contains unmapped or error elements between @start and @start
+ * + @length return error code -EINVAL.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const VCN start, const s64 length)
+{
+	const VCN end = start + length;
+	s64 delta;
+	runlist_element *rl, *rl_end, *rl_real_end, *trl;
+	int old_size;
+	BOOL lcn_fixup = FALSE;
+
+	ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
+			(long long)start, (long long)length);
+	BUG_ON(!runlist);
+	BUG_ON(start < 0);
+	BUG_ON(length < 0);
+	BUG_ON(end < 0);
+	rl = runlist->rl;
+	if (unlikely(!rl)) {
+		if (likely(!start && !length))
+			return 0;
+		return -ENOENT;
+	}
+	/* Find @start in the runlist. */
+	while (likely(rl->length && start >= rl[1].vcn))
+		rl++;
+	rl_end = rl;
+	/* Find @end in the runlist. */
+	while (likely(rl_end->length && end >= rl_end[1].vcn)) {
+		/* Verify there are no unmapped or error elements. */
+		if (unlikely(rl_end->lcn < LCN_HOLE))
+			return -EINVAL;
+		rl_end++;
+	}
+	/* Check the last element. */
+	if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
+		return -EINVAL;
+	/* This covers @start being out of bounds, too. */
+	if (!rl_end->length && end > rl_end->vcn)
+		return -ENOENT;
+	if (!length)
+		return 0;
+	if (!rl->length)
+		return -ENOENT;
+	rl_real_end = rl_end;
+	/* Determine the runlist size. */
+	while (likely(rl_real_end->length))
+		rl_real_end++;
+	old_size = rl_real_end - runlist->rl + 1;
+	/* If @start is in a hole simply extend the hole. */
+	if (rl->lcn == LCN_HOLE) {
+		/*
+		 * If both @start and @end are in the same sparse run, we are
+		 * done.
+		 */
+		if (end <= rl[1].vcn) {
+			ntfs_debug("Done (requested hole is already sparse).");
+			return 0;
+		}
+extend_hole:
+		/* Extend the hole. */
+		rl->length = end - rl->vcn;
+		/* If @end is in a hole, merge it with the current one. */
+		if (rl_end->lcn == LCN_HOLE) {
+			rl_end++;
+			rl->length = rl_end->vcn - rl->vcn;
+		}
+		/* We have done the hole.  Now deal with the remaining tail. */
+		rl++;
+		/* Cut out all runlist elements up to @end. */
+		if (rl < rl_end)
+			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+					sizeof(*rl));
+		/* Adjust the beginning of the tail if necessary. */
+		if (end > rl->vcn) {
+			s64 delta = end - rl->vcn;
+			rl->vcn = end;
+			rl->length -= delta;
+			/* Only adjust the lcn if it is real. */
+			if (rl->lcn >= 0)
+				rl->lcn += delta;
+		}
+shrink_allocation:
+		/* Reallocate memory if the allocation changed. */
+		if (rl < rl_end) {
+			rl = ntfs_rl_realloc(runlist->rl, old_size,
+					old_size - (rl_end - rl));
+			if (IS_ERR(rl))
+				ntfs_warning(vol->sb, "Failed to shrink "
+						"runlist buffer.  This just "
+						"wastes a bit of memory "
+						"temporarily so we ignore it "
+						"and return success.");
+			else
+				runlist->rl = rl;
+		}
+		ntfs_debug("Done (extend hole).");
+		return 0;
+	}
+	/*
+	 * If @start is at the beginning of a run things are easier as there is
+	 * no need to split the first run.
+	 */
+	if (start == rl->vcn) {
+		/*
+		 * @start is at the beginning of a run.
+		 *
+		 * If the previous run is sparse, extend its hole.
+		 *
+		 * If @end is not in the same run, switch the run to be sparse
+		 * and extend the newly created hole.
+		 *
+		 * Thus both of these cases reduce the problem to the above
+		 * case of "@start is in a hole".
+		 */
+		if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
+			rl--;
+			goto extend_hole;
+		}
+		if (end >= rl[1].vcn) {
+			rl->lcn = LCN_HOLE;
+			goto extend_hole;
+		}
+		/*
+		 * The final case is when @end is in the same run as @start.
+		 * For this need to split the run into two.  One run for the
+		 * sparse region between the beginning of the old run, i.e.
+		 * @start, and @end and one for the remaining non-sparse
+		 * region, i.e. between @end and the end of the old run.
+		 */
+		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+		if (IS_ERR(trl))
+			goto enomem_out;
+		old_size++;
+		if (runlist->rl != trl) {
+			rl = trl + (rl - runlist->rl);
+			rl_end = trl + (rl_end - runlist->rl);
+			rl_real_end = trl + (rl_real_end - runlist->rl);
+			runlist->rl = trl;
+		}
+split_end:
+		/* Shift all the runs up by one. */
+		memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+		/* Finally, setup the two split runs. */
+		rl->lcn = LCN_HOLE;
+		rl->length = length;
+		rl++;
+		rl->vcn += length;
+		/* Only adjust the lcn if it is real. */
+		if (rl->lcn >= 0 || lcn_fixup)
+			rl->lcn += length;
+		rl->length -= length;
+		ntfs_debug("Done (split one).");
+		return 0;
+	}
+	/*
+	 * @start is neither in a hole nor at the beginning of a run.
+	 *
+	 * If @end is in a hole, things are easier as simply truncating the run
+	 * @start is in to end at @start - 1, deleting all runs after that up
+	 * to @end, and finally extending the beginning of the run @end is in
+	 * to be @start is all that is needed.
+	 */
+	if (rl_end->lcn == LCN_HOLE) {
+		/* Truncate the run containing @start. */
+		rl->length = start - rl->vcn;
+		rl++;
+		/* Cut out all runlist elements up to @end. */
+		if (rl < rl_end)
+			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+					sizeof(*rl));
+		/* Extend the beginning of the run @end is in to be @start. */
+		rl->vcn = start;
+		rl->length = rl[1].vcn - start;
+		goto shrink_allocation;
+	}
+	/* 
+	 * If @end is not in a hole there are still two cases to distinguish.
+	 * Either @end is or is not in the same run as @start.
+	 *
+	 * The second case is easier as it can be reduced to an already solved
+	 * problem by truncating the run @start is in to end at @start - 1.
+	 * Then, if @end is in the next run need to split the run into a sparse
+	 * run followed by a non-sparse run (already covered above) and if @end
+	 * is not in the next run switching it to be sparse, again reduces the
+	 * problem to the already covered case of "@start is in a hole".
+	 */
+	if (end >= rl[1].vcn) {
+		/*
+		 * If @end is not in the next run, reduce the problem to the
+		 * case of "@start is in a hole".
+		 */
+		if (rl[1].length && end >= rl[2].vcn) {
+			/* Truncate the run containing @start. */
+			rl->length = start - rl->vcn;
+			rl++;
+			rl->vcn = start;
+			rl->lcn = LCN_HOLE;
+			goto extend_hole;
+		}
+		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+		if (IS_ERR(trl))
+			goto enomem_out;
+		old_size++;
+		if (runlist->rl != trl) {
+			rl = trl + (rl - runlist->rl);
+			rl_end = trl + (rl_end - runlist->rl);
+			rl_real_end = trl + (rl_real_end - runlist->rl);
+			runlist->rl = trl;
+		}
+		/* Truncate the run containing @start. */
+		rl->length = start - rl->vcn;
+		rl++;
+		/*
+		 * @end is in the next run, reduce the problem to the case
+		 * where "@start is at the beginning of a run and @end is in
+		 * the same run as @start".
+		 */
+		delta = rl->vcn - start;
+		rl->vcn = start;
+		if (rl->lcn >= 0) {
+			rl->lcn -= delta;
+			/* Need this in case the lcn just became negative. */
+			lcn_fixup = TRUE;
+		}
+		rl->length += delta;
+		goto split_end;
+	}
+	/*
+	 * The first case from above, i.e. @end is in the same run as @start.
+	 * We need to split the run into three.  One run for the non-sparse
+	 * region between the beginning of the old run and @start, one for the
+	 * sparse region between @start and @end, and one for the remaining
+	 * non-sparse region, i.e. between @end and the end of the old run.
+	 */
+	trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
+	if (IS_ERR(trl))
+		goto enomem_out;
+	old_size += 2;
+	if (runlist->rl != trl) {
+		rl = trl + (rl - runlist->rl);
+		rl_end = trl + (rl_end - runlist->rl);
+		rl_real_end = trl + (rl_real_end - runlist->rl);
+		runlist->rl = trl;
+	}
+	/* Shift all the runs up by two. */
+	memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+	/* Finally, setup the three split runs. */
+	rl->length = start - rl->vcn;
+	rl++;
+	rl->vcn = start;
+	rl->lcn = LCN_HOLE;
+	rl->length = length;
+	rl++;
+	delta = end - rl->vcn;
+	rl->vcn = end;
+	rl->lcn += delta;
+	rl->length -= delta;
+	ntfs_debug("Done (split both).");
+	return 0;
+enomem_out:
+	ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
+	return -ENOMEM;
+}
+
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
index aa0ee6540e7..47728fbb610 100644
--- a/fs/ntfs/runlist.h
+++ b/fs/ntfs/runlist.h
@@ -94,6 +94,9 @@ extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
 		runlist *const runlist, const s64 new_length);
 
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const VCN start, const s64 length);
+
 #endif /* NTFS_RW */
 
 #endif /* _LINUX_NTFS_RUNLIST_H */
-- 
cgit v1.2.3-70-g09d2


From 8e08ceaeacd5d300aaad166f2eef8bfc37e09831 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 20:29:50 +0100
Subject: NTFS: Fix a bug in fs/ntfs/index.c::ntfs_index_lookup().  When the
 returned       index entry is in the index root, we forgot to set the @ir
 pointer in       the index context.  Thanks for Yura Pakhuchiy for finding
 this bug.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog | 3 +++
 fs/ntfs/index.c   | 1 +
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 39dca6dced1..1168d3ed2be 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -55,6 +55,9 @@ ToDo/Notes:
 	  length is zero.
 	- Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
 	  specified hole into a runlist.
+	- Fix a bug in fs/ntfs/index.c::ntfs_index_lookup().  When the returned
+	  index entry is in the index root, we forgot to set the @ir pointer in
+	  the index context.  Thanks to Yura Pakhuchiy for finding this bug.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 11fd5307d78..8f2d5727546 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -205,6 +205,7 @@ int ntfs_index_lookup(const void *key, const int key_len,
 				&ie->key, key_len)) {
 ir_done:
 			ictx->is_in_root = TRUE;
+			ictx->ir = ir;
 			ictx->actx = actx;
 			ictx->base_ni = base_ni;
 			ictx->ia = NULL;
-- 
cgit v1.2.3-70-g09d2


From f25dfb5e44fa8641961780d681bc1871abcfb861 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 20:35:33 +0100
Subject: NTFS: Remove bogus setting of PageError in
 ntfs_read_compressed_block().

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog  | 1 +
 fs/ntfs/compress.c | 8 --------
 fs/ntfs/file.c     | 9 +++++++--
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 1168d3ed2be..96224c02079 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -58,6 +58,7 @@ ToDo/Notes:
 	- Fix a bug in fs/ntfs/index.c::ntfs_index_lookup().  When the returned
 	  index entry is in the index root, we forgot to set the @ir pointer in
 	  the index context.  Thanks to Yura Pakhuchiy for finding this bug.
+	- Remove bogus setting of PageError in ntfs_read_compressed_block().
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6d265cfd49a..25d24106f89 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -539,7 +539,6 @@ int ntfs_read_compressed_block(struct page *page)
 	if (unlikely(!pages || !bhs)) {
 		kfree(bhs);
 		kfree(pages);
-		SetPageError(page);
 		unlock_page(page);
 		ntfs_error(vol->sb, "Failed to allocate internal buffers.");
 		return -ENOMEM;
@@ -871,9 +870,6 @@ lock_retry_remap:
 			for (; prev_cur_page < cur_page; prev_cur_page++) {
 				page = pages[prev_cur_page];
 				if (page) {
-					if (prev_cur_page == xpage &&
-							!xpage_done)
-						SetPageError(page);
 					flush_dcache_page(page);
 					kunmap(page);
 					unlock_page(page);
@@ -904,8 +900,6 @@ lock_retry_remap:
 					"Terminating them with extreme "
 					"prejudice.  Inode 0x%lx, page index "
 					"0x%lx.", ni->mft_no, page->index);
-			if (cur_page == xpage && !xpage_done)
-				SetPageError(page);
 			flush_dcache_page(page);
 			kunmap(page);
 			unlock_page(page);
@@ -953,8 +947,6 @@ err_out:
 	for (i = cur_page; i < max_page; i++) {
 		page = pages[i];
 		if (page) {
-			if (i == xpage && !xpage_done)
-				SetPageError(page);
 			flush_dcache_page(page);
 			kunmap(page);
 			unlock_page(page);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index e0f530ce6b9..be9fd1dd423 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
- * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
+ * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -94,6 +94,11 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
 	if (!datasync || !NInoNonResident(NTFS_I(vi)))
 		ret = ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
+	/*
+	 * NOTE: If we were to use mapping->private_list (see ext2 and
+	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
+	 * sync_mapping_buffers(vi->i_mapping).
+	 */
 	err = sync_blockdev(vi->i_sb->s_bdev);
 	if (unlikely(err && !ret))
 		ret = err;
-- 
cgit v1.2.3-70-g09d2


From 0aacceacf35451ffb771ec825555e98c5dce8b01 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 20:40:32 +0100
Subject: NTFS: Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  1 +
 fs/ntfs/attrib.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 fs/ntfs/attrib.h  |  2 ++
 3 files changed, 43 insertions(+)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 96224c02079..29841bf8226 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -59,6 +59,7 @@ ToDo/Notes:
 	  index entry is in the index root, we forgot to set the @ir pointer in
 	  the index context.  Thanks to Yura Pakhuchiy for finding this bug.
 	- Remove bogus setting of PageError in ntfs_read_compressed_block().
+	- Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index cd0f9e740b1..79dda398068 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1246,6 +1246,46 @@ int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
 	return 0;
 }
 
+/**
+ * ntfs_resident_attr_value_resize - resize the value of a resident attribute
+ * @m:		mft record containing attribute record
+ * @a:		attribute record whose value to resize
+ * @new_size:	new size in bytes to which to resize the attribute value of @a
+ *
+ * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
+ * If the value is made bigger, the newly allocated space is cleared.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make a record smaller without having copied all the data you
+ *	    are interested in the data may be overwritten.
+ */
+int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+		const u32 new_size)
+{
+	u32 old_size;
+
+	/* Resize the resident part of the attribute record. */
+	if (ntfs_attr_record_resize(m, a,
+			le16_to_cpu(a->data.resident.value_offset) + new_size))
+		return -ENOSPC;
+	/*
+	 * The resize succeeded!  If we made the attribute value bigger, clear
+	 * the area between the old size and @new_size.
+	 */
+	old_size = le32_to_cpu(a->data.resident.value_length);
+	if (new_size > old_size)
+		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
+				old_size, 0, new_size - old_size);
+	/* Finally update the length of the attribute value. */
+	a->data.resident.value_length = cpu_to_le32(new_size);
+	return 0;
+}
+
 /**
  * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
  * @ni:		ntfs inode describing the attribute to convert
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 0e4ac6d3c0e..0618ed6fd7b 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -99,6 +99,8 @@ extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
 		const ATTR_TYPE type);
 
 extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+		const u32 new_size);
 
 extern int ntfs_attr_make_non_resident(ntfs_inode *ni);
 
-- 
cgit v1.2.3-70-g09d2


From 2983d1bd1a596e88cdddc0c2d45b9e97728f3f41 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 20:56:09 +0100
Subject: NTFS: Fix several bugs in fs/ntfs/attrib.c.

- Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
  access to the allocated size in the ntfs inode with the size lock.
- Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
  return LCN_ENOENT when there is no runlist and the allocated size is
  zero.
- Fix load_attribute_list() to handle the case of a NULL runlist.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  6 ++++++
 fs/ntfs/attrib.c  | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 29841bf8226..1d1dcab3902 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -60,6 +60,12 @@ ToDo/Notes:
 	  the index context.  Thanks to Yura Pakhuchiy for finding this bug.
 	- Remove bogus setting of PageError in ntfs_read_compressed_block().
 	- Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
+	- Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
+	  access to the allocated size in the ntfs inode with the size lock.
+	- Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
+	  return LCN_ENOENT when there is no runlist and the allocated size is
+	  zero.
+	- Fix load_attribute_list() to handle the case of a NULL runlist.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 79dda398068..98b5b96e839 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -43,6 +43,9 @@
  * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
  * of bounds of the runlist.
  *
+ * Note the runlist can be NULL after this function returns if @vcn is zero and
+ * the attribute has zero allocated size, i.e. there simply is no runlist.
+ *
  * Locking: - The runlist must be locked for writing.
  *	    - This function modifies the runlist.
  */
@@ -54,6 +57,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
 	ATTR_RECORD *a;
 	ntfs_attr_search_ctx *ctx;
 	runlist_element *rl;
+	unsigned long flags;
 	int err = 0;
 
 	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
@@ -85,8 +89,11 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
 	 * ntfs_mapping_pairs_decompress() fails.
 	 */
 	end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
-	if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1))
+	if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1)) {
+		read_lock_irqsave(&ni->size_lock, flags);
 		end_vcn = ni->allocated_size >> ni->vol->cluster_size_bits;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
 	if (unlikely(vcn >= end_vcn)) {
 		err = -ENOENT;
 		goto err_out;
@@ -165,6 +172,7 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
 		const BOOL write_locked)
 {
 	LCN lcn;
+	unsigned long flags;
 	BOOL is_retry = FALSE;
 
 	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
@@ -173,6 +181,14 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
 	BUG_ON(!ni);
 	BUG_ON(!NInoNonResident(ni));
 	BUG_ON(vcn < 0);
+	if (!ni->runlist.rl) {
+		read_lock_irqsave(&ni->size_lock, flags);
+		if (!ni->allocated_size) {
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			return LCN_ENOENT;
+		}
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
 retry_remap:
 	/* Convert vcn to lcn.  If that fails map the runlist and retry once. */
 	lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
@@ -255,6 +271,7 @@ retry_remap:
 runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
 		const BOOL write_locked)
 {
+	unsigned long flags;
 	runlist_element *rl;
 	int err = 0;
 	BOOL is_retry = FALSE;
@@ -265,6 +282,14 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
 	BUG_ON(!ni);
 	BUG_ON(!NInoNonResident(ni));
 	BUG_ON(vcn < 0);
+	if (!ni->runlist.rl) {
+		read_lock_irqsave(&ni->size_lock, flags);
+		if (!ni->allocated_size) {
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			return ERR_PTR(-ENOENT);
+		}
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
 retry_remap:
 	rl = ni->runlist.rl;
 	if (likely(rl && vcn >= rl[0].vcn)) {
@@ -528,6 +553,11 @@ int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
 	block_size_bits = sb->s_blocksize_bits;
 	down_read(&runlist->lock);
 	rl = runlist->rl;
+	if (!rl) {
+		ntfs_error(sb, "Cannot read attribute list since runlist is "
+				"missing.");
+		goto err_out;	
+	}
 	/* Read all clusters specified by the runlist one run at a time. */
 	while (rl->length) {
 		lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
-- 
cgit v1.2.3-70-g09d2


From 807c453de7c5487d2e5eece76bafdea8f39d249e Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 21:01:17 +0100
Subject: NTFS: Fix handling of sparse attributes in
 ntfs_attr_make_non_resident().       Also, add BUG() checks to
 ntfs_attr_make_non_resident() and       ntfs_attr_set() to ensure that these
 functions are never called       for compressed or encrypted attributes.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  4 ++++
 fs/ntfs/attrib.c  | 53 ++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 40 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 1d1dcab3902..b0cc43bd297 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -66,6 +66,10 @@ ToDo/Notes:
 	  return LCN_ENOENT when there is no runlist and the allocated size is
 	  zero.
 	- Fix load_attribute_list() to handle the case of a NULL runlist.
+	- Fix handling of sparse attributes in ntfs_attr_make_non_resident().
+	- Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
+	  to ensure that these functions are never called for compressed or
+	  encrypted attributes.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 98b5b96e839..3f9a4ff42ee 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1371,6 +1371,12 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 					"volume!");
 		return err;
 	}
+	/*
+	 * FIXME: Compressed and encrypted attributes are not supported when
+	 * writing and we should never have gotten here for them.
+	 */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoEncrypted(ni));
 	/*
 	 * The size needs to be aligned to a cluster boundary for allocation
 	 * purposes.
@@ -1447,10 +1453,15 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 	BUG_ON(a->non_resident);
 	/*
 	 * Calculate new offsets for the name and the mapping pairs array.
-	 * We assume the attribute is not compressed or sparse.
 	 */
-	name_ofs = (offsetof(ATTR_REC,
-			data.non_resident.compressed_size) + 7) & ~7;
+	if (NInoSparse(ni) || NInoCompressed(ni))
+		name_ofs = (offsetof(ATTR_REC,
+				data.non_resident.compressed_size) +
+				sizeof(a->data.non_resident.compressed_size) +
+				7) & ~7;
+	else
+		name_ofs = (offsetof(ATTR_REC,
+				data.non_resident.compressed_size) + 7) & ~7;
 	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
 	/*
 	 * Determine the size of the resident part of the now non-resident
@@ -1489,24 +1500,23 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
 				a->name_length * sizeof(ntfschar));
 	a->name_offset = cpu_to_le16(name_ofs);
-	/*
-	 * FIXME: For now just clear all of these as we do not support them
-	 * when writing.
-	 */
-	a->flags &= cpu_to_le16(0xffff & ~le16_to_cpu(ATTR_IS_SPARSE |
-			ATTR_IS_ENCRYPTED | ATTR_COMPRESSION_MASK));
 	/* Setup the fields specific to non-resident attributes. */
 	a->data.non_resident.lowest_vcn = 0;
 	a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
 			vol->cluster_size_bits);
 	a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
-	a->data.non_resident.compression_unit = 0;
 	memset(&a->data.non_resident.reserved, 0,
 			sizeof(a->data.non_resident.reserved));
 	a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
 	a->data.non_resident.data_size =
 			a->data.non_resident.initialized_size =
 			cpu_to_sle64(attr_size);
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		a->data.non_resident.compression_unit = 4;
+		a->data.non_resident.compressed_size =
+				a->data.non_resident.allocated_size;
+	} else
+		a->data.non_resident.compression_unit = 0;
 	/* Generate the mapping pairs array into the attribute record. */
 	err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
 			arec_size - mp_ofs, rl, 0, -1, NULL);
@@ -1516,16 +1526,19 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 		goto undo_err_out;
 	}
 	/* Setup the in-memory attribute structure to be non-resident. */
-	/*
-	 * FIXME: For now just clear all of these as we do not support them
-	 * when writing.
-	 */
-	NInoClearSparse(ni);
-	NInoClearEncrypted(ni);
-	NInoClearCompressed(ni);
 	ni->runlist.rl = rl;
 	write_lock_irqsave(&ni->size_lock, flags);
 	ni->allocated_size = new_size;
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		ni->itype.compressed.size = ni->allocated_size;
+		ni->itype.compressed.block_size = 1U <<
+				(a->data.non_resident.compression_unit +
+				vol->cluster_size_bits);
+		ni->itype.compressed.block_size_bits =
+				ffs(ni->itype.compressed.block_size) - 1;
+		ni->itype.compressed.block_clusters = 1U <<
+				a->data.non_resident.compression_unit;
+	}
 	write_unlock_irqrestore(&ni->size_lock, flags);
 	/*
 	 * This needs to be last since the address space operations ->readpage
@@ -1673,6 +1686,12 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	BUG_ON(cnt < 0);
 	if (!cnt)
 		goto done;
+	/*
+	 * FIXME: Compressed and encrypted attributes are not supported when
+	 * writing and we should never have gotten here for them.
+	 */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoEncrypted(ni));
 	mapping = VFS_I(ni)->i_mapping;
 	/* Work out the starting index and page offset. */
 	idx = ofs >> PAGE_CACHE_SHIFT;
-- 
cgit v1.2.3-70-g09d2


From bbf1813fb8ff9d21171bf22e6d1f0e0393601e86 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 21:09:06 +0100
Subject: NTFS: Fix cluster (de)allocators to work when the runlist is NULL and
 more       importantly to take a locked runlist rather than them locking it  
     which leads to lock reversal.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog  |  3 +++
 fs/ntfs/lcnalloc.c | 39 ++++++++++++++++-----------------------
 fs/ntfs/lcnalloc.h | 21 ++++++++++++---------
 fs/ntfs/mft.c      |  2 +-
 4 files changed, 32 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index b0cc43bd297..beed90c9f55 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -70,6 +70,9 @@ ToDo/Notes:
 	- Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
 	  to ensure that these functions are never called for compressed or
 	  encrypted attributes.
+	- Fix cluster (de)allocators to work when the runlist is NULL and more
+	  importantly to take a locked runlist rather than them locking it
+	  which leads to lock reversal.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index a4bc07616e5..7b593429068 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -54,6 +54,8 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
 	int ret = 0;
 
 	ntfs_debug("Entering.");
+	if (!rl)
+		return 0;
 	for (; rl->length; rl++) {
 		int err;
 
@@ -163,17 +165,9 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 	BUG_ON(zone < FIRST_ZONE);
 	BUG_ON(zone > LAST_ZONE);
 
-	/* Return empty runlist if @count == 0 */
-	// FIXME: Do we want to just return NULL instead? (AIA)
-	if (!count) {
-		rl = ntfs_malloc_nofs(PAGE_SIZE);
-		if (!rl)
-			return ERR_PTR(-ENOMEM);
-		rl[0].vcn = start_vcn;
-		rl[0].lcn = LCN_RL_NOT_MAPPED;
-		rl[0].length = 0;
-		return rl;
-	}
+	/* Return NULL if @count is zero. */
+	if (!count)
+		return NULL;
 	/* Take the lcnbmp lock for writing. */
 	down_write(&vol->lcnbmp_lock);
 	/*
@@ -788,7 +782,8 @@ out:
  * @vi:		vfs inode whose runlist describes the clusters to free
  * @start_vcn:	vcn in the runlist of @vi at which to start freeing clusters
  * @count:	number of clusters to free or -1 for all clusters
- * @is_rollback:	if TRUE this is a rollback operation
+ * @write_locked:	true if the runlist is locked for writing
+ * @is_rollback:	true if this is a rollback operation
  *
  * Free @count clusters starting at the cluster @start_vcn in the runlist
  * described by the vfs inode @vi.
@@ -806,17 +801,17 @@ out:
  * Return the number of deallocated clusters (not counting sparse ones) on
  * success and -errno on error.
  *
- * Locking: - The runlist described by @vi must be unlocked on entry and is
- *	      unlocked on return.
- *	    - This function takes the runlist lock of @vi for reading and
- *	      sometimes for writing and sometimes modifies the runlist.
+ * Locking: - The runlist described by @vi must be locked on entry and is
+ *	      locked on return.  Note if the runlist is locked for reading the
+ *	      lock may be dropped and reacquired.  Note the runlist may be
+ *	      modified when needed runlist fragments need to be mapped.
  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
  *	      on return.
  *	    - This function takes the volume lcn bitmap lock for writing and
  *	      modifies the bitmap contents.
  */
 s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
-		const BOOL is_rollback)
+		const BOOL write_locked, const BOOL is_rollback)
 {
 	s64 delta, to_free, total_freed, real_freed;
 	ntfs_inode *ni;
@@ -848,8 +843,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 	total_freed = real_freed = 0;
 
-	down_read(&ni->runlist.lock);
-	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, FALSE);
+	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, write_locked);
 	if (IS_ERR(rl)) {
 		if (!is_rollback)
 			ntfs_error(vol->sb, "Failed to find first runlist "
@@ -903,7 +897,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 			/* Attempt to map runlist. */
 			vcn = rl->vcn;
-			rl = ntfs_attr_find_vcn_nolock(ni, vcn, FALSE);
+			rl = ntfs_attr_find_vcn_nolock(ni, vcn, write_locked);
 			if (IS_ERR(rl)) {
 				err = PTR_ERR(rl);
 				if (!is_rollback)
@@ -950,7 +944,6 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 		/* Update the total done clusters. */
 		total_freed += to_free;
 	}
-	up_read(&ni->runlist.lock);
 	if (likely(!is_rollback))
 		up_write(&vol->lcnbmp_lock);
 
@@ -960,7 +953,6 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 	ntfs_debug("Done.");
 	return real_freed;
 err_out:
-	up_read(&ni->runlist.lock);
 	if (is_rollback)
 		return err;
 	/* If no real clusters were freed, no need to rollback. */
@@ -973,7 +965,8 @@ err_out:
 	 * If rollback fails, set the volume errors flag, emit an error
 	 * message, and return the error code.
 	 */
-	delta = __ntfs_cluster_free(vi, start_vcn, total_freed, TRUE);
+	delta = __ntfs_cluster_free(vi, start_vcn, total_freed, write_locked,
+			TRUE);
 	if (delta < 0) {
 		ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
 				"inconsistent metadata!  Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index 4cac1c024af..e4d7fb98d68 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -43,13 +43,14 @@ extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
 		const NTFS_CLUSTER_ALLOCATION_ZONES zone);
 
 extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-		s64 count, const BOOL is_rollback);
+		s64 count, const BOOL write_locked, const BOOL is_rollback);
 
 /**
  * ntfs_cluster_free - free clusters on an ntfs volume
  * @vi:		vfs inode whose runlist describes the clusters to free
  * @start_vcn:	vcn in the runlist of @vi at which to start freeing clusters
  * @count:	number of clusters to free or -1 for all clusters
+ * @write_locked:	true if the runlist is locked for writing
  *
  * Free @count clusters starting at the cluster @start_vcn in the runlist
  * described by the vfs inode @vi.
@@ -64,19 +65,19 @@ extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
  * Return the number of deallocated clusters (not counting sparse ones) on
  * success and -errno on error.
  *
- * Locking: - The runlist described by @vi must be unlocked on entry and is
- *	      unlocked on return.
- *	    - This function takes the runlist lock of @vi for reading and
- *	      sometimes for writing and sometimes modifies the runlist.
+ * Locking: - The runlist described by @vi must be locked on entry and is
+ *	      locked on return.  Note if the runlist is locked for reading the
+ *	      lock may be dropped and reacquired.  Note the runlist may be
+ *	      modified when needed runlist fragments need to be mapped.
  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
  *	      on return.
  *	    - This function takes the volume lcn bitmap lock for writing and
  *	      modifies the bitmap contents.
  */
 static inline s64 ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-		s64 count)
+		s64 count, const BOOL write_locked)
 {
-	return __ntfs_cluster_free(vi, start_vcn, count, FALSE);
+	return __ntfs_cluster_free(vi, start_vcn, count, write_locked, FALSE);
 }
 
 extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
@@ -93,8 +94,10 @@ extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
  *
  * Return 0 on success and -errno on error.
  *
- * Locking: This function takes the volume lcn bitmap lock for writing and
- *	    modifies the bitmap contents.
+ * Locking: - This function takes the volume lcn bitmap lock for writing and
+ *	      modifies the bitmap contents.
+ *	    - The caller must have locked the runlist @rl for reading or
+ *	      writing.
  */
 static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
 		const runlist_element *rl)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index e27651db453..2c32b84385a 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1953,7 +1953,7 @@ restore_undo_alloc:
 	a = ctx->attr;
 	a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
 undo_alloc:
-	if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1) < 0) {
+	if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1, TRUE) < 0) {
 		ntfs_error(vol->sb, "Failed to free clusters from mft data "
 				"attribute.%s", es);
 		NVolSetErrors(vol);
-- 
cgit v1.2.3-70-g09d2


From 1c7d469d47668f4664b892a6cd1c452a0c02d710 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 21:15:09 +0100
Subject: NTFS: Truncate {a,c,m}time to the ntfs supported time granularity
 when       updating the times in the inode in ntfs_setattr().

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  2 ++
 fs/ntfs/inode.c   | 12 +++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index beed90c9f55..9c5af9c8a8a 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -73,6 +73,8 @@ ToDo/Notes:
 	- Fix cluster (de)allocators to work when the runlist is NULL and more
 	  importantly to take a locked runlist rather than them locking it
 	  which leads to lock reversal.
+	- Truncate {a,c,m}time to the ntfs supported time granularity when
+	  updating the times in the inode in ntfs_setattr().
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 886214a77f9..89d844f6924 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2430,16 +2430,18 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 			 * We skipped the truncate but must still update
 			 * timestamps.
 			 */
-			ia_valid |= ATTR_MTIME|ATTR_CTIME;
+			ia_valid |= ATTR_MTIME | ATTR_CTIME;
 		}
 	}
-
 	if (ia_valid & ATTR_ATIME)
-		vi->i_atime = attr->ia_atime;
+		vi->i_atime = timespec_trunc(attr->ia_atime,
+				vi->i_sb->s_time_gran);
 	if (ia_valid & ATTR_MTIME)
-		vi->i_mtime = attr->ia_mtime;
+		vi->i_mtime = timespec_trunc(attr->ia_mtime,
+				vi->i_sb->s_time_gran);
 	if (ia_valid & ATTR_CTIME)
-		vi->i_ctime = attr->ia_ctime;
+		vi->i_ctime = timespec_trunc(attr->ia_ctime,
+				vi->i_sb->s_time_gran);
 	mark_inode_dirty(vi);
 out:
 	return err;
-- 
cgit v1.2.3-70-g09d2


From 67bb103725e4cde322cb4ddb160a12933c5c7072 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 21:19:45 +0100
Subject: NTFS: Fixup handling of sparse, compressed, and encrypted attributes
 in       fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode().

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |   2 +
 fs/ntfs/inode.c   | 215 +++++++++++++++++++++++++++++-------------------------
 2 files changed, 116 insertions(+), 101 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 9c5af9c8a8a..ed8d0015093 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -75,6 +75,8 @@ ToDo/Notes:
 	  which leads to lock reversal.
 	- Truncate {a,c,m}time to the ntfs supported time granularity when
 	  updating the times in the inode in ntfs_setattr().
+	- Fixup handling of sparse, compressed, and encrypted attributes in
+	  fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode().
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 89d844f6924..dc4bbe3acf5 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1013,41 +1013,50 @@ skip_large_dir_stuff:
 		}
 		a = ctx->attr;
 		/* Setup the state. */
-		if (a->non_resident) {
-			NInoSetNonResident(ni);
-			if (a->flags & (ATTR_COMPRESSION_MASK |
-					ATTR_IS_SPARSE)) {
-				if (a->flags & ATTR_COMPRESSION_MASK) {
-					NInoSetCompressed(ni);
-					if (vol->cluster_size > 4096) {
-						ntfs_error(vi->i_sb, "Found "
+		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+			if (a->flags & ATTR_COMPRESSION_MASK) {
+				NInoSetCompressed(ni);
+				if (vol->cluster_size > 4096) {
+					ntfs_error(vi->i_sb, "Found "
 							"compressed data but "
 							"compression is "
 							"disabled due to "
 							"cluster size (%i) > "
 							"4kiB.",
 							vol->cluster_size);
-						goto unm_err_out;
-					}
-					if ((a->flags & ATTR_COMPRESSION_MASK)
-							!= ATTR_IS_COMPRESSED) {
-						ntfs_error(vi->i_sb, "Found "
-							"unknown compression "
-							"method or corrupt "
-							"file.");
-						goto unm_err_out;
-					}
+					goto unm_err_out;
+				}
+				if ((a->flags & ATTR_COMPRESSION_MASK)
+						!= ATTR_IS_COMPRESSED) {
+					ntfs_error(vi->i_sb, "Found unknown "
+							"compression method "
+							"or corrupt file.");
+					goto unm_err_out;
 				}
-				if (a->flags & ATTR_IS_SPARSE)
-					NInoSetSparse(ni);
+			}
+			if (a->flags & ATTR_IS_SPARSE)
+				NInoSetSparse(ni);
+		}
+		if (a->flags & ATTR_IS_ENCRYPTED) {
+			if (NInoCompressed(ni)) {
+				ntfs_error(vi->i_sb, "Found encrypted and "
+						"compressed data.");
+				goto unm_err_out;
+			}
+			NInoSetEncrypted(ni);
+		}
+		if (a->non_resident) {
+			NInoSetNonResident(ni);
+			if (NInoCompressed(ni) || NInoSparse(ni)) {
 				if (a->data.non_resident.compression_unit !=
 						4) {
 					ntfs_error(vi->i_sb, "Found "
-						"nonstandard compression unit "
-						"(%u instead of 4).  Cannot "
-						"handle this.",
-						a->data.non_resident.
-						compression_unit);
+							"nonstandard "
+							"compression unit (%u "
+							"instead of 4).  "
+							"Cannot handle this.",
+							a->data.non_resident.
+							compression_unit);
 					err = -EOPNOTSUPP;
 					goto unm_err_out;
 				}
@@ -1065,14 +1074,6 @@ skip_large_dir_stuff:
 						a->data.non_resident.
 						compressed_size);
 			}
-			if (a->flags & ATTR_IS_ENCRYPTED) {
-				if (a->flags & ATTR_COMPRESSION_MASK) {
-					ntfs_error(vi->i_sb, "Found encrypted "
-							"and compressed data.");
-					goto unm_err_out;
-				}
-				NInoSetEncrypted(ni);
-			}
 			if (a->data.non_resident.lowest_vcn) {
 				ntfs_error(vi->i_sb, "First extent of $DATA "
 						"attribute has non zero "
@@ -1212,6 +1213,75 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 	if (unlikely(err))
 		goto unm_err_out;
 	a = ctx->attr;
+	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+		if (a->flags & ATTR_COMPRESSION_MASK) {
+			NInoSetCompressed(ni);
+			if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
+					ni->name_len)) {
+				ntfs_error(vi->i_sb, "Found compressed "
+						"non-data or named data "
+						"attribute.  Please report "
+						"you saw this message to "
+						"linux-ntfs-dev@lists."
+						"sourceforge.net");
+				goto unm_err_out;
+			}
+			if (vol->cluster_size > 4096) {
+				ntfs_error(vi->i_sb, "Found compressed "
+						"attribute but compression is "
+						"disabled due to cluster size "
+						"(%i) > 4kiB.",
+						vol->cluster_size);
+				goto unm_err_out;
+			}
+			if ((a->flags & ATTR_COMPRESSION_MASK) !=
+					ATTR_IS_COMPRESSED) {
+				ntfs_error(vi->i_sb, "Found unknown "
+						"compression method.");
+				goto unm_err_out;
+			}
+		}
+		/*
+		 * The encryption flag set in an index root just means to
+		 * compress all files.
+		 */
+		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is %s.  Please "
+					"report you saw this message to "
+					"linux-ntfs-dev@lists.sourceforge.net",
+					NInoCompressed(ni) ? "compressed" :
+					"sparse");
+			goto unm_err_out;
+		}
+		if (a->flags & ATTR_IS_SPARSE)
+			NInoSetSparse(ni);
+	}
+	if (a->flags & ATTR_IS_ENCRYPTED) {
+		if (NInoCompressed(ni)) {
+			ntfs_error(vi->i_sb, "Found encrypted and compressed "
+					"data.");
+			goto unm_err_out;
+		}
+		/*
+		 * The encryption flag set in an index root just means to
+		 * encrypt all files.
+		 */
+		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is encrypted.  "
+					"Please report you saw this message "
+					"to linux-ntfs-dev@lists.sourceforge."
+					"net");
+			goto unm_err_out;
+		}
+		if (ni->type != AT_DATA) {
+			ntfs_error(vi->i_sb, "Found encrypted non-data "
+					"attribute.");
+			goto unm_err_out;
+		}
+		NInoSetEncrypted(ni);
+	}
 	if (!a->non_resident) {
 		/* Ensure the attribute name is placed before the value. */
 		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
@@ -1220,11 +1290,10 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 					"the attribute value.");
 			goto unm_err_out;
 		}
-		if (NInoMstProtected(ni) || a->flags) {
+		if (NInoMstProtected(ni)) {
 			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"or attribute with non-zero flags but "
-					"the attribute is resident.  Please "
-					"report you saw this message to "
+					"but the attribute is resident.  "
+					"Please report you saw this message to "
 					"linux-ntfs-dev@lists.sourceforge.net");
 			goto unm_err_out;
 		}
@@ -1250,50 +1319,8 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 					"the mapping pairs array.");
 			goto unm_err_out;
 		}
-		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
-			if (a->flags & ATTR_COMPRESSION_MASK) {
-				NInoSetCompressed(ni);
-				if ((ni->type != AT_DATA) || (ni->type ==
-						AT_DATA && ni->name_len)) {
-					ntfs_error(vi->i_sb, "Found compressed "
-							"non-data or named "
-							"data attribute.  "
-							"Please report you "
-							"saw this message to "
-							"linux-ntfs-dev@lists."
-							"sourceforge.net");
-					goto unm_err_out;
-				}
-				if (vol->cluster_size > 4096) {
-					ntfs_error(vi->i_sb, "Found compressed "
-							"attribute but "
-							"compression is "
-							"disabled due to "
-							"cluster size (%i) > "
-							"4kiB.",
-							vol->cluster_size);
-					goto unm_err_out;
-				}
-				if ((a->flags & ATTR_COMPRESSION_MASK) !=
-						ATTR_IS_COMPRESSED) {
-					ntfs_error(vi->i_sb, "Found unknown "
-							"compression method.");
-					goto unm_err_out;
-				}
-			}
-			if (NInoMstProtected(ni)) {
-				ntfs_error(vi->i_sb, "Found mst protected "
-						"attribute but the attribute "
-						"is %s.  Please report you "
-						"saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net",
-						NInoCompressed(ni) ?
-						"compressed" : "sparse");
-				goto unm_err_out;
-			}
-			if (a->flags & ATTR_IS_SPARSE)
-				NInoSetSparse(ni);
+		if ((NInoCompressed(ni) || NInoSparse(ni)) &&
+				ni->type != AT_INDEX_ROOT) {
 			if (a->data.non_resident.compression_unit != 4) {
 				ntfs_error(vi->i_sb, "Found nonstandard "
 						"compression unit (%u instead "
@@ -1313,23 +1340,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 			ni->itype.compressed.size = sle64_to_cpu(
 					a->data.non_resident.compressed_size);
 		}
-		if (a->flags & ATTR_IS_ENCRYPTED) {
-			if (a->flags & ATTR_COMPRESSION_MASK) {
-				ntfs_error(vi->i_sb, "Found encrypted and "
-						"compressed data.");
-				goto unm_err_out;
-			}
-			if (NInoMstProtected(ni)) {
-				ntfs_error(vi->i_sb, "Found mst protected "
-						"attribute but the attribute "
-						"is encrypted.  Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto unm_err_out;
-			}
-			NInoSetEncrypted(ni);
-		}
 		if (a->data.non_resident.lowest_vcn) {
 			ntfs_error(vi->i_sb, "First extent of attribute has "
 					"non-zero lowest_vcn.");
@@ -1348,12 +1358,12 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 		vi->i_mapping->a_ops = &ntfs_mst_aops;
 	else
 		vi->i_mapping->a_ops = &ntfs_aops;
-	if (NInoCompressed(ni) || NInoSparse(ni))
+	if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
 		vi->i_blocks = ni->itype.compressed.size >> 9;
 	else
 		vi->i_blocks = ni->allocated_size >> 9;
 	/*
-	 * Make sure the base inode doesn't go away and attach it to the
+	 * Make sure the base inode does not go away and attach it to the
 	 * attribute inode.
 	 */
 	igrab(base_vi);
@@ -1480,7 +1490,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 				"after the attribute value.");
 		goto unm_err_out;
 	}
-	/* Compressed/encrypted/sparse index root is not allowed. */
+	/*
+	 * Compressed/encrypted/sparse index root is not allowed, except for
+	 * directories of course but those are not dealt with here.
+	 */
 	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
 			ATTR_IS_SPARSE)) {
 		ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
-- 
cgit v1.2.3-70-g09d2


From 8dcdebafb848415eae25924b00c4f0b9ec907da0 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 21:25:48 +0100
Subject: NTFS: Make ntfs_write_block() not instantiate sparse blocks if they
 are zero.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  2 ++
 fs/ntfs/aops.c    | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index ed8d0015093..b1766642baf 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -77,6 +77,8 @@ ToDo/Notes:
 	  updating the times in the inode in ntfs_setattr().
 	- Fixup handling of sparse, compressed, and encrypted attributes in
 	  fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode().
+	- Make ntfs_write_block() not instantiate sparse blocks if they contain
+	  only zeroes.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 78adad7a988..f3ad36d8b8c 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -670,6 +670,27 @@ lock_retry_remap:
 		}
 		/* It is a hole, need to instantiate it. */
 		if (lcn == LCN_HOLE) {
+			u8 *kaddr;
+			unsigned long *bpos, *bend;
+
+			/* Check if the buffer is zero. */
+			kaddr = kmap_atomic(page, KM_USER0);
+			bpos = (unsigned long *)(kaddr + bh_offset(bh));
+			bend = (unsigned long *)((u8*)bpos + blocksize);
+			do {
+				if (unlikely(*bpos))
+					break;
+			} while (likely(++bpos < bend));
+			kunmap_atomic(kaddr, KM_USER0);
+			if (bpos == bend) {
+				/*
+				 * Buffer is zero and sparse, no need to write
+				 * it.
+				 */
+				bh->b_blocknr = -1;
+				clear_buffer_dirty(bh);
+				continue;
+			}
 			// TODO: Instantiate the hole.
 			// clear_buffer_new(bh);
 			// unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
-- 
cgit v1.2.3-70-g09d2


From bd45fdd209ca49c5010ac9af469c41ae6dd3f145 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 21:38:05 +0100
Subject: NTFS: Fixup handling of sparse, compressed, and encrypted attributes
 in       fs/ntfs/aops.c::ntfs_writepage().

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |   2 ++
 fs/ntfs/aops.c    | 104 ++++++++++++++++++++++++------------------------------
 2 files changed, 49 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index b1766642baf..4b8666d7e7f 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -79,6 +79,8 @@ ToDo/Notes:
 	  fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode().
 	- Make ntfs_write_block() not instantiate sparse blocks if they contain
 	  only zeroes.
+	- Fixup handling of sparse, compressed, and encrypted attributes in
+	  fs/ntfs/aops.c::ntfs_writepage().
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index f3ad36d8b8c..6f2954aac5a 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1301,38 +1301,42 @@ retry_writepage:
 		ntfs_debug("Write outside i_size - truncated?");
 		return 0;
 	}
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.
+	 */
+	if (ni->type != AT_INDEX_ROOT) {
+		/* If file is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			unlock_page(page);
+			BUG_ON(ni->type != AT_DATA);
+			ntfs_debug("Denying write access to encrypted "
+					"file.");
+			return -EACCES;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			// TODO: Implement and replace this with
+			// return ntfs_write_compressed_block(page);
+			unlock_page(page);
+			ntfs_error(vi->i_sb, "Writing to compressed files is "
+					"not supported yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+		// TODO: Implement and remove this check.
+		if (NInoNonResident(ni) && NInoSparse(ni)) {
+			unlock_page(page);
+			ntfs_error(vi->i_sb, "Writing to sparse files is not "
+					"supported yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+	}
 	/* NInoNonResident() == NInoIndexAllocPresent() */
 	if (NInoNonResident(ni)) {
-		/*
-		 * Only unnamed $DATA attributes can be compressed, encrypted,
-		 * and/or sparse.
-		 */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* If file is encrypted, deny access, just like NT4. */
-			if (NInoEncrypted(ni)) {
-				unlock_page(page);
-				ntfs_debug("Denying write access to encrypted "
-						"file.");
-				return -EACCES;
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni)) {
-				// TODO: Implement and replace this check with
-				// return ntfs_write_compressed_block(page);
-				unlock_page(page);
-				ntfs_error(vi->i_sb, "Writing to compressed "
-						"files is not supported yet. "
-						"Sorry.");
-				return -EOPNOTSUPP;
-			}
-			// TODO: Implement and remove this check.
-			if (NInoSparse(ni)) {
-				unlock_page(page);
-				ntfs_error(vi->i_sb, "Writing to sparse files "
-						"is not supported yet. Sorry.");
-				return -EOPNOTSUPP;
-			}
-		}
 		/* We have to zero every time due to mmap-at-end-of-file. */
 		if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
 			/* The page straddles i_size. */
@@ -1345,14 +1349,16 @@ retry_writepage:
 		/* Handle mst protected attributes. */
 		if (NInoMstProtected(ni))
 			return ntfs_write_mst_block(page, wbc);
-		/* Normal data stream. */
+		/* Normal, non-resident data stream. */
 		return ntfs_write_block(page, wbc);
 	}
 	/*
-	 * Attribute is resident, implying it is not compressed, encrypted,
-	 * sparse, or mst protected.  This also means the attribute is smaller
-	 * than an mft record and hence smaller than a page, so can simply
-	 * return error on any pages with index above 0.
+	 * Attribute is resident, implying it is not compressed, encrypted, or
+	 * mst protected.  This also means the attribute is smaller than an mft
+	 * record and hence smaller than a page, so can simply return error on
+	 * any pages with index above 0.  Note the attribute can actually be
+	 * marked compressed but if it is resident the actual data is not
+	 * compressed so we are ok to ignore the compressed flag here.
 	 */
 	BUG_ON(page_has_buffers(page));
 	BUG_ON(!PageUptodate(page));
@@ -1401,30 +1407,14 @@ retry_writepage:
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
 	unlock_page(page);
-
 	/*
-	 * Here, we don't need to zero the out of bounds area everytime because
-	 * the below memcpy() already takes care of the mmap-at-end-of-file
-	 * requirements. If the file is converted to a non-resident one, then
-	 * the code path use is switched to the non-resident one where the
-	 * zeroing happens on each ntfs_writepage() invocation.
-	 *
-	 * The above also applies nicely when i_size is decreased.
-	 *
-	 * When i_size is increased, the memory between the old and new i_size
-	 * _must_ be zeroed (or overwritten with new data). Otherwise we will
-	 * expose data to userspace/disk which should never have been exposed.
-	 *
-	 * FIXME: Ensure that i_size increases do the zeroing/overwriting and
-	 * if we cannot guarantee that, then enable the zeroing below.  If the
-	 * zeroing below is enabled, we MUST move the unlock_page() from above
-	 * to after the kunmap_atomic(), i.e. just before the
-	 * end_page_writeback().
-	 * UPDATE: ntfs_prepare/commit_write() do the zeroing on i_size
-	 * increases for resident attributes so those are ok.
-	 * TODO: ntfs_truncate(), others?
+	 * Here, we do not need to zero the out of bounds area everytime
+	 * because the below memcpy() already takes care of the
+	 * mmap-at-end-of-file requirements.  If the file is converted to a
+	 * non-resident one, then the code path use is switched to the
+	 * non-resident one where the zeroing happens on each ntfs_writepage()
+	 * invocation.
 	 */
-
 	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
 	i_size = i_size_read(vi);
 	if (unlikely(attr_len > i_size)) {
-- 
cgit v1.2.3-70-g09d2


From 54b02eb01c0172294e43e2b54d6815f65637c111 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 21:43:47 +0100
Subject: NTFS: Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the
 page       lock protection over the buffer submission for i/o which allows
 the       removal of the get_bh()/put_bh() pairs for each buffer.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  3 +++
 fs/ntfs/aops.c    | 13 +++----------
 2 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 4b8666d7e7f..10fc2385efa 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -81,6 +81,9 @@ ToDo/Notes:
 	  only zeroes.
 	- Fixup handling of sparse, compressed, and encrypted attributes in
 	  fs/ntfs/aops.c::ntfs_writepage().
+	- Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
+	  lock protection over the buffer submission for i/o which allows the
+	  removal of the get_bh()/put_bh() pairs for each buffer.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 6f2954aac5a..821dad7d14c 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -735,7 +735,7 @@ lock_retry_remap:
 	/* For the error case, need to reset bh to the beginning. */
 	bh = head;
 
-	/* Just an optimization, so ->readpage() isn't called later. */
+	/* Just an optimization, so ->readpage() is not called later. */
 	if (unlikely(!PageUptodate(page))) {
 		int uptodate = 1;
 		do {
@@ -751,7 +751,6 @@ lock_retry_remap:
 
 	/* Setup all mapped, dirty buffers for async write i/o. */
 	do {
-		get_bh(bh);
 		if (buffer_mapped(bh) && buffer_dirty(bh)) {
 			lock_buffer(bh);
 			if (test_clear_buffer_dirty(bh)) {
@@ -789,14 +788,8 @@ lock_retry_remap:
 
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);	/* Keeps try_to_free_buffers() away. */
-	unlock_page(page);
 
-	/*
-	 * Submit the prepared buffers for i/o. Note the page is unlocked,
-	 * and the async write i/o completion handler can end_page_writeback()
-	 * at any time after the *first* submit_bh(). So the buffers can then
-	 * disappear...
-	 */
+	/* Submit the prepared buffers for i/o. */
 	need_end_writeback = TRUE;
 	do {
 		struct buffer_head *next = bh->b_this_page;
@@ -804,9 +797,9 @@ lock_retry_remap:
 			submit_bh(WRITE, bh);
 			need_end_writeback = FALSE;
 		}
-		put_bh(bh);
 		bh = next;
 	} while (bh != head);
+	unlock_page(page);
 
 	/* If no i/o was started, need to end_page_writeback(). */
 	if (unlikely(need_end_writeback))
-- 
cgit v1.2.3-70-g09d2


From 8273d5d4c28a9fde68f830cc6ff61e37e8ae1dca Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 22:00:33 +0100
Subject: NTFS: Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the
 case       where a concurrent truncate has truncated the runlist under our
 feet.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  2 ++
 fs/ntfs/aops.c    | 51 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 42 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 10fc2385efa..7f24cb19c69 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -84,6 +84,8 @@ ToDo/Notes:
 	- Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
 	  lock protection over the buffer submission for i/o which allows the
 	  removal of the get_bh()/put_bh() pairs for each buffer.
+	- Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
+	  where a concurrent truncate has truncated the runlist under our feet.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 821dad7d14c..59389a8801b 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -204,6 +204,7 @@ static int ntfs_read_block(struct page *page)
 	nr = i = 0;
 	do {
 		u8 *kaddr;
+		int err;
 
 		if (unlikely(buffer_uptodate(bh)))
 			continue;
@@ -211,6 +212,7 @@ static int ntfs_read_block(struct page *page)
 			arr[nr++] = bh;
 			continue;
 		}
+		err = 0;
 		bh->b_bdev = vol->sb->s_bdev;
 		/* Is the block within the allowed limits? */
 		if (iblock < lblock) {
@@ -252,7 +254,6 @@ lock_retry_remap:
 				goto handle_hole;
 			/* If first try and runlist unmapped, map and retry. */
 			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-				int err;
 				is_retry = TRUE;
 				/*
 				 * Attempt to map runlist, dropping lock for
@@ -263,20 +264,30 @@ lock_retry_remap:
 				if (likely(!err))
 					goto lock_retry_remap;
 				rl = NULL;
-				lcn = err;
 			} else if (!rl)
 				up_read(&ni->runlist.lock);
+			/*
+			 * If buffer is outside the runlist, treat it as a
+			 * hole.  This can happen due to concurrent truncate
+			 * for example.
+			 */
+			if (err == -ENOENT || lcn == LCN_ENOENT) {
+				err = 0;
+				goto handle_hole;
+			}
 			/* Hard error, zero out region. */
+			if (!err)
+				err = -EIO;
 			bh->b_blocknr = -1;
 			SetPageError(page);
 			ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
 					"attribute type 0x%x, vcn 0x%llx, "
 					"offset 0x%x because its location on "
 					"disk could not be determined%s "
-					"(error code %lli).", ni->mft_no,
+					"(error code %i).", ni->mft_no,
 					ni->type, (unsigned long long)vcn,
 					vcn_ofs, is_retry ? " even after "
-					"retrying" : "", (long long)lcn);
+					"retrying" : "", err);
 		}
 		/*
 		 * Either iblock was outside lblock limits or
@@ -289,9 +300,10 @@ handle_hole:
 handle_zblock:
 		kaddr = kmap_atomic(page, KM_USER0);
 		memset(kaddr + i * blocksize, 0, blocksize);
-		flush_dcache_page(page);
 		kunmap_atomic(kaddr, KM_USER0);
-		set_buffer_uptodate(bh);
+		flush_dcache_page(page);
+		if (likely(!err))
+			set_buffer_uptodate(bh);
 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
 
 	/* Release the lock if we took it. */
@@ -711,20 +723,37 @@ lock_retry_remap:
 			if (likely(!err))
 				goto lock_retry_remap;
 			rl = NULL;
-			lcn = err;
 		} else if (!rl)
 			up_read(&ni->runlist.lock);
+		/*
+		 * If buffer is outside the runlist, truncate has cut it out
+		 * of the runlist.  Just clean and clear the buffer and set it
+		 * uptodate so it can get discarded by the VM.
+		 */
+		if (err == -ENOENT || lcn == LCN_ENOENT) {
+			u8 *kaddr;
+
+			bh->b_blocknr = -1;
+			clear_buffer_dirty(bh);
+			kaddr = kmap_atomic(page, KM_USER0);
+			memset(kaddr + bh_offset(bh), 0, blocksize);
+			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(page);
+			set_buffer_uptodate(bh);
+			err = 0;
+			continue;
+		}
 		/* Failed to map the buffer, even after retrying. */
+		if (!err)
+			err = -EIO;
 		bh->b_blocknr = -1;
 		ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
 				"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
 				"because its location on disk could not be "
-				"determined%s (error code %lli).", ni->mft_no,
+				"determined%s (error code %i).", ni->mft_no,
 				ni->type, (unsigned long long)vcn,
 				vcn_ofs, is_retry ? " even after "
-				"retrying" : "", (long long)lcn);
-		if (!err)
-			err = -EIO;
+				"retrying" : "", err);
 		break;
 	} while (block++, (bh = bh->b_this_page) != head);
 
-- 
cgit v1.2.3-70-g09d2


From 311120eca0013083f5eb0aff13ffb8aa9fdd050c Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 22:04:20 +0100
Subject: NTFS: Fixup handling of sparse, compressed, and encrypted attributes
 in       fs/ntfs/aops.c::ntfs_readpage().

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  5 ++---
 fs/ntfs/aops.c    | 41 ++++++++++++++++++++++++-----------------
 2 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 7f24cb19c69..c3b51021074 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -76,11 +76,10 @@ ToDo/Notes:
 	- Truncate {a,c,m}time to the ntfs supported time granularity when
 	  updating the times in the inode in ntfs_setattr().
 	- Fixup handling of sparse, compressed, and encrypted attributes in
-	  fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode().
+	  fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
+	  fs/ntfs/aops.c::ntfs_{read,write}page().
 	- Make ntfs_write_block() not instantiate sparse blocks if they contain
 	  only zeroes.
-	- Fixup handling of sparse, compressed, and encrypted attributes in
-	  fs/ntfs/aops.c::ntfs_writepage().
 	- Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
 	  lock protection over the buffer submission for i/o which allows the
 	  removal of the get_bh()/put_bh() pairs for each buffer.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 59389a8801b..2482b677a82 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -379,31 +379,38 @@ retry_readpage:
 		return 0;
 	}
 	ni = NTFS_I(page->mapping->host);
-
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.
+	 */
+	if (ni->type != AT_INDEX_ROOT) {
+		/* If attribute is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			err = -EACCES;
+			goto err_out;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			return ntfs_read_compressed_block(page);
+		}
+	}
 	/* NInoNonResident() == NInoIndexAllocPresent() */
 	if (NInoNonResident(ni)) {
-		/*
-		 * Only unnamed $DATA attributes can be compressed or
-		 * encrypted.
-		 */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* If file is encrypted, deny access, just like NT4. */
-			if (NInoEncrypted(ni)) {
-				err = -EACCES;
-				goto err_out;
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni))
-				return ntfs_read_compressed_block(page);
-		}
-		/* Normal data stream. */
+		/* Normal, non-resident data stream. */
 		return ntfs_read_block(page);
 	}
 	/*
 	 * Attribute is resident, implying it is not compressed or encrypted.
 	 * This also means the attribute is smaller than an mft record and
 	 * hence smaller than a page, so can simply zero out any pages with
-	 * index above 0.
+	 * index above 0.  Note the attribute can actually be marked compressed
+	 * but if it is resident the actual data is not compressed so we are
+	 * ok to ignore the compressed flag here.
 	 */
 	if (unlikely(page->index > 0)) {
 		kaddr = kmap_atomic(page, KM_USER0);
-- 
cgit v1.2.3-70-g09d2


From a01ac532b519dc0e0b4d8bc4e12373e4e4cd1b1a Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 22:08:11 +0100
Subject: NTFS: Fix page_has_buffers()/page_buffers() handling in
 fs/ntfs/aops.c.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  1 +
 fs/ntfs/aops.c    | 38 +++++++++++++++++++++-----------------
 2 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index c3b51021074..029f856c56c 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -85,6 +85,7 @@ ToDo/Notes:
 	  removal of the get_bh()/put_bh() pairs for each buffer.
 	- Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
 	  where a concurrent truncate has truncated the runlist under our feet.
+	- Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 2482b677a82..52e1aff98b0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -185,13 +185,15 @@ static int ntfs_read_block(struct page *page)
 	blocksize_bits = VFS_I(ni)->i_blkbits;
 	blocksize = 1 << blocksize_bits;
 
-	if (!page_has_buffers(page))
+	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, blocksize, 0);
-	bh = head = page_buffers(page);
-	if (unlikely(!bh)) {
-		unlock_page(page);
-		return -ENOMEM;
+		if (unlikely(!page_has_buffers(page))) {
+			unlock_page(page);
+			return -ENOMEM;
+		}
 	}
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
 
 	iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
 	read_lock_irqsave(&ni->size_lock, flags);
@@ -530,19 +532,21 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 		BUG_ON(!PageUptodate(page));
 		create_empty_buffers(page, blocksize,
 				(1 << BH_Uptodate) | (1 << BH_Dirty));
+		if (unlikely(!page_has_buffers(page))) {
+			ntfs_warning(vol->sb, "Error allocating page "
+					"buffers.  Redirtying page so we try "
+					"again later.");
+			/*
+			 * Put the page back on mapping->dirty_pages, but leave
+			 * its buffers' dirty state as-is.
+			 */
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
 	}
 	bh = head = page_buffers(page);
-	if (unlikely(!bh)) {
-		ntfs_warning(vol->sb, "Error allocating page buffers. "
-				"Redirtying page so we try again later.");
-		/*
-		 * Put the page back on mapping->dirty_pages, but leave its
-		 * buffer's dirty state as-is.
-		 */
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return 0;
-	}
+	BUG_ON(!bh);
 
 	/* NOTE: Different naming scheme to ntfs_read_block()! */
 
@@ -910,7 +914,6 @@ static int ntfs_write_mst_block(struct page *page,
 	sync = (wbc->sync_mode == WB_SYNC_ALL);
 
 	/* Make sure we have mapped buffers. */
-	BUG_ON(!page_has_buffers(page));
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
 
@@ -2397,6 +2400,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
 			buffers_to_free = bh;
 	}
 	bh = head = page_buffers(page);
+	BUG_ON(!bh);
 	do {
 		bh_ofs = bh_offset(bh);
 		if (bh_ofs + bh_size <= ofs)
-- 
cgit v1.2.3-70-g09d2


From e604635c8bea16f6177e6133eb3efbfb4a029ef6 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 22:13:02 +0100
Subject: NTFS: Improve scalability by changing the driver global spin lock in 
      fs/ntfs/aops.c::ntfs_end_buffer_async_read() to a bit spin lock       in
 the first buffer head of a page.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  3 +++
 fs/ntfs/aops.c    | 15 +++++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 029f856c56c..32a4150ebcb 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -86,6 +86,9 @@ ToDo/Notes:
 	- Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
 	  where a concurrent truncate has truncated the runlist under our feet.
 	- Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
+	- In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
+	  in the first buffer head instead of a driver global spin lock to
+	  improve scalability.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 52e1aff98b0..950b686f02d 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -55,9 +55,8 @@
  */
 static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 {
-	static DEFINE_SPINLOCK(page_uptodate_lock);
 	unsigned long flags;
-	struct buffer_head *tmp;
+	struct buffer_head *first, *tmp;
 	struct page *page;
 	ntfs_inode *ni;
 	int page_uptodate = 1;
@@ -89,11 +88,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 	} else {
 		clear_buffer_uptodate(bh);
+		SetPageError(page);
 		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
 				(unsigned long long)bh->b_blocknr);
-		SetPageError(page);
 	}
-	spin_lock_irqsave(&page_uptodate_lock, flags);
+	first = page_buffers(page);
+	local_irq_save(flags);
+	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -108,7 +109,8 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	/*
 	 * If none of the buffers had errors then we can set the page uptodate,
 	 * but we first have to perform the post read mst fixups, if the
@@ -141,7 +143,8 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	unlock_page(page);
 	return;
 still_busy:
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	return;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 7d333d6c739a5cd6d60102ea1a9940cbbb0546ec Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 8 Sep 2005 23:01:16 +0100
Subject: NTFS: 2.1.24 release and some minor final fixes.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 Documentation/filesystems/ntfs.txt | 12 ++++++++++++
 fs/ntfs/ChangeLog                  |  4 +++-
 fs/ntfs/Makefile                   |  2 +-
 fs/ntfs/aops.c                     | 10 ++++------
 fs/ntfs/super.c                    |  2 +-
 5 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index eef4aca0c75..a5fbc8e897f 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -439,6 +439,18 @@ ChangeLog
 
 Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
 
+2.1.24:
+	- Support journals ($LogFile) which have been modified by chkdsk.  This
+	  means users can boot into Windows after we marked the volume dirty.
+	  The Windows boot will run chkdsk and then reboot.  The user can then
+	  immediately boot into Linux rather than having to do a full Windows
+	  boot first before rebooting into Linux and we will recognize such a
+	  journal and empty it as it is clean by definition.
+	- Support journals ($LogFile) with only one restart page as well as
+	  journals with two different restart pages.  We sanity check both and
+	  either use the only sane one or the more recent one of the two in the
+	  case that both are valid.
+	- Lots of bug fixes and enhancements across the board.
 2.1.23:
 	- Stamp the user space journal, aka transaction log, aka $UsnJrnl, if
 	  it is present and active thus telling Windows and applications using
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 32a4150ebcb..e4fd6134244 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -22,7 +22,7 @@ ToDo/Notes:
 	- Enable the code for setting the NT4 compatibility flag when we start
 	  making NTFS 1.2 specific modifications.
 
-2.1.24-WIP
+2.1.24 - Lots of bug fixes and support more clean journal states.
 
 	- Support journals ($LogFile) which have been modified by chkdsk.  This
 	  means users can boot into Windows after we marked the volume dirty.
@@ -89,6 +89,8 @@ ToDo/Notes:
 	- In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
 	  in the first buffer head instead of a driver global spin lock to
 	  improve scalability.
+	- Minor fix to error handling and error message display in
+	  fs/ntfs/aops.c::ntfs_prepare_nonresident_write(). 
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index ce970dacf90..894b2b876d3 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
 	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
 	     unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.24-WIP\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.24\"
 
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 950b686f02d..545236414d5 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1727,27 +1727,25 @@ lock_retry_remap:
 					if (likely(!err))
 						goto lock_retry_remap;
 					rl = NULL;
-					lcn = err;
 				} else if (!rl)
 					up_read(&ni->runlist.lock);
 				/*
 				 * Failed to map the buffer, even after
 				 * retrying.
 				 */
+				if (!err)
+					err = -EIO;
 				bh->b_blocknr = -1;
 				ntfs_error(vol->sb, "Failed to write to inode "
 						"0x%lx, attribute type 0x%x, "
 						"vcn 0x%llx, offset 0x%x "
 						"because its location on disk "
 						"could not be determined%s "
-						"(error code %lli).",
+						"(error code %i).",
 						ni->mft_no, ni->type,
 						(unsigned long long)vcn,
 						vcn_ofs, is_retry ? " even "
-						"after retrying" : "",
-						(long long)lcn);
-				if (!err)
-					err = -EIO;
+						"after retrying" : "", err);
 				goto err_out;
 			}
 			/* We now have a successful remap, i.e. lcn >= 0. */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index bf8569d503a..b2b39296126 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1688,9 +1688,9 @@ static BOOL load_system_files(ntfs_volume *vol)
 	struct super_block *sb = vol->sb;
 	MFT_RECORD *m;
 	VOLUME_INFORMATION *vi;
-	RESTART_PAGE_HEADER *rp;
 	ntfs_attr_search_ctx *ctx;
 #ifdef NTFS_RW
+	RESTART_PAGE_HEADER *rp;
 	int err;
 #endif /* NTFS_RW */
 
-- 
cgit v1.2.3-70-g09d2


From c9fc0d6a6901e7f06c6bbd0061e503d96464266d Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@bruce>
Date: Fri, 9 Sep 2005 11:38:09 +1000
Subject: [XFS] Revert recent quota Makefile change, not in a fit state for
 merging.

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/Makefile-linux-2.6 | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index 8e18ff15724..d8c87fa21ad 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 #
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of version 2 of the GNU General Public License as
@@ -55,7 +55,18 @@ ifeq ($(CONFIG_XFS_TRACE),y)
 endif
 
 obj-$(CONFIG_XFS_FS)		+= xfs.o
-xfs-$(CONFIG_XFS_QUOTA)		+= quota/
+
+xfs-$(CONFIG_XFS_QUOTA)		+= $(addprefix quota/, \
+				   xfs_dquot.o \
+				   xfs_dquot_item.o \
+				   xfs_trans_dquot.o \
+				   xfs_qm_syscalls.o \
+				   xfs_qm_bhv.o \
+				   xfs_qm.o)
+
+ifeq ($(CONFIG_XFS_QUOTA),y)
+xfs-$(CONFIG_PROC_FS)		+= quota/xfs_qm_stats.o
+endif
 
 xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
-- 
cgit v1.2.3-70-g09d2


From 3f70353ea91ad77c83500e70507a239b2ab0c980 Mon Sep 17 00:00:00 2001
From: "viro@ZenIV.linux.org.uk" <viro@ZenIV.linux.org.uk>
Date: Fri, 9 Sep 2005 16:53:56 +0100
Subject: [PATCH] bogus cast in bio.c

<qualifier> void * is not the same as void <qualifier> *...
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index a7d4fd3a329..83a34957456 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -683,7 +683,7 @@ struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
 {
 	struct sg_iovec iov;
 
-	iov.iov_base = (__user void *)uaddr;
+	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
 	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
-- 
cgit v1.2.3-70-g09d2


From e85b565233236a2a833adea73fb2f0e0f8fa2a61 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 9 Sep 2005 13:01:29 -0700
Subject: [PATCH] move truncate_inode_pages() into ->delete_inode()

Allow file systems supporting ->delete_inode() to call
truncate_inode_pages() on their own.  OCFS2 wants this so it can query the
cluster before making a final decision on whether to wipe an inode from
disk or not.  In some corner cases an inode marked on the local node via
voting may not actually get orphaned.  A good example is node death before
the transaction moving the inode to the orphan dir commits to the journal.
Without this patch, the truncate_inode_pages() call in
generic_delete_inode() would discard valid data for such inodes.

During earlier discussion in the 2.6.13 merge plan thread, Christoph
Hellwig indicated that other file systems might also find this useful.

IMHO, the best solution would be to just allow ->drop_inode() to do the
cluster query but it seems that would require a substantial reworking of
that section of the code.  Assuming it is safe to call write_inode_now() in
ocfs2_delete_inode() for those inodes which won't actually get wiped, this
solution should get us by for now.

Trivial testing of this patch (and a related OCFS2 update) has shown this
to avoid the corruption I'm seeing.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 71df1b1e8f7..f80a79ff156 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1034,19 +1034,21 @@ void generic_delete_inode(struct inode *inode)
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
 
-	if (inode->i_data.nrpages)
-		truncate_inode_pages(&inode->i_data, 0);
-
 	security_inode_delete(inode);
 
 	if (op->delete_inode) {
 		void (*delete)(struct inode *) = op->delete_inode;
 		if (!is_bad_inode(inode))
 			DQUOT_INIT(inode);
-		/* s_op->delete_inode internally recalls clear_inode() */
+		/* Filesystems implementing their own
+		 * s_op->delete_inode are required to call
+		 * truncate_inode_pages and clear_inode()
+		 * internally */
 		delete(inode);
-	} else
+	} else {
+		truncate_inode_pages(&inode->i_data, 0);
 		clear_inode(inode);
+	}
 	spin_lock(&inode_lock);
 	hlist_del_init(&inode->i_hash);
 	spin_unlock(&inode_lock);
-- 
cgit v1.2.3-70-g09d2


From fef266580e5cf897a1b63528fc6b1185e2d6bb87 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 9 Sep 2005 13:01:31 -0700
Subject: [PATCH] update filesystems for new delete_inode behavior

Update the file systems in fs/ implementing a delete_inode() callback to
call truncate_inode_pages().  One implementation note: In developing this
patch I put the calls to truncate_inode_pages() at the very top of those
filesystems delete_inode() callbacks in order to retain the previous
behavior.  I'm guessing that some of those could probably be optimized.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/affs/inode.c         | 1 +
 fs/bfs/inode.c          | 2 ++
 fs/ext2/inode.c         | 2 ++
 fs/ext3/inode.c         | 2 ++
 fs/fat/inode.c          | 2 ++
 fs/hostfs/hostfs_kern.c | 1 +
 fs/hpfs/inode.c         | 1 +
 fs/jffs/inode-v23.c     | 1 +
 fs/jfs/inode.c          | 2 ++
 fs/minix/inode.c        | 1 +
 fs/ncpfs/inode.c        | 2 ++
 fs/nfs/inode.c          | 2 ++
 fs/proc/inode.c         | 2 ++
 fs/qnx4/inode.c         | 1 +
 fs/reiserfs/inode.c     | 2 ++
 fs/smbfs/inode.c        | 1 +
 fs/sysv/inode.c         | 1 +
 fs/udf/inode.c          | 2 ++
 fs/ufs/inode.c          | 1 +
 mm/shmem.c              | 1 +
 20 files changed, 30 insertions(+)

(limited to 'fs')

diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 7aa6f200453..9ebe881c678 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -255,6 +255,7 @@ void
 affs_delete_inode(struct inode *inode)
 {
 	pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
+	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	if (S_ISREG(inode->i_mode))
 		affs_truncate(inode);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 64e0fb33fc0..628c2c1a7d7 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -143,6 +143,8 @@ static void bfs_delete_inode(struct inode * inode)
 
 	dprintf("ino=%08lx\n", inode->i_ino);
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (inode->i_ino < BFS_ROOT_INO || inode->i_ino > info->si_lasti) {
 		printf("invalid ino=%08lx\n", inode->i_ino);
 		return;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 53dceb0c659..fdba4d1d3c6 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -71,6 +71,8 @@ void ext2_put_inode(struct inode *inode)
  */
 void ext2_delete_inode (struct inode * inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (is_bad_inode(inode))
 		goto no_delete;
 	EXT2_I(inode)->i_dtime	= get_seconds();
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 9989fdcf4d5..b5177c90d6f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -187,6 +187,8 @@ void ext3_delete_inode (struct inode * inode)
 {
 	handle_t *handle;
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (is_bad_inode(inode))
 		goto no_delete;
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 96ae85b67eb..a7cbe68e225 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -335,6 +335,8 @@ EXPORT_SYMBOL(fat_build_inode);
 
 static void fat_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (!is_bad_inode(inode)) {
 		inode->i_size = 0;
 		fat_truncate(inode);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b2d18200a00..59c5062cd63 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -284,6 +284,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 
 static void hostfs_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	if(HOSTFS_I(inode)->fd != -1) {
 		close_file(&HOSTFS_I(inode)->fd);
 		HOSTFS_I(inode)->fd = -1;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 38b1741fa53..e3d17e9ea6c 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -284,6 +284,7 @@ void hpfs_write_if_changed(struct inode *inode)
 
 void hpfs_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	lock_kernel();
 	hpfs_remove_fnode(inode->i_sb, inode->i_ino);
 	unlock_kernel();
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 777b90057b8..3dcc6d2162c 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1744,6 +1744,7 @@ jffs_delete_inode(struct inode *inode)
 	D3(printk("jffs_delete_inode(): inode->i_ino == %lu\n",
 		  inode->i_ino));
 
+	truncate_inode_pages(&inode->i_data, 0);
 	lock_kernel();
 	inode->i_size = 0;
 	inode->i_blocks = 0;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 767c7ecb429..cff352f4ec1 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -132,6 +132,8 @@ void jfs_delete_inode(struct inode *inode)
 	    (JFS_IP(inode)->fileset != cpu_to_le32(FILESYSTEM_I)))
 			return;
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (test_cflag(COMMIT_Freewmap, inode))
 		jfs_free_zero_link(inode);
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 3f18c21198d..790cc0d0e97 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -24,6 +24,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);
 
 static void minix_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	minix_truncate(inode);
 	minix_free_inode(inode);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 44795d2f4b3..8c8839203cd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -286,6 +286,8 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 static void
 ncp_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (S_ISDIR(inode->i_mode)) {
 		DDPRINTK("ncp_delete_inode: put directory %ld\n", inode->i_ino);
 	}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 541b418327c..6922469d6fc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -146,6 +146,8 @@ nfs_delete_inode(struct inode * inode)
 {
 	dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	nfs_wb_all(inode);
 	/*
 	 * The following should never happen...
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 133c2868510..effa6c0c467 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -60,6 +60,8 @@ static void proc_delete_inode(struct inode *inode)
 	struct proc_dir_entry *de;
 	struct task_struct *tsk;
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	/* Let go of any associated process */
 	tsk = PROC_I(inode)->task;
 	if (tsk)
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index b79162a3547..80f32911c0c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -63,6 +63,7 @@ int qnx4_sync_inode(struct inode *inode)
 static void qnx4_delete_inode(struct inode *inode)
 {
 	QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
+	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	qnx4_truncate(inode);
 	lock_kernel();
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ff291c973a5..1a8a1bf2154 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -33,6 +33,8 @@ void reiserfs_delete_inode(struct inode *inode)
 	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
 	struct reiserfs_transaction_handle th;
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	reiserfs_write_lock(inode->i_sb);
 
 	/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 4765aaac9fd..10b994428fe 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -331,6 +331,7 @@ static void
 smb_delete_inode(struct inode *ino)
 {
 	DEBUG1("ino=%ld\n", ino->i_ino);
+	truncate_inode_pages(&ino->i_data, 0);
 	lock_kernel();
 	if (smb_close(ino))
 		PARANOIA("could not close inode %ld\n", ino->i_ino);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0530077d9dd..fa33eceb001 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -292,6 +292,7 @@ int sysv_sync_inode(struct inode * inode)
 
 static void sysv_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	sysv_truncate(inode);
 	lock_kernel();
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 3d68de39fad..b83890beaaa 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -87,6 +87,8 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
  */
 void udf_delete_inode(struct inode * inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (is_bad_inode(inode))
 		goto no_delete;
 
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 718627ca8b5..55f4aa16e3f 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -804,6 +804,7 @@ int ufs_sync_inode (struct inode *inode)
 
 void ufs_delete_inode (struct inode * inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
 	lock_kernel();
 	mark_inode_dirty(inode);
diff --git a/mm/shmem.c b/mm/shmem.c
index db2c9e8d990..0d627a37da9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -666,6 +666,7 @@ static void shmem_delete_inode(struct inode *inode)
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (inode->i_op->truncate == shmem_truncate) {
+		truncate_inode_pages(inode->i_mapping, 0);
 		shmem_unacct_size(info->flags, inode->i_size);
 		inode->i_size = 0;
 		shmem_truncate(inode);
-- 
cgit v1.2.3-70-g09d2


From 10f47e6a1b8b276323b652053945c87a63a5812d Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Fri, 9 Sep 2005 13:01:39 -0700
Subject: [PATCH] ext2: Enable atomic inode security labeling

This patch modifies ext2 to call the inode_init_security LSM hook to obtain
the security attribute for a newly created inode and to set the resulting
attribute on the new inode.  This parallels the existing processing for
setting ACLs on newly created inodes.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/ialloc.c         |  5 +++++
 fs/ext2/xattr.h          |  8 ++++++++
 fs/ext2/xattr_security.c | 22 ++++++++++++++++++++++
 3 files changed, 35 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 161f156d98c..c8d07030c89 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -615,6 +615,11 @@ got:
 		DQUOT_DROP(inode);
 		goto fail2;
 	}
+	err = ext2_init_security(inode,dir);
+	if (err) {
+		DQUOT_FREE_INODE(inode);
+		goto fail2;
+	}
 	mark_inode_dirty(inode);
 	ext2_debug("allocating inode %lu\n", inode->i_ino);
 	ext2_preread_inode(inode);
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 5f3bfde3b81..67cfeb66e89 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,3 +116,11 @@ exit_ext2_xattr(void)
 
 # endif  /* CONFIG_EXT2_FS_XATTR */
 
+#ifdef CONFIG_EXT2_FS_SECURITY
+extern int ext2_init_security(struct inode *inode, struct inode *dir);
+#else
+static inline int ext2_init_security(struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 6a6c59fbe59..a2661279847 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/smp_lock.h>
 #include <linux/ext2_fs.h>
+#include <linux/security.h>
 #include "xattr.h"
 
 static size_t
@@ -45,6 +46,27 @@ ext2_xattr_security_set(struct inode *inode, const char *name,
 			      value, size, flags);
 }
 
+int
+ext2_init_security(struct inode *inode, struct inode *dir)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *name;
+
+	err = security_inode_init_security(inode, dir, &name, &value, &len);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+	err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
+			     name, value, len, 0);
+	kfree(name);
+	kfree(value);
+	return err;
+}
+
 struct xattr_handler ext2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext2_xattr_security_list,
-- 
cgit v1.2.3-70-g09d2


From ac50960afa31877493add6d941d8402fa879c452 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Fri, 9 Sep 2005 13:01:41 -0700
Subject: [PATCH] ext3: Enable atomic inode security labeling

This patch modifies ext3 to call the inode_init_security LSM hook to obtain
the security attribute for a newly created inode and to set the resulting
attribute on the new inode as part of the same transaction.  This parallels
the existing processing for setting ACLs on newly created inodes.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/ialloc.c         |  5 +++++
 fs/ext3/xattr.h          | 11 +++++++++++
 fs/ext3/xattr_security.c | 22 ++++++++++++++++++++++
 3 files changed, 38 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 6981bd014ed..96552769d03 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -607,6 +607,11 @@ got:
 		DQUOT_DROP(inode);
 		goto fail2;
   	}
+	err = ext3_init_security(handle,inode, dir);
+	if (err) {
+		DQUOT_FREE_INODE(inode);
+		goto fail2;
+	}
 	err = ext3_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext3_std_error(sb, err);
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index eb31a69e82d..2ceae38f3d4 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -133,3 +133,14 @@ exit_ext3_xattr(void)
 #define ext3_xattr_handlers	NULL
 
 # endif  /* CONFIG_EXT3_FS_XATTR */
+
+#ifdef CONFIG_EXT3_FS_SECURITY
+extern int ext3_init_security(handle_t *handle, struct inode *inode,
+				struct inode *dir);
+#else
+static inline int ext3_init_security(handle_t *handle, struct inode *inode,
+				struct inode *dir)
+{
+	return 0;
+}
+#endif
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index ddc1c41750e..b9c40c15647 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -9,6 +9,7 @@
 #include <linux/smp_lock.h>
 #include <linux/ext3_jbd.h>
 #include <linux/ext3_fs.h>
+#include <linux/security.h>
 #include "xattr.h"
 
 static size_t
@@ -47,6 +48,27 @@ ext3_xattr_security_set(struct inode *inode, const char *name,
 			      value, size, flags);
 }
 
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *name;
+
+	err = security_inode_init_security(inode, dir, &name, &value, &len);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+	err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
+				    name, value, len, 0);
+	kfree(name);
+	kfree(value);
+	return err;
+}
+
 struct xattr_handler ext3_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext3_xattr_security_list,
-- 
cgit v1.2.3-70-g09d2


From a74574aafea3a63add3251047601611111f44562 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Fri, 9 Sep 2005 13:01:44 -0700
Subject: [PATCH] Remove security_inode_post_create/mkdir/symlink/mknod hooks

This patch removes the inode_post_create/mkdir/mknod/symlink LSM hooks as
they are obsoleted by the new inode_init_security hook that enables atomic
inode security labeling.

If anyone sees any reason to retain these hooks, please speak now.  Also,
is anyone using the post_rename/link hooks; if not, those could also be
removed.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c                        |  16 ++----
 include/linux/security.h          |  90 -------------------------------
 security/dummy.c                  |  28 ----------
 security/selinux/hooks.c          | 111 --------------------------------------
 security/selinux/include/objsec.h |   1 -
 5 files changed, 4 insertions(+), 242 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 145e852c4bd..993a65a7d57 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1316,10 +1316,8 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		return error;
 	DQUOT_INIT(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
-	if (!error) {
+	if (!error)
 		fsnotify_create(dir, dentry->d_name.name);
-		security_inode_post_create(dir, dentry, mode);
-	}
 	return error;
 }
 
@@ -1635,10 +1633,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 
 	DQUOT_INIT(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
-	if (!error) {
+	if (!error)
 		fsnotify_create(dir, dentry->d_name.name);
-		security_inode_post_mknod(dir, dentry, mode, dev);
-	}
 	return error;
 }
 
@@ -1708,10 +1704,8 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	DQUOT_INIT(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
-	if (!error) {
+	if (!error)
 		fsnotify_mkdir(dir, dentry->d_name.name);
-		security_inode_post_mkdir(dir,dentry, mode);
-	}
 	return error;
 }
 
@@ -1947,10 +1941,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
 
 	DQUOT_INIT(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
-	if (!error) {
+	if (!error)
 		fsnotify_create(dir, dentry->d_name.name);
-		security_inode_post_symlink(dir, dentry, oldname);
-	}
 	return error;
 }
 
diff --git a/include/linux/security.h b/include/linux/security.h
index d4f3b7a94ea..875225bf898 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -275,12 +275,6 @@ struct swap_info_struct;
  *	@dentry contains the dentry structure for the file to be created.
  *	@mode contains the file mode of the file to be created.
  *	Return 0 if permission is granted.
- * @inode_post_create:
- *	Set the security attributes on a newly created regular file.  This hook
- *	is called after a file has been successfully created.
- *	@dir contains the inode structure of the parent directory of the new file.
- *	@dentry contains the the dentry structure for the newly created file.
- *	@mode contains the file mode.
  * @inode_link:
  *	Check permission before creating a new hard link to a file.
  *	@old_dentry contains the dentry structure for an existing link to the file.
@@ -303,13 +297,6 @@ struct swap_info_struct;
  *	@dentry contains the dentry structure of the symbolic link.
  *	@old_name contains the pathname of file.
  *	Return 0 if permission is granted.
- * @inode_post_symlink:
- *	@dir contains the inode structure of the parent directory of the new link.
- *	@dentry contains the dentry structure of new symbolic link.
- *	@old_name contains the pathname of file.
- *	Set security attributes for a newly created symbolic link.  Note that
- *	@dentry->d_inode may be NULL, since the filesystem might not
- *	instantiate the dentry (e.g. NFS).
  * @inode_mkdir:
  *	Check permissions to create a new directory in the existing directory
  *	associated with inode strcture @dir. 
@@ -317,11 +304,6 @@ struct swap_info_struct;
  *	@dentry contains the dentry structure of new directory.
  *	@mode contains the mode of new directory.
  *	Return 0 if permission is granted.
- * @inode_post_mkdir:
- *	Set security attributes on a newly created directory.
- *	@dir contains the inode structure of parent of the directory to be created.
- *	@dentry contains the dentry structure of new directory.
- *	@mode contains the mode of new directory.
  * @inode_rmdir:
  *	Check the permission to remove a directory.
  *	@dir contains the inode structure of parent of the directory to be removed.
@@ -337,13 +319,6 @@ struct swap_info_struct;
  *	@mode contains the mode of the new file.
  *	@dev contains the the device number.
  *	Return 0 if permission is granted.
- * @inode_post_mknod:
- *	Set security attributes on a newly created special file (or socket or
- *	fifo file created via the mknod system call).
- *	@dir contains the inode structure of parent of the new node.
- *	@dentry contains the dentry structure of the new node.
- *	@mode contains the mode of the new node.
- *	@dev contains the the device number.
  * @inode_rename:
  *	Check for permission to rename a file or directory.
  *	@old_dir contains the inode structure for parent of the old link.
@@ -1103,8 +1078,6 @@ struct security_operations {
 				    char **name, void **value, size_t *len);
 	int (*inode_create) (struct inode *dir,
 	                     struct dentry *dentry, int mode);
-	void (*inode_post_create) (struct inode *dir,
-	                           struct dentry *dentry, int mode);
 	int (*inode_link) (struct dentry *old_dentry,
 	                   struct inode *dir, struct dentry *new_dentry);
 	void (*inode_post_link) (struct dentry *old_dentry,
@@ -1112,17 +1085,10 @@ struct security_operations {
 	int (*inode_unlink) (struct inode *dir, struct dentry *dentry);
 	int (*inode_symlink) (struct inode *dir,
 	                      struct dentry *dentry, const char *old_name);
-	void (*inode_post_symlink) (struct inode *dir,
-	                            struct dentry *dentry,
-	                            const char *old_name);
 	int (*inode_mkdir) (struct inode *dir, struct dentry *dentry, int mode);
-	void (*inode_post_mkdir) (struct inode *dir, struct dentry *dentry, 
-			    int mode);
 	int (*inode_rmdir) (struct inode *dir, struct dentry *dentry);
 	int (*inode_mknod) (struct inode *dir, struct dentry *dentry,
 	                    int mode, dev_t dev);
-	void (*inode_post_mknod) (struct inode *dir, struct dentry *dentry,
-	                          int mode, dev_t dev);
 	int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry,
 	                     struct inode *new_dir, struct dentry *new_dentry);
 	void (*inode_post_rename) (struct inode *old_dir,
@@ -1484,15 +1450,6 @@ static inline int security_inode_create (struct inode *dir,
 	return security_ops->inode_create (dir, dentry, mode);
 }
 
-static inline void security_inode_post_create (struct inode *dir,
-					       struct dentry *dentry,
-					       int mode)
-{
-	if (dentry->d_inode && unlikely (IS_PRIVATE (dentry->d_inode)))
-		return;
-	security_ops->inode_post_create (dir, dentry, mode);
-}
-
 static inline int security_inode_link (struct dentry *old_dentry,
 				       struct inode *dir,
 				       struct dentry *new_dentry)
@@ -1528,15 +1485,6 @@ static inline int security_inode_symlink (struct inode *dir,
 	return security_ops->inode_symlink (dir, dentry, old_name);
 }
 
-static inline void security_inode_post_symlink (struct inode *dir,
-						struct dentry *dentry,
-						const char *old_name)
-{
-	if (dentry->d_inode && unlikely (IS_PRIVATE (dentry->d_inode)))
-		return;
-	security_ops->inode_post_symlink (dir, dentry, old_name);
-}
-
 static inline int security_inode_mkdir (struct inode *dir,
 					struct dentry *dentry,
 					int mode)
@@ -1546,15 +1494,6 @@ static inline int security_inode_mkdir (struct inode *dir,
 	return security_ops->inode_mkdir (dir, dentry, mode);
 }
 
-static inline void security_inode_post_mkdir (struct inode *dir,
-					      struct dentry *dentry,
-					      int mode)
-{
-	if (dentry->d_inode && unlikely (IS_PRIVATE (dentry->d_inode)))
-		return;
-	security_ops->inode_post_mkdir (dir, dentry, mode);
-}
-
 static inline int security_inode_rmdir (struct inode *dir,
 					struct dentry *dentry)
 {
@@ -1572,15 +1511,6 @@ static inline int security_inode_mknod (struct inode *dir,
 	return security_ops->inode_mknod (dir, dentry, mode, dev);
 }
 
-static inline void security_inode_post_mknod (struct inode *dir,
-					      struct dentry *dentry,
-					      int mode, dev_t dev)
-{
-	if (dentry->d_inode && unlikely (IS_PRIVATE (dentry->d_inode)))
-		return;
-	security_ops->inode_post_mknod (dir, dentry, mode, dev);
-}
-
 static inline int security_inode_rename (struct inode *old_dir,
 					 struct dentry *old_dentry,
 					 struct inode *new_dir,
@@ -2220,11 +2150,6 @@ static inline int security_inode_create (struct inode *dir,
 	return 0;
 }
 
-static inline void security_inode_post_create (struct inode *dir,
-					       struct dentry *dentry,
-					       int mode)
-{ }
-
 static inline int security_inode_link (struct dentry *old_dentry,
 				       struct inode *dir,
 				       struct dentry *new_dentry)
@@ -2250,11 +2175,6 @@ static inline int security_inode_symlink (struct inode *dir,
 	return 0;
 }
 
-static inline void security_inode_post_symlink (struct inode *dir,
-						struct dentry *dentry,
-						const char *old_name)
-{ }
-
 static inline int security_inode_mkdir (struct inode *dir,
 					struct dentry *dentry,
 					int mode)
@@ -2262,11 +2182,6 @@ static inline int security_inode_mkdir (struct inode *dir,
 	return 0;
 }
 
-static inline void security_inode_post_mkdir (struct inode *dir,
-					      struct dentry *dentry,
-					      int mode)
-{ }
-
 static inline int security_inode_rmdir (struct inode *dir,
 					struct dentry *dentry)
 {
@@ -2280,11 +2195,6 @@ static inline int security_inode_mknod (struct inode *dir,
 	return 0;
 }
 
-static inline void security_inode_post_mknod (struct inode *dir,
-					      struct dentry *dentry,
-					      int mode, dev_t dev)
-{ }
-
 static inline int security_inode_rename (struct inode *old_dir,
 					 struct dentry *old_dentry,
 					 struct inode *new_dir,
diff --git a/security/dummy.c b/security/dummy.c
index e8a00fa8046..5083314e14b 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -270,12 +270,6 @@ static int dummy_inode_create (struct inode *inode, struct dentry *dentry,
 	return 0;
 }
 
-static void dummy_inode_post_create (struct inode *inode, struct dentry *dentry,
-				     int mask)
-{
-	return;
-}
-
 static int dummy_inode_link (struct dentry *old_dentry, struct inode *inode,
 			     struct dentry *new_dentry)
 {
@@ -300,24 +294,12 @@ static int dummy_inode_symlink (struct inode *inode, struct dentry *dentry,
 	return 0;
 }
 
-static void dummy_inode_post_symlink (struct inode *inode,
-				      struct dentry *dentry, const char *name)
-{
-	return;
-}
-
 static int dummy_inode_mkdir (struct inode *inode, struct dentry *dentry,
 			      int mask)
 {
 	return 0;
 }
 
-static void dummy_inode_post_mkdir (struct inode *inode, struct dentry *dentry,
-				    int mask)
-{
-	return;
-}
-
 static int dummy_inode_rmdir (struct inode *inode, struct dentry *dentry)
 {
 	return 0;
@@ -329,12 +311,6 @@ static int dummy_inode_mknod (struct inode *inode, struct dentry *dentry,
 	return 0;
 }
 
-static void dummy_inode_post_mknod (struct inode *inode, struct dentry *dentry,
-				    int mode, dev_t dev)
-{
-	return;
-}
-
 static int dummy_inode_rename (struct inode *old_inode,
 			       struct dentry *old_dentry,
 			       struct inode *new_inode,
@@ -894,17 +870,13 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, inode_free_security);
 	set_to_dummy_if_null(ops, inode_init_security);
 	set_to_dummy_if_null(ops, inode_create);
-	set_to_dummy_if_null(ops, inode_post_create);
 	set_to_dummy_if_null(ops, inode_link);
 	set_to_dummy_if_null(ops, inode_post_link);
 	set_to_dummy_if_null(ops, inode_unlink);
 	set_to_dummy_if_null(ops, inode_symlink);
-	set_to_dummy_if_null(ops, inode_post_symlink);
 	set_to_dummy_if_null(ops, inode_mkdir);
-	set_to_dummy_if_null(ops, inode_post_mkdir);
 	set_to_dummy_if_null(ops, inode_rmdir);
 	set_to_dummy_if_null(ops, inode_mknod);
-	set_to_dummy_if_null(ops, inode_post_mknod);
 	set_to_dummy_if_null(ops, inode_rename);
 	set_to_dummy_if_null(ops, inode_post_rename);
 	set_to_dummy_if_null(ops, inode_readlink);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 265f33d3af9..c9c20828be7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1265,91 +1265,6 @@ static int inode_security_set_sid(struct inode *inode, u32 sid)
 	return 0;
 }
 
-/* Set the security attributes on a newly created file. */
-static int post_create(struct inode *dir,
-		       struct dentry *dentry)
-{
-
-	struct task_security_struct *tsec;
-	struct inode *inode;
-	struct inode_security_struct *dsec;
-	struct superblock_security_struct *sbsec;
-	struct inode_security_struct *isec;
-	u32 newsid;
-	char *context;
-	unsigned int len;
-	int rc;
-
-	tsec = current->security;
-	dsec = dir->i_security;
-	sbsec = dir->i_sb->s_security;
-
-	inode = dentry->d_inode;
-	if (!inode) {
-		/* Some file system types (e.g. NFS) may not instantiate
-		   a dentry for all create operations (e.g. symlink),
-		   so we have to check to see if the inode is non-NULL. */
-		printk(KERN_WARNING "post_create:  no inode, dir (dev=%s, "
-		       "ino=%ld)\n", dir->i_sb->s_id, dir->i_ino);
-		return 0;
-	}
-
-	isec = inode->i_security;
-
-	if (isec->security_attr_init)
-		return 0;
-
-	if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) {
-		newsid = tsec->create_sid;
-	} else {
-		rc = security_transition_sid(tsec->sid, dsec->sid,
-					     inode_mode_to_security_class(inode->i_mode),
-					     &newsid);
-		if (rc) {
-			printk(KERN_WARNING "post_create:  "
-			       "security_transition_sid failed, rc=%d (dev=%s "
-			       "ino=%ld)\n",
-			       -rc, inode->i_sb->s_id, inode->i_ino);
-			return rc;
-		}
-	}
-
-	rc = inode_security_set_sid(inode, newsid);
-	if (rc) {
-		printk(KERN_WARNING "post_create:  inode_security_set_sid "
-		       "failed, rc=%d (dev=%s ino=%ld)\n",
-		       -rc, inode->i_sb->s_id, inode->i_ino);
-		return rc;
-	}
-
-	if (sbsec->behavior == SECURITY_FS_USE_XATTR &&
-	    inode->i_op->setxattr) {
-		/* Use extended attributes. */
-		rc = security_sid_to_context(newsid, &context, &len);
-		if (rc) {
-			printk(KERN_WARNING "post_create:  sid_to_context "
-			       "failed, rc=%d (dev=%s ino=%ld)\n",
-			       -rc, inode->i_sb->s_id, inode->i_ino);
-			return rc;
-		}
-		down(&inode->i_sem);
-		rc = inode->i_op->setxattr(dentry,
-					   XATTR_NAME_SELINUX,
-					   context, len, 0);
-		up(&inode->i_sem);
-		kfree(context);
-		if (rc < 0) {
-			printk(KERN_WARNING "post_create:  setxattr failed, "
-			       "rc=%d (dev=%s ino=%ld)\n",
-			       -rc, inode->i_sb->s_id, inode->i_ino);
-			return rc;
-		}
-	}
-
-	return 0;
-}
-
-
 /* Hook functions begin here. */
 
 static int selinux_ptrace(struct task_struct *parent, struct task_struct *child)
@@ -2076,8 +1991,6 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
 		*len = clen;
 	}
 
-	isec->security_attr_init = 1;
-
 	return 0;
 }
 
@@ -2086,11 +1999,6 @@ static int selinux_inode_create(struct inode *dir, struct dentry *dentry, int ma
 	return may_create(dir, dentry, SECCLASS_FILE);
 }
 
-static void selinux_inode_post_create(struct inode *dir, struct dentry *dentry, int mask)
-{
-	post_create(dir, dentry);
-}
-
 static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
 {
 	int rc;
@@ -2121,21 +2029,11 @@ static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const
 	return may_create(dir, dentry, SECCLASS_LNK_FILE);
 }
 
-static void selinux_inode_post_symlink(struct inode *dir, struct dentry *dentry, const char *name)
-{
-	post_create(dir, dentry);
-}
-
 static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, int mask)
 {
 	return may_create(dir, dentry, SECCLASS_DIR);
 }
 
-static void selinux_inode_post_mkdir(struct inode *dir, struct dentry *dentry, int mask)
-{
-	post_create(dir, dentry);
-}
-
 static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	return may_link(dir, dentry, MAY_RMDIR);
@@ -2152,11 +2050,6 @@ static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, int mod
 	return may_create(dir, dentry, inode_mode_to_security_class(mode));
 }
 
-static void selinux_inode_post_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
-{
-	post_create(dir, dentry);
-}
-
 static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry,
                                 struct inode *new_inode, struct dentry *new_dentry)
 {
@@ -4363,17 +4256,13 @@ static struct security_operations selinux_ops = {
 	.inode_free_security =		selinux_inode_free_security,
 	.inode_init_security =		selinux_inode_init_security,
 	.inode_create =			selinux_inode_create,
-	.inode_post_create =		selinux_inode_post_create,
 	.inode_link =			selinux_inode_link,
 	.inode_post_link =		selinux_inode_post_link,
 	.inode_unlink =			selinux_inode_unlink,
 	.inode_symlink =		selinux_inode_symlink,
-	.inode_post_symlink =		selinux_inode_post_symlink,
 	.inode_mkdir =			selinux_inode_mkdir,
-	.inode_post_mkdir =		selinux_inode_post_mkdir,
 	.inode_rmdir =			selinux_inode_rmdir,
 	.inode_mknod =			selinux_inode_mknod,
-	.inode_post_mknod =		selinux_inode_post_mknod,
 	.inode_rename =			selinux_inode_rename,
 	.inode_post_rename =		selinux_inode_post_rename,
 	.inode_readlink =		selinux_inode_readlink,
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index c515bc0b58a..887937c8134 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -46,7 +46,6 @@ struct inode_security_struct {
 	unsigned char initialized;     /* initialization flag */
 	struct semaphore sem;
 	unsigned char inherit;         /* inherit SID from parent entry */
-	unsigned char security_attr_init; /* security attributes init flag */
 };
 
 struct file_security_struct {
-- 
cgit v1.2.3-70-g09d2


From e31e14ec356f36b131576be5bc31d8fef7e95483 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Fri, 9 Sep 2005 13:01:45 -0700
Subject: [PATCH] remove the inode_post_link and inode_post_rename LSM hooks

This patch removes the inode_post_link and inode_post_rename LSM hooks as
they are unused (and likely useless).

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c               | 10 ++--------
 include/linux/security.h | 49 ------------------------------------------------
 security/dummy.c         | 17 -----------------
 security/selinux/hooks.c | 13 -------------
 4 files changed, 2 insertions(+), 87 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 993a65a7d57..21d85f1ac83 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2012,10 +2012,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	DQUOT_INIT(dir);
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	up(&old_dentry->d_inode->i_sem);
-	if (!error) {
+	if (!error)
 		fsnotify_create(dir, new_dentry->d_name.name);
-		security_inode_post_link(old_dentry, dir, new_dentry);
-	}
 	return error;
 }
 
@@ -2134,11 +2132,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 			d_rehash(new_dentry);
 		dput(new_dentry);
 	}
-	if (!error) {
+	if (!error)
 		d_move(old_dentry,new_dentry);
-		security_inode_post_rename(old_dir, old_dentry,
-					   new_dir, new_dentry);
-	}
 	return error;
 }
 
@@ -2164,7 +2159,6 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 		/* The following d_move() should become unconditional */
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME))
 			d_move(old_dentry, new_dentry);
-		security_inode_post_rename(old_dir, old_dentry, new_dir, new_dentry);
 	}
 	if (target)
 		up(&target->i_sem);
diff --git a/include/linux/security.h b/include/linux/security.h
index 875225bf898..55b02e1c73f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -281,11 +281,6 @@ struct swap_info_struct;
  *	@dir contains the inode structure of the parent directory of the new link.
  *	@new_dentry contains the dentry structure for the new link.
  *	Return 0 if permission is granted.
- * @inode_post_link:
- *	Set security attributes for a new hard link to a file.
- *	@old_dentry contains the dentry structure for the existing link.
- *	@dir contains the inode structure of the parent directory of the new file.
- *	@new_dentry contains the dentry structure for the new file link.
  * @inode_unlink:
  *	Check the permission to remove a hard link to a file. 
  *	@dir contains the inode structure of parent directory of the file.
@@ -326,12 +321,6 @@ struct swap_info_struct;
  *	@new_dir contains the inode structure for parent of the new link.
  *	@new_dentry contains the dentry structure of the new link.
  *	Return 0 if permission is granted.
- * @inode_post_rename:
- *	Set security attributes on a renamed file or directory.
- *	@old_dir contains the inode structure for parent of the old link.
- *	@old_dentry contains the dentry structure of the old link.
- *	@new_dir contains the inode structure for parent of the new link.
- *	@new_dentry contains the dentry structure of the new link.
  * @inode_readlink:
  *	Check the permission to read the symbolic link.
  *	@dentry contains the dentry structure for the file link.
@@ -1080,8 +1069,6 @@ struct security_operations {
 	                     struct dentry *dentry, int mode);
 	int (*inode_link) (struct dentry *old_dentry,
 	                   struct inode *dir, struct dentry *new_dentry);
-	void (*inode_post_link) (struct dentry *old_dentry,
-	                         struct inode *dir, struct dentry *new_dentry);
 	int (*inode_unlink) (struct inode *dir, struct dentry *dentry);
 	int (*inode_symlink) (struct inode *dir,
 	                      struct dentry *dentry, const char *old_name);
@@ -1091,10 +1078,6 @@ struct security_operations {
 	                    int mode, dev_t dev);
 	int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry,
 	                     struct inode *new_dir, struct dentry *new_dentry);
-	void (*inode_post_rename) (struct inode *old_dir,
-	                           struct dentry *old_dentry,
-	                           struct inode *new_dir,
-	                           struct dentry *new_dentry);
 	int (*inode_readlink) (struct dentry *dentry);
 	int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd);
 	int (*inode_permission) (struct inode *inode, int mask, struct nameidata *nd);
@@ -1459,15 +1442,6 @@ static inline int security_inode_link (struct dentry *old_dentry,
 	return security_ops->inode_link (old_dentry, dir, new_dentry);
 }
 
-static inline void security_inode_post_link (struct dentry *old_dentry,
-					     struct inode *dir,
-					     struct dentry *new_dentry)
-{
-	if (new_dentry->d_inode && unlikely (IS_PRIVATE (new_dentry->d_inode)))
-		return;
-	security_ops->inode_post_link (old_dentry, dir, new_dentry);
-}
-
 static inline int security_inode_unlink (struct inode *dir,
 					 struct dentry *dentry)
 {
@@ -1523,18 +1497,6 @@ static inline int security_inode_rename (struct inode *old_dir,
 					   new_dir, new_dentry);
 }
 
-static inline void security_inode_post_rename (struct inode *old_dir,
-					       struct dentry *old_dentry,
-					       struct inode *new_dir,
-					       struct dentry *new_dentry)
-{
-	if (unlikely (IS_PRIVATE (old_dentry->d_inode) ||
-	    (new_dentry->d_inode && IS_PRIVATE (new_dentry->d_inode))))
-		return;
-	security_ops->inode_post_rename (old_dir, old_dentry,
-						new_dir, new_dentry);
-}
-
 static inline int security_inode_readlink (struct dentry *dentry)
 {
 	if (unlikely (IS_PRIVATE (dentry->d_inode)))
@@ -2157,11 +2119,6 @@ static inline int security_inode_link (struct dentry *old_dentry,
 	return 0;
 }
 
-static inline void security_inode_post_link (struct dentry *old_dentry,
-					     struct inode *dir,
-					     struct dentry *new_dentry)
-{ }
-
 static inline int security_inode_unlink (struct inode *dir,
 					 struct dentry *dentry)
 {
@@ -2203,12 +2160,6 @@ static inline int security_inode_rename (struct inode *old_dir,
 	return 0;
 }
 
-static inline void security_inode_post_rename (struct inode *old_dir,
-					       struct dentry *old_dentry,
-					       struct inode *new_dir,
-					       struct dentry *new_dentry)
-{ }
-
 static inline int security_inode_readlink (struct dentry *dentry)
 {
 	return 0;
diff --git a/security/dummy.c b/security/dummy.c
index 5083314e14b..9623a61dfc7 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -276,13 +276,6 @@ static int dummy_inode_link (struct dentry *old_dentry, struct inode *inode,
 	return 0;
 }
 
-static void dummy_inode_post_link (struct dentry *old_dentry,
-				   struct inode *inode,
-				   struct dentry *new_dentry)
-{
-	return;
-}
-
 static int dummy_inode_unlink (struct inode *inode, struct dentry *dentry)
 {
 	return 0;
@@ -319,14 +312,6 @@ static int dummy_inode_rename (struct inode *old_inode,
 	return 0;
 }
 
-static void dummy_inode_post_rename (struct inode *old_inode,
-				     struct dentry *old_dentry,
-				     struct inode *new_inode,
-				     struct dentry *new_dentry)
-{
-	return;
-}
-
 static int dummy_inode_readlink (struct dentry *dentry)
 {
 	return 0;
@@ -871,14 +856,12 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, inode_init_security);
 	set_to_dummy_if_null(ops, inode_create);
 	set_to_dummy_if_null(ops, inode_link);
-	set_to_dummy_if_null(ops, inode_post_link);
 	set_to_dummy_if_null(ops, inode_unlink);
 	set_to_dummy_if_null(ops, inode_symlink);
 	set_to_dummy_if_null(ops, inode_mkdir);
 	set_to_dummy_if_null(ops, inode_rmdir);
 	set_to_dummy_if_null(ops, inode_mknod);
 	set_to_dummy_if_null(ops, inode_rename);
-	set_to_dummy_if_null(ops, inode_post_rename);
 	set_to_dummy_if_null(ops, inode_readlink);
 	set_to_dummy_if_null(ops, inode_follow_link);
 	set_to_dummy_if_null(ops, inode_permission);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c9c20828be7..3f0b533be92 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2009,11 +2009,6 @@ static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, stru
 	return may_link(dir, old_dentry, MAY_LINK);
 }
 
-static void selinux_inode_post_link(struct dentry *old_dentry, struct inode *inode, struct dentry *new_dentry)
-{
-	return;
-}
-
 static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int rc;
@@ -2056,12 +2051,6 @@ static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dent
 	return may_rename(old_inode, old_dentry, new_inode, new_dentry);
 }
 
-static void selinux_inode_post_rename(struct inode *old_inode, struct dentry *old_dentry,
-                                      struct inode *new_inode, struct dentry *new_dentry)
-{
-	return;
-}
-
 static int selinux_inode_readlink(struct dentry *dentry)
 {
 	return dentry_has_perm(current, NULL, dentry, FILE__READ);
@@ -4257,14 +4246,12 @@ static struct security_operations selinux_ops = {
 	.inode_init_security =		selinux_inode_init_security,
 	.inode_create =			selinux_inode_create,
 	.inode_link =			selinux_inode_link,
-	.inode_post_link =		selinux_inode_post_link,
 	.inode_unlink =			selinux_inode_unlink,
 	.inode_symlink =		selinux_inode_symlink,
 	.inode_mkdir =			selinux_inode_mkdir,
 	.inode_rmdir =			selinux_inode_rmdir,
 	.inode_mknod =			selinux_inode_mknod,
 	.inode_rename =			selinux_inode_rename,
-	.inode_post_rename =		selinux_inode_post_rename,
 	.inode_readlink =		selinux_inode_readlink,
 	.inode_follow_link =		selinux_inode_follow_link,
 	.inode_permission =		selinux_inode_permission,
-- 
cgit v1.2.3-70-g09d2


From f76baf9365bd66216bf0e0ebfc083e22eda6215b Mon Sep 17 00:00:00 2001
From: Alexander Krizhanovsky <klx@yandex.ru>
Date: Fri, 9 Sep 2005 13:01:59 -0700
Subject: [PATCH] autofs: fix "busy inodes after umount..."

This patch for old autofs (version 3) cleans dentries which are not putted
after killing the automount daemon (it's analogue of recent patch for
autofs4).

Signed-off-by: Alexander Krizhanovsky <klx@yandex.ru>
Cc: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/autofs/autofs_i.h | 3 ++-
 fs/autofs/dirhash.c  | 5 +++--
 fs/autofs/inode.c    | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 6171431272d..990c28da5ae 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_sb_info {
 	struct file *pipe;
 	pid_t oz_pgrp;
 	int catatonic;
+	struct super_block *sb;
 	unsigned long exp_timeout;
 	ino_t next_dir_ino;
 	struct autofs_wait_queue *queues; /* Wait queue pointer */
@@ -134,7 +135,7 @@ void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
 void autofs_hash_delete(struct autofs_dir_ent *);
 struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
 void autofs_hash_dputall(struct autofs_dirhash *);
-void autofs_hash_nuke(struct autofs_dirhash *);
+void autofs_hash_nuke(struct autofs_sb_info *);
 
 /* Expiration-handling functions */
 
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 448143fd079..5ccfcf26310 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -232,13 +232,13 @@ void autofs_hash_dputall(struct autofs_dirhash *dh)
 
 /* Delete everything.  This is used on filesystem destruction, so we
    make no attempt to keep the pointers valid */
-void autofs_hash_nuke(struct autofs_dirhash *dh)
+void autofs_hash_nuke(struct autofs_sb_info *sbi)
 {
 	int i;
 	struct autofs_dir_ent *ent, *nent;
 
 	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-		for ( ent = dh->h[i] ; ent ; ent = nent ) {
+		for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
 			nent = ent->next;
 			if ( ent->dentry )
 				dput(ent->dentry);
@@ -246,4 +246,5 @@ void autofs_hash_nuke(struct autofs_dirhash *dh)
 			kfree(ent);
 		}
 	}
+	shrink_dcache_sb(sbi->sb);
 }
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 4888c1fabbf..65e5ed42190 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -27,7 +27,7 @@ static void autofs_put_super(struct super_block *sb)
 	if ( !sbi->catatonic )
 		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
 
-	autofs_hash_nuke(&sbi->dirhash);
+	autofs_hash_nuke(sbi);
 	for ( n = 0 ; n < AUTOFS_MAX_SYMLINKS ; n++ ) {
 		if ( test_bit(n, sbi->symlink_bitmap) )
 			kfree(sbi->symlink[n].data);
@@ -148,6 +148,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	s->s_magic = AUTOFS_SUPER_MAGIC;
 	s->s_op = &autofs_sops;
 	s->s_time_gran = 1;
+	sbi->sb = s;
 
 	root_inode = iget(s, AUTOFS_ROOT_INO);
 	root = d_alloc_root(root_inode);
-- 
cgit v1.2.3-70-g09d2


From fac92becdaecff64dd91daab0292c5131de92f0d Mon Sep 17 00:00:00 2001
From: Andrew Stribblehill <a.d.stribblehill@durham.ac.uk>
Date: Fri, 9 Sep 2005 13:02:04 -0700
Subject: [PATCH] bfs: fix endianness, signedness; add trivial bugfix

* Makes BFS code endianness-clean.

* Fixes some signedness warnings.

* Fixes a problem in fs/bfs/inode.c:164 where inodes not synced to disk
  don't get fully marked as clean.  Here's how to reproduce it:

# mount -o loop -t bfs /bfs.img /mnt
# df -i /mnt
Filesystem            Inodes   IUsed   IFree IUse% Mounted on
/bfs.img                  48       1      47    3% /mnt
# df -k /mnt
Filesystem           1K-blocks      Used Available Use% Mounted on
/bfs.img                   512         5       508   1% /mnt
# cp 60k-archive.zip /mnt/mt.zip
# df -k /mnt
Filesystem           1K-blocks      Used Available Use% Mounted on
/bfs.img                   512        65       447  13% /mnt
# df -i /mnt
Filesystem            Inodes   IUsed   IFree IUse% Mounted on
/bfs.img                  48       2      46    5% /mnt
# rm /mnt/mt.zip
# echo $?
0

 [If the unlink happens before the buffers flush, the following happens:]

# df -i /mnt
Filesystem            Inodes   IUsed   IFree IUse% Mounted on
/bfs.img                  48       2      46    5% /mnt
# df -k /mnt
Filesystem           1K-blocks      Used Available Use% Mounted on
/bfs.img                   512        65       447  13% /mnt

 fs/bfs/bfs.h           |    1

Signed-off-by: Andrew Stribblehill <ads@wompom.org>
Cc: <tigran@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/bfs/bfs.h           |   1 -
 fs/bfs/dir.c           |  25 ++++++------
 fs/bfs/file.c          |  23 ++++++-----
 fs/bfs/inode.c         | 102 ++++++++++++++++++++++++++-----------------------
 include/linux/bfs_fs.h |  23 +++++------
 5 files changed, 93 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 1020dbc88be..1fbc53f14ab 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -20,7 +20,6 @@ struct bfs_sb_info {
 	unsigned long si_lasti;
 	unsigned long * si_imap;
 	struct buffer_head * si_sbh;		/* buffer header w/superblock */
-	struct bfs_super_block * si_bfs_sb;	/* superblock in si_sbh->b_data */
 };
 
 /*
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 5a1e5ce057f..e240c335eb2 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -2,6 +2,7 @@
  *	fs/bfs/dir.c
  *	BFS directory operations.
  *	Copyright (C) 1999,2000  Tigran Aivazian <tigran@veritas.com>
+ *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
  */
 
 #include <linux/time.h>
@@ -20,9 +21,9 @@
 #define dprintf(x...)
 #endif
 
-static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int ino);
+static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino);
 static struct buffer_head * bfs_find_entry(struct inode * dir, 
-	const char * name, int namelen, struct bfs_dirent ** res_dir);
+	const unsigned char * name, int namelen, struct bfs_dirent ** res_dir);
 
 static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
 {
@@ -53,7 +54,7 @@ static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
 			de = (struct bfs_dirent *)(bh->b_data + offset);
 			if (de->ino) {
 				int size = strnlen(de->name, BFS_NAMELEN);
-				if (filldir(dirent, de->name, size, f->f_pos, de->ino, DT_UNKNOWN) < 0) {
+				if (filldir(dirent, de->name, size, f->f_pos, le16_to_cpu(de->ino), DT_UNKNOWN) < 0) {
 					brelse(bh);
 					unlock_kernel();
 					return 0;
@@ -107,7 +108,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 	inode->i_mapping->a_ops = &bfs_aops;
 	inode->i_mode = mode;
 	inode->i_ino = ino;
-	BFS_I(inode)->i_dsk_ino = ino;
+	BFS_I(inode)->i_dsk_ino = cpu_to_le16(ino);
 	BFS_I(inode)->i_sblock = 0;
 	BFS_I(inode)->i_eblock = 0;
 	insert_inode_hash(inode);
@@ -139,7 +140,7 @@ static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry, st
 	lock_kernel();
 	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
 	if (bh) {
-		unsigned long ino = le32_to_cpu(de->ino);
+		unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
 		brelse(bh);
 		inode = iget(dir->i_sb, ino);
 		if (!inode) {
@@ -183,7 +184,7 @@ static int bfs_unlink(struct inode * dir, struct dentry * dentry)
 	inode = dentry->d_inode;
 	lock_kernel();
 	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
-	if (!bh || de->ino != inode->i_ino) 
+	if (!bh || le16_to_cpu(de->ino) != inode->i_ino)
 		goto out_brelse;
 
 	if (!inode->i_nlink) {
@@ -224,7 +225,7 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
 				old_dentry->d_name.name, 
 				old_dentry->d_name.len, &old_de);
 
-	if (!old_bh || old_de->ino != old_inode->i_ino)
+	if (!old_bh || le16_to_cpu(old_de->ino) != old_inode->i_ino)
 		goto end_rename;
 
 	error = -EPERM;
@@ -270,7 +271,7 @@ struct inode_operations bfs_dir_inops = {
 	.rename			= bfs_rename,
 };
 
-static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int ino)
+static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino)
 {
 	struct buffer_head * bh;
 	struct bfs_dirent * de;
@@ -304,7 +305,7 @@ static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int
 				}
 				dir->i_mtime = CURRENT_TIME_SEC;
 				mark_inode_dirty(dir);
-				de->ino = ino;
+				de->ino = cpu_to_le16((u16)ino);
 				for (i=0; i<BFS_NAMELEN; i++)
 					de->name[i] = (i < namelen) ? name[i] : 0;
 				mark_buffer_dirty(bh);
@@ -317,7 +318,7 @@ static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int
 	return -ENOSPC;
 }
 
-static inline int bfs_namecmp(int len, const char * name, const char * buffer)
+static inline int bfs_namecmp(int len, const unsigned char * name, const char * buffer)
 {
 	if (len < BFS_NAMELEN && buffer[len])
 		return 0;
@@ -325,7 +326,7 @@ static inline int bfs_namecmp(int len, const char * name, const char * buffer)
 }
 
 static struct buffer_head * bfs_find_entry(struct inode * dir, 
-	const char * name, int namelen, struct bfs_dirent ** res_dir)
+	const unsigned char * name, int namelen, struct bfs_dirent ** res_dir)
 {
 	unsigned long block, offset;
 	struct buffer_head * bh;
@@ -346,7 +347,7 @@ static struct buffer_head * bfs_find_entry(struct inode * dir,
 		}
 		de = (struct bfs_dirent *)(bh->b_data + offset);
 		offset += BFS_DIRENT_SIZE;
-		if (de->ino && bfs_namecmp(namelen, name, de->name)) {
+		if (le16_to_cpu(de->ino) && bfs_namecmp(namelen, name, de->name)) {
 			*res_dir = de;
 			return bh;
 		}
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 747fd1ea55e..807723b65da 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -40,8 +40,8 @@ static int bfs_move_block(unsigned long from, unsigned long to, struct super_blo
 	return 0;
 }
 
-static int bfs_move_blocks(struct super_block *sb, unsigned long start, unsigned long end, 
-				unsigned long where)
+static int bfs_move_blocks(struct super_block *sb, unsigned long start,
+                           unsigned long end, unsigned long where)
 {
 	unsigned long i;
 
@@ -57,20 +57,21 @@ static int bfs_move_blocks(struct super_block *sb, unsigned long start, unsigned
 static int bfs_get_block(struct inode * inode, sector_t block, 
 	struct buffer_head * bh_result, int create)
 {
-	long phys;
+	unsigned long phys;
 	int err;
 	struct super_block *sb = inode->i_sb;
 	struct bfs_sb_info *info = BFS_SB(sb);
 	struct bfs_inode_info *bi = BFS_I(inode);
 	struct buffer_head *sbh = info->si_sbh;
 
-	if (block < 0 || block > info->si_blocks)
+	if (block > info->si_blocks)
 		return -EIO;
 
 	phys = bi->i_sblock + block;
 	if (!create) {
 		if (phys <= bi->i_eblock) {
-			dprintf("c=%d, b=%08lx, phys=%08lx (granted)\n", create, block, phys);
+			dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n",
+                                create, (unsigned long)block, phys);
 			map_bh(bh_result, sb, phys);
 		}
 		return 0;
@@ -80,7 +81,7 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 	   of blocks allocated for this file, we can grant it */
 	if (inode->i_size && phys <= bi->i_eblock) {
 		dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", 
-				create, block, phys);
+				create, (unsigned long)block, phys);
 		map_bh(bh_result, sb, phys);
 		return 0;
 	}
@@ -88,11 +89,12 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 	/* the rest has to be protected against itself */
 	lock_kernel();
 
-	/* if the last data block for this file is the last allocated block, we can
-	   extend the file trivially, without moving it anywhere */
+	/* if the last data block for this file is the last allocated
+	   block, we can extend the file trivially, without moving it
+	   anywhere */
 	if (bi->i_eblock == info->si_lf_eblk) {
 		dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", 
-				create, block, phys);
+				create, (unsigned long)block, phys);
 		map_bh(bh_result, sb, phys);
 		info->si_freeb -= phys - bi->i_eblock;
 		info->si_lf_eblk = bi->i_eblock = phys;
@@ -114,7 +116,8 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 	} else
 		err = 0;
 
-	dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n", create, block, phys);
+	dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n",
+                create, (unsigned long)block, phys);
 	bi->i_sblock = phys;
 	phys += block;
 	info->si_lf_eblk = bi->i_eblock = phys;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 628c2c1a7d7..c7b39aa279d 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -3,6 +3,8 @@
  *	BFS superblock and inode operations.
  *	Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
  *	From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
+ *
+ *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
  */
 
 #include <linux/module.h>
@@ -54,46 +56,50 @@ static void bfs_read_inode(struct inode * inode)
 	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
 	di = (struct bfs_inode *)bh->b_data + off;
 
-	inode->i_mode = 0x0000FFFF & di->i_mode;
-	if (di->i_vtype == BFS_VDIR) {
+	inode->i_mode = 0x0000FFFF &  le32_to_cpu(di->i_mode);
+	if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
 		inode->i_mode |= S_IFDIR;
 		inode->i_op = &bfs_dir_inops;
 		inode->i_fop = &bfs_dir_operations;
-	} else if (di->i_vtype == BFS_VREG) {
+	} else if (le32_to_cpu(di->i_vtype) == BFS_VREG) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &bfs_file_inops;
 		inode->i_fop = &bfs_file_operations;
 		inode->i_mapping->a_ops = &bfs_aops;
 	}
 
-	inode->i_uid = di->i_uid;
-	inode->i_gid = di->i_gid;
-	inode->i_nlink = di->i_nlink;
+	BFS_I(inode)->i_sblock =  le32_to_cpu(di->i_sblock);
+	BFS_I(inode)->i_eblock =  le32_to_cpu(di->i_eblock);
+	inode->i_uid =  le32_to_cpu(di->i_uid);
+	inode->i_gid =  le32_to_cpu(di->i_gid);
+	inode->i_nlink =  le32_to_cpu(di->i_nlink);
 	inode->i_size = BFS_FILESIZE(di);
 	inode->i_blocks = BFS_FILEBLOCKS(di);
+        if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks);
 	inode->i_blksize = PAGE_SIZE;
-	inode->i_atime.tv_sec = di->i_atime;
-	inode->i_mtime.tv_sec = di->i_mtime;
-	inode->i_ctime.tv_sec = di->i_ctime;
+	inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
+	inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
+	inode->i_ctime.tv_sec =  le32_to_cpu(di->i_ctime);
 	inode->i_atime.tv_nsec = 0;
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
-	BFS_I(inode)->i_dsk_ino = di->i_ino; /* can be 0 so we store a copy */
-	BFS_I(inode)->i_sblock = di->i_sblock;
-	BFS_I(inode)->i_eblock = di->i_eblock;
+	BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); /* can be 0 so we store a copy */
 
 	brelse(bh);
 }
 
 static int bfs_write_inode(struct inode * inode, int unused)
 {
-	unsigned long ino = inode->i_ino;
+	unsigned int ino = (u16)inode->i_ino;
+        unsigned long i_sblock;
 	struct bfs_inode * di;
 	struct buffer_head * bh;
 	int block, off;
 
+        dprintf("ino=%08x\n", ino);
+
 	if (ino < BFS_ROOT_INO || ino > BFS_SB(inode->i_sb)->si_lasti) {
-		printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino);
+		printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino);
 		return -EIO;
 	}
 
@@ -101,7 +107,7 @@ static int bfs_write_inode(struct inode * inode, int unused)
 	block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh) {
-		printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino);
+		printf("Unable to read inode %s:%08x\n", inode->i_sb->s_id, ino);
 		unlock_kernel();
 		return -EIO;
 	}
@@ -109,24 +115,26 @@ static int bfs_write_inode(struct inode * inode, int unused)
 	off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
 	di = (struct bfs_inode *)bh->b_data + off;
 
-	if (inode->i_ino == BFS_ROOT_INO)
-		di->i_vtype = BFS_VDIR;
+	if (ino == BFS_ROOT_INO)
+		di->i_vtype = cpu_to_le32(BFS_VDIR);
 	else
-		di->i_vtype = BFS_VREG;
-
-	di->i_ino = inode->i_ino;
-	di->i_mode = inode->i_mode;
-	di->i_uid = inode->i_uid;
-	di->i_gid = inode->i_gid;
-	di->i_nlink = inode->i_nlink;
-	di->i_atime = inode->i_atime.tv_sec;
-	di->i_mtime = inode->i_mtime.tv_sec;
-	di->i_ctime = inode->i_ctime.tv_sec;
-	di->i_sblock = BFS_I(inode)->i_sblock;
-	di->i_eblock = BFS_I(inode)->i_eblock;
-	di->i_eoffset = di->i_sblock * BFS_BSIZE + inode->i_size - 1;
+		di->i_vtype = cpu_to_le32(BFS_VREG);
+
+	di->i_ino = cpu_to_le16(ino);
+	di->i_mode = cpu_to_le32(inode->i_mode);
+	di->i_uid = cpu_to_le32(inode->i_uid);
+	di->i_gid = cpu_to_le32(inode->i_gid);
+	di->i_nlink = cpu_to_le32(inode->i_nlink);
+	di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+        i_sblock = BFS_I(inode)->i_sblock;
+	di->i_sblock = cpu_to_le32(i_sblock);
+	di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
+	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
+        dprintf("Written ino=%d into %d:%d\n",le16_to_cpu(di->i_ino),block,off);
 	brelse(bh);
 	unlock_kernel();
 	return 0;
@@ -140,13 +148,14 @@ static void bfs_delete_inode(struct inode * inode)
 	int block, off;
 	struct super_block * s = inode->i_sb;
 	struct bfs_sb_info * info = BFS_SB(s);
+	struct bfs_inode_info * bi = BFS_I(inode);
 
-	dprintf("ino=%08lx\n", inode->i_ino);
+	dprintf("ino=%08lx\n", ino);
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (inode->i_ino < BFS_ROOT_INO || inode->i_ino > info->si_lasti) {
-		printf("invalid ino=%08lx\n", inode->i_ino);
+	if (ino < BFS_ROOT_INO || ino > info->si_lasti) {
+		printf("invalid ino=%08lx\n", ino);
 		return;
 	}
 	
@@ -162,13 +171,13 @@ static void bfs_delete_inode(struct inode * inode)
 		return;
 	}
 	off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
-	di = (struct bfs_inode *)bh->b_data + off;
-	if (di->i_ino) {
-		info->si_freeb += BFS_FILEBLOCKS(di);
+	di = (struct bfs_inode *) bh->b_data + off;
+        if (bi->i_dsk_ino) {
+		info->si_freeb += 1 + bi->i_eblock - bi->i_sblock;
 		info->si_freei++;
-		clear_bit(di->i_ino, info->si_imap);
+		clear_bit(ino, info->si_imap);
 		dump_imap("delete_inode", s);
-	}
+        }
 	di->i_ino = 0;
 	di->i_sblock = 0;
 	mark_buffer_dirty(bh);
@@ -274,14 +283,14 @@ static struct super_operations bfs_sops = {
 
 void dump_imap(const char *prefix, struct super_block * s)
 {
-#if 0
+#ifdef DEBUG
 	int i;
 	char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL);
 
 	if (!tmpbuf)
 		return;
 	for (i=BFS_SB(s)->si_lasti; i>=0; i--) {
-		if (i>PAGE_SIZE-100) break;
+		if (i > PAGE_SIZE-100) break;
 		if (test_bit(i, BFS_SB(s)->si_imap))
 			strcat(tmpbuf, "1");
 		else
@@ -297,7 +306,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	struct buffer_head * bh;
 	struct bfs_super_block * bfs_sb;
 	struct inode * inode;
-	int i, imap_len;
+	unsigned i, imap_len;
 	struct bfs_sb_info * info;
 
 	info = kmalloc(sizeof(*info), GFP_KERNEL);
@@ -312,19 +321,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	if(!bh)
 		goto out;
 	bfs_sb = (struct bfs_super_block *)bh->b_data;
-	if (bfs_sb->s_magic != BFS_MAGIC) {
+	if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
 		if (!silent)
 			printf("No BFS filesystem on %s (magic=%08x)\n", 
-				s->s_id, bfs_sb->s_magic);
+				s->s_id,  le32_to_cpu(bfs_sb->s_magic));
 		goto out;
 	}
 	if (BFS_UNCLEAN(bfs_sb, s) && !silent)
 		printf("%s is unclean, continuing\n", s->s_id);
 
 	s->s_magic = BFS_MAGIC;
-	info->si_bfs_sb = bfs_sb;
 	info->si_sbh = bh;
-	info->si_lasti = (bfs_sb->s_start - BFS_BSIZE)/sizeof(struct bfs_inode) 
+	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE)/sizeof(struct bfs_inode)
 			+ BFS_ROOT_INO - 1;
 
 	imap_len = info->si_lasti/8 + 1;
@@ -348,8 +356,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 		goto out;
 	}
 
-	info->si_blocks = (bfs_sb->s_end + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
-	info->si_freeb = (bfs_sb->s_end + 1 - bfs_sb->s_start)>>BFS_BSIZE_BITS;
+	info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
+	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 -  cpu_to_le32(bfs_sb->s_start))>>BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
 	info->si_lf_sblk = 0;
diff --git a/include/linux/bfs_fs.h b/include/linux/bfs_fs.h
index f7f0913cd11..c1237aa92e3 100644
--- a/include/linux/bfs_fs.h
+++ b/include/linux/bfs_fs.h
@@ -14,8 +14,9 @@
 #define BFS_INODES_PER_BLOCK	8
 
 /* SVR4 vnode type values (bfs_inode->i_vtype) */
-#define BFS_VDIR		2
-#define BFS_VREG		1
+#define BFS_VDIR 2L
+#define BFS_VREG 1L
+
 
 /* BFS inode layout on disk */
 struct bfs_inode {
@@ -58,22 +59,22 @@ struct bfs_super_block {
 	__u32 s_padding[118];
 };
 
-#define BFS_NZFILESIZE(ip) \
-        (((ip)->i_eoffset + 1) - (ip)->i_sblock * BFS_BSIZE)
-
-#define BFS_FILESIZE(ip) \
-        ((ip)->i_sblock == 0 ? 0 : BFS_NZFILESIZE(ip))
-
-#define BFS_FILEBLOCKS(ip) \
-        ((ip)->i_sblock == 0 ? 0 : ((ip)->i_eblock + 1) - (ip)->i_sblock)
 
 #define BFS_OFF2INO(offset) \
         ((((offset) - BFS_BSIZE) / sizeof(struct bfs_inode)) + BFS_ROOT_INO)
 
 #define BFS_INO2OFF(ino) \
 	((__u32)(((ino) - BFS_ROOT_INO) * sizeof(struct bfs_inode)) + BFS_BSIZE)
+#define BFS_NZFILESIZE(ip) \
+        ((cpu_to_le32((ip)->i_eoffset) + 1) -  cpu_to_le32((ip)->i_sblock) * BFS_BSIZE)
+
+#define BFS_FILESIZE(ip) \
+        ((ip)->i_sblock == 0 ? 0 : BFS_NZFILESIZE(ip))
 
+#define BFS_FILEBLOCKS(ip) \
+        ((ip)->i_sblock == 0 ? 0 : (cpu_to_le32((ip)->i_eblock) + 1) -  cpu_to_le32((ip)->i_sblock))
 #define BFS_UNCLEAN(bfs_sb, sb)	\
-	((bfs_sb->s_from != -1) && (bfs_sb->s_to != -1) && !(sb->s_flags & MS_RDONLY))
+	((cpu_to_le32(bfs_sb->s_from) != -1) && (cpu_to_le32(bfs_sb->s_to) != -1) && !(sb->s_flags & MS_RDONLY))
+
 
 #endif	/* _LINUX_BFS_FS_H */
-- 
cgit v1.2.3-70-g09d2


From 8f58202bf6b915656e116ece3bc4ace14bfe533a Mon Sep 17 00:00:00 2001
From: Wendy Cheng <wcheng@redhat.com>
Date: Fri, 9 Sep 2005 13:02:08 -0700
Subject: [PATCH] change io_cancel return code for no cancel case

Note that other than few exceptions, most of the current filesystem and/or
drivers do not have aio cancel specifically defined (kiob->ki_cancel field
is mostly NULL).  However, sys_io_cancel system call universally sets
return code to -EAGAIN.  This gives applications a wrong impression that
this call is implemented but just never works.  We have customer inquires
about this issue.

Changed by Benjamin LaHaise to EINVAL instead of ENOSYS

Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 4f641abac3c..769791df36b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1673,7 +1673,7 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
 				ret = -EFAULT;
 		}
 	} else
-		printk(KERN_DEBUG "iocb has no cancel operation\n");
+		ret = -EINVAL;
 
 	put_ioctx(ctx);
 
-- 
cgit v1.2.3-70-g09d2


From ac0b1bc1edbe81c0cb36cad7e7f5b91f4d9e12ed Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@linux.intel.com>
Date: Fri, 9 Sep 2005 13:02:09 -0700
Subject: [PATCH] aio: kiocb locking to serialise retry and cancel

Implement a per-kiocb lock to serialise retry operations and cancel.  This
is done using wait_on_bit_lock() on the KIF_LOCKED bit of kiocb->ki_flags.
Also, make the cancellation path lock the kiocb and subsequently release
all references to it if the cancel was successful.  This version includes a
fix for the deadlock with __aio_run_iocbs.

Signed-off-by: Benjamin LaHaise <bcrl@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 769791df36b..201c1847fa0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -546,6 +546,24 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	return ioctx;
 }
 
+static int lock_kiocb_action(void *param)
+{
+	schedule();
+	return 0;
+}
+
+static inline void lock_kiocb(struct kiocb *iocb)
+{
+	wait_on_bit_lock(&iocb->ki_flags, KIF_LOCKED, lock_kiocb_action,
+			 TASK_UNINTERRUPTIBLE);
+}
+
+static inline void unlock_kiocb(struct kiocb *iocb)
+{
+	kiocbClearLocked(iocb);
+	wake_up_bit(&iocb->ki_flags, KIF_LOCKED);
+}
+
 /*
  * use_mm
  *	Makes the calling kernel thread take on the specified
@@ -786,7 +804,9 @@ static int __aio_run_iocbs(struct kioctx *ctx)
 		 * Hold an extra reference while retrying i/o.
 		 */
 		iocb->ki_users++;       /* grab extra reference */
+		lock_kiocb(iocb);
 		aio_run_iocb(iocb);
+		unlock_kiocb(iocb);
 		if (__aio_put_req(ctx, iocb))  /* drop extra ref */
 			put_ioctx(ctx);
  	}
@@ -1527,10 +1547,9 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		goto out_put_req;
 
 	spin_lock_irq(&ctx->ctx_lock);
-	if (likely(list_empty(&ctx->run_list))) {
-		aio_run_iocb(req);
-	} else {
-		list_add_tail(&req->ki_run_list, &ctx->run_list);
+	aio_run_iocb(req);
+	unlock_kiocb(req);
+	if (!list_empty(&ctx->run_list)) {
 		/* drain the run list */
 		while (__aio_run_iocbs(ctx))
 			;
@@ -1661,6 +1680,7 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
 	if (NULL != cancel) {
 		struct io_event tmp;
 		pr_debug("calling cancel\n");
+		lock_kiocb(kiocb);
 		memset(&tmp, 0, sizeof(tmp));
 		tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
 		tmp.data = kiocb->ki_user_data;
@@ -1672,6 +1692,7 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
 			if (copy_to_user(result, &tmp, sizeof(tmp)))
 				ret = -EFAULT;
 		}
+		unlock_kiocb(kiocb);
 	} else
 		ret = -EINVAL;
 
-- 
cgit v1.2.3-70-g09d2


From badf16621c1f9d1ac753be056fce11b43d6e0be5 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Fri, 9 Sep 2005 13:04:10 -0700
Subject: [PATCH] files: break up files struct

In order for the RCU to work, the file table array, sets and their sizes must
be updated atomically.  Instead of ensuring this through too many memory
barriers, we put the arrays and their sizes in a separate structure.  This
patch takes the first step of putting the file table elements in a separate
structure fdtable that is embedded withing files_struct.  It also changes all
the users to refer to the file table using files_fdtable() macro.  Subsequent
applciation of RCU becomes easier after this.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/osf_sys.c  |  4 ++-
 arch/ia64/kernel/perfmon.c   |  7 ++--
 arch/sparc64/solaris/ioctl.c |  8 +++--
 drivers/char/tty_io.c        |  4 ++-
 fs/exec.c                    |  8 +++--
 fs/fcntl.c                   | 47 +++++++++++++++----------
 fs/file.c                    | 42 ++++++++++++++---------
 fs/locks.c                   |  8 +++--
 fs/open.c                    | 41 +++++++++++++---------
 fs/proc/array.c              |  5 ++-
 fs/proc/base.c               |  4 ++-
 fs/select.c                  | 12 +++++--
 include/linux/file.h         | 23 ++++++++-----
 include/linux/init_task.h    | 13 ++++---
 kernel/exit.c                | 21 +++++++-----
 kernel/fork.c                | 82 +++++++++++++++++++++++++++-----------------
 security/selinux/hooks.c     |  6 ++--
 17 files changed, 211 insertions(+), 124 deletions(-)

(limited to 'fs')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 167fd89f870..2b034182a0c 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -974,6 +974,7 @@ osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp,
 	size_t size;
 	long timeout;
 	int ret = -EINVAL;
+	struct fdtable *fdt;
 
 	timeout = MAX_SCHEDULE_TIMEOUT;
 	if (tvp) {
@@ -995,7 +996,8 @@ osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp,
 		}
 	}
 
-	if (n < 0 || n > current->files->max_fdset)
+	fdt = files_fdtable(current->files);
+	if (n < 0 || n > fdt->max_fdset)
 		goto out_nofds;
 
 	/*
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index f1201ac8a11..4ad97b3b39d 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -38,6 +38,7 @@
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/bitops.h>
+#include <linux/rcupdate.h>
 
 #include <asm/errno.h>
 #include <asm/intrinsics.h>
@@ -2217,15 +2218,17 @@ static void
 pfm_free_fd(int fd, struct file *file)
 {
 	struct files_struct *files = current->files;
+	struct fdtable *fdt = files_fdtable(files);
 
 	/* 
 	 * there ie no fd_uninstall(), so we do it here
 	 */
 	spin_lock(&files->file_lock);
-        files->fd[fd] = NULL;
+	rcu_assign_pointer(fdt->fd[fd], NULL);
 	spin_unlock(&files->file_lock);
 
-	if (file) put_filp(file);
+	if (file)
+		put_filp(file);
 	put_unused_fd(fd);
 }
 
diff --git a/arch/sparc64/solaris/ioctl.c b/arch/sparc64/solaris/ioctl.c
index cac0a1cf005..374766455f5 100644
--- a/arch/sparc64/solaris/ioctl.c
+++ b/arch/sparc64/solaris/ioctl.c
@@ -293,11 +293,13 @@ static struct module_info {
 static inline int solaris_sockmod(unsigned int fd, unsigned int cmd, u32 arg)
 {
 	struct inode *ino;
+	struct fdtable *fdt;
 	/* I wonder which of these tests are superfluous... --patrik */
 	spin_lock(&current->files->file_lock);
-	if (! current->files->fd[fd] ||
-	    ! current->files->fd[fd]->f_dentry ||
-	    ! (ino = current->files->fd[fd]->f_dentry->d_inode) ||
+	fdt = files_fdtable(current->files);
+	if (! fdt->fd[fd] ||
+	    ! fdt->fd[fd]->f_dentry ||
+	    ! (ino = fdt->fd[fd]->f_dentry->d_inode) ||
 	    ! S_ISSOCK(ino->i_mode)) {
 		spin_unlock(&current->files->file_lock);
 		return TBADF;
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 6a56ae4f772..0bfc7af6891 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2454,6 +2454,7 @@ static void __do_SAK(void *arg)
 	int		i;
 	struct file	*filp;
 	struct tty_ldisc *disc;
+	struct fdtable *fdt;
 	
 	if (!tty)
 		return;
@@ -2480,7 +2481,8 @@ static void __do_SAK(void *arg)
 		task_lock(p);
 		if (p->files) {
 			spin_lock(&p->files->file_lock);
-			for (i=0; i < p->files->max_fds; i++) {
+			fdt = files_fdtable(p->files);
+			for (i=0; i < fdt->max_fds; i++) {
 				filp = fcheck_files(p->files, i);
 				if (!filp)
 					continue;
diff --git a/fs/exec.c b/fs/exec.c
index 222ab1c572d..14dd03907cc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -798,6 +798,7 @@ no_thread_group:
 static inline void flush_old_files(struct files_struct * files)
 {
 	long j = -1;
+	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
 	for (;;) {
@@ -805,12 +806,13 @@ static inline void flush_old_files(struct files_struct * files)
 
 		j++;
 		i = j * __NFDBITS;
-		if (i >= files->max_fds || i >= files->max_fdset)
+		fdt = files_fdtable(files);
+		if (i >= fdt->max_fds || i >= fdt->max_fdset)
 			break;
-		set = files->close_on_exec->fds_bits[j];
+		set = fdt->close_on_exec->fds_bits[j];
 		if (!set)
 			continue;
-		files->close_on_exec->fds_bits[j] = 0;
+		fdt->close_on_exec->fds_bits[j] = 0;
 		spin_unlock(&files->file_lock);
 		for ( ; set ; i++,set >>= 1) {
 			if (set & 1) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6fbc9d8fcc3..bfecc623808 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -24,20 +24,24 @@
 void fastcall set_close_on_exec(unsigned int fd, int flag)
 {
 	struct files_struct *files = current->files;
+	struct fdtable *fdt;
 	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
 	if (flag)
-		FD_SET(fd, files->close_on_exec);
+		FD_SET(fd, fdt->close_on_exec);
 	else
-		FD_CLR(fd, files->close_on_exec);
+		FD_CLR(fd, fdt->close_on_exec);
 	spin_unlock(&files->file_lock);
 }
 
 static inline int get_close_on_exec(unsigned int fd)
 {
 	struct files_struct *files = current->files;
+	struct fdtable *fdt;
 	int res;
 	spin_lock(&files->file_lock);
-	res = FD_ISSET(fd, files->close_on_exec);
+	fdt = files_fdtable(files);
+	res = FD_ISSET(fd, fdt->close_on_exec);
 	spin_unlock(&files->file_lock);
 	return res;
 }
@@ -54,24 +58,26 @@ static int locate_fd(struct files_struct *files,
 	unsigned int newfd;
 	unsigned int start;
 	int error;
+	struct fdtable *fdt;
 
 	error = -EINVAL;
 	if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
 		goto out;
 
+	fdt = files_fdtable(files);
 repeat:
 	/*
 	 * Someone might have closed fd's in the range
-	 * orig_start..files->next_fd
+	 * orig_start..fdt->next_fd
 	 */
 	start = orig_start;
-	if (start < files->next_fd)
-		start = files->next_fd;
+	if (start < fdt->next_fd)
+		start = fdt->next_fd;
 
 	newfd = start;
-	if (start < files->max_fdset) {
-		newfd = find_next_zero_bit(files->open_fds->fds_bits,
-			files->max_fdset, start);
+	if (start < fdt->max_fdset) {
+		newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
+			fdt->max_fdset, start);
 	}
 	
 	error = -EMFILE;
@@ -89,8 +95,8 @@ repeat:
 	if (error)
 		goto repeat;
 
-	if (start <= files->next_fd)
-		files->next_fd = newfd + 1;
+	if (start <= fdt->next_fd)
+		fdt->next_fd = newfd + 1;
 	
 	error = newfd;
 	
@@ -101,13 +107,16 @@ out:
 static int dupfd(struct file *file, unsigned int start)
 {
 	struct files_struct * files = current->files;
+	struct fdtable *fdt;
 	int fd;
 
 	spin_lock(&files->file_lock);
 	fd = locate_fd(files, file, start);
 	if (fd >= 0) {
-		FD_SET(fd, files->open_fds);
-		FD_CLR(fd, files->close_on_exec);
+		/* locate_fd() may have expanded fdtable, load the ptr */
+		fdt = files_fdtable(files);
+		FD_SET(fd, fdt->open_fds);
+		FD_CLR(fd, fdt->close_on_exec);
 		spin_unlock(&files->file_lock);
 		fd_install(fd, file);
 	} else {
@@ -123,6 +132,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
 	int err = -EBADF;
 	struct file * file, *tofree;
 	struct files_struct * files = current->files;
+	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
 	if (!(file = fcheck(oldfd)))
@@ -148,13 +158,14 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
 
 	/* Yes. It's a race. In user space. Nothing sane to do */
 	err = -EBUSY;
-	tofree = files->fd[newfd];
-	if (!tofree && FD_ISSET(newfd, files->open_fds))
+	fdt = files_fdtable(files);
+	tofree = fdt->fd[newfd];
+	if (!tofree && FD_ISSET(newfd, fdt->open_fds))
 		goto out_fput;
 
-	files->fd[newfd] = file;
-	FD_SET(newfd, files->open_fds);
-	FD_CLR(newfd, files->close_on_exec);
+	fdt->fd[newfd] = file;
+	FD_SET(newfd, fdt->open_fds);
+	FD_CLR(newfd, fdt->close_on_exec);
 	spin_unlock(&files->file_lock);
 
 	if (tofree)
diff --git a/fs/file.c b/fs/file.c
index 92b5f25985d..f5926ce73f3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -59,13 +59,15 @@ static int expand_fd_array(struct files_struct *files, int nr)
 {
 	struct file **new_fds;
 	int error, nfds;
+	struct fdtable *fdt;
 
 	
 	error = -EMFILE;
-	if (files->max_fds >= NR_OPEN || nr >= NR_OPEN)
+	fdt = files_fdtable(files);
+	if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN)
 		goto out;
 
-	nfds = files->max_fds;
+	nfds = fdt->max_fds;
 	spin_unlock(&files->file_lock);
 
 	/* 
@@ -95,13 +97,14 @@ static int expand_fd_array(struct files_struct *files, int nr)
 		goto out;
 
 	/* Copy the existing array and install the new pointer */
+	fdt = files_fdtable(files);
 
-	if (nfds > files->max_fds) {
+	if (nfds > fdt->max_fds) {
 		struct file **old_fds;
 		int i;
 		
-		old_fds = xchg(&files->fd, new_fds);
-		i = xchg(&files->max_fds, nfds);
+		old_fds = xchg(&fdt->fd, new_fds);
+		i = xchg(&fdt->max_fds, nfds);
 
 		/* Don't copy/clear the array if we are creating a new
 		   fd array for fork() */
@@ -164,12 +167,14 @@ static int expand_fdset(struct files_struct *files, int nr)
 {
 	fd_set *new_openset = NULL, *new_execset = NULL;
 	int error, nfds = 0;
+	struct fdtable *fdt;
 
 	error = -EMFILE;
-	if (files->max_fdset >= NR_OPEN || nr >= NR_OPEN)
+	fdt = files_fdtable(files);
+	if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN)
 		goto out;
 
-	nfds = files->max_fdset;
+	nfds = fdt->max_fdset;
 	spin_unlock(&files->file_lock);
 
 	/* Expand to the max in easy steps */
@@ -193,24 +198,25 @@ static int expand_fdset(struct files_struct *files, int nr)
 	error = 0;
 	
 	/* Copy the existing tables and install the new pointers */
-	if (nfds > files->max_fdset) {
-		int i = files->max_fdset / (sizeof(unsigned long) * 8);
-		int count = (nfds - files->max_fdset) / 8;
+	fdt = files_fdtable(files);
+	if (nfds > fdt->max_fdset) {
+		int i = fdt->max_fdset / (sizeof(unsigned long) * 8);
+		int count = (nfds - fdt->max_fdset) / 8;
 		
 		/* 
 		 * Don't copy the entire array if the current fdset is
 		 * not yet initialised.  
 		 */
 		if (i) {
-			memcpy (new_openset, files->open_fds, files->max_fdset/8);
-			memcpy (new_execset, files->close_on_exec, files->max_fdset/8);
+			memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8);
+			memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8);
 			memset (&new_openset->fds_bits[i], 0, count);
 			memset (&new_execset->fds_bits[i], 0, count);
 		}
 		
-		nfds = xchg(&files->max_fdset, nfds);
-		new_openset = xchg(&files->open_fds, new_openset);
-		new_execset = xchg(&files->close_on_exec, new_execset);
+		nfds = xchg(&fdt->max_fdset, nfds);
+		new_openset = xchg(&fdt->open_fds, new_openset);
+		new_execset = xchg(&fdt->close_on_exec, new_execset);
 		spin_unlock(&files->file_lock);
 		free_fdset (new_openset, nfds);
 		free_fdset (new_execset, nfds);
@@ -237,13 +243,15 @@ out:
 int expand_files(struct files_struct *files, int nr)
 {
 	int err, expand = 0;
+	struct fdtable *fdt;
 
-	if (nr >= files->max_fdset) {
+	fdt = files_fdtable(files);
+	if (nr >= fdt->max_fdset) {
 		expand = 1;
 		if ((err = expand_fdset(files, nr)))
 			goto out;
 	}
-	if (nr >= files->max_fds) {
+	if (nr >= fdt->max_fds) {
 		expand = 1;
 		if ((err = expand_fd_array(files, nr)))
 			goto out;
diff --git a/fs/locks.c b/fs/locks.c
index 11956b6179f..c2c09b4798d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2198,21 +2198,23 @@ void steal_locks(fl_owner_t from)
 {
 	struct files_struct *files = current->files;
 	int i, j;
+	struct fdtable *fdt;
 
 	if (from == files)
 		return;
 
 	lock_kernel();
 	j = 0;
+	fdt = files_fdtable(files);
 	for (;;) {
 		unsigned long set;
 		i = j * __NFDBITS;
-		if (i >= files->max_fdset || i >= files->max_fds)
+		if (i >= fdt->max_fdset || i >= fdt->max_fds)
 			break;
-		set = files->open_fds->fds_bits[j++];
+		set = fdt->open_fds->fds_bits[j++];
 		while (set) {
 			if (set & 1) {
-				struct file *file = files->fd[i];
+				struct file *file = fdt->fd[i];
 				if (file)
 					__steal_locks(file, from);
 			}
diff --git a/fs/open.c b/fs/open.c
index 4ee2dcc31c2..b6542516a0c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -842,14 +842,16 @@ int get_unused_fd(void)
 {
 	struct files_struct * files = current->files;
 	int fd, error;
+	struct fdtable *fdt;
 
   	error = -EMFILE;
 	spin_lock(&files->file_lock);
 
 repeat:
- 	fd = find_next_zero_bit(files->open_fds->fds_bits, 
-				files->max_fdset, 
-				files->next_fd);
+	fdt = files_fdtable(files);
+ 	fd = find_next_zero_bit(fdt->open_fds->fds_bits,
+				fdt->max_fdset,
+				fdt->next_fd);
 
 	/*
 	 * N.B. For clone tasks sharing a files structure, this test
@@ -872,14 +874,14 @@ repeat:
 		goto repeat;
 	}
 
-	FD_SET(fd, files->open_fds);
-	FD_CLR(fd, files->close_on_exec);
-	files->next_fd = fd + 1;
+	FD_SET(fd, fdt->open_fds);
+	FD_CLR(fd, fdt->close_on_exec);
+	fdt->next_fd = fd + 1;
 #if 1
 	/* Sanity check */
-	if (files->fd[fd] != NULL) {
+	if (fdt->fd[fd] != NULL) {
 		printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd);
-		files->fd[fd] = NULL;
+		fdt->fd[fd] = NULL;
 	}
 #endif
 	error = fd;
@@ -893,9 +895,10 @@ EXPORT_SYMBOL(get_unused_fd);
 
 static inline void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
-	__FD_CLR(fd, files->open_fds);
-	if (fd < files->next_fd)
-		files->next_fd = fd;
+	struct fdtable *fdt = files_fdtable(files);
+	__FD_CLR(fd, fdt->open_fds);
+	if (fd < fdt->next_fd)
+		fdt->next_fd = fd;
 }
 
 void fastcall put_unused_fd(unsigned int fd)
@@ -924,10 +927,12 @@ EXPORT_SYMBOL(put_unused_fd);
 void fastcall fd_install(unsigned int fd, struct file * file)
 {
 	struct files_struct *files = current->files;
+	struct fdtable *fdt;
 	spin_lock(&files->file_lock);
-	if (unlikely(files->fd[fd] != NULL))
+	fdt = files_fdtable(files);
+	if (unlikely(fdt->fd[fd] != NULL))
 		BUG();
-	files->fd[fd] = file;
+	fdt->fd[fd] = file;
 	spin_unlock(&files->file_lock);
 }
 
@@ -1010,15 +1015,17 @@ asmlinkage long sys_close(unsigned int fd)
 {
 	struct file * filp;
 	struct files_struct *files = current->files;
+	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
-	if (fd >= files->max_fds)
+	fdt = files_fdtable(files);
+	if (fd >= fdt->max_fds)
 		goto out_unlock;
-	filp = files->fd[fd];
+	filp = fdt->fd[fd];
 	if (!filp)
 		goto out_unlock;
-	files->fd[fd] = NULL;
-	FD_CLR(fd, files->close_on_exec);
+	fdt->fd[fd] = NULL;
+	FD_CLR(fd, fdt->close_on_exec);
 	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
 	return filp_close(filp, files);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 37668fe998a..d88d518d30f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -159,6 +159,7 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 {
 	struct group_info *group_info;
 	int g;
+	struct fdtable *fdt = NULL;
 
 	read_lock(&tasklist_lock);
 	buffer += sprintf(buffer,
@@ -179,10 +180,12 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 		p->gid, p->egid, p->sgid, p->fsgid);
 	read_unlock(&tasklist_lock);
 	task_lock(p);
+	if (p->files)
+		fdt = files_fdtable(p->files);
 	buffer += sprintf(buffer,
 		"FDSize:\t%d\n"
 		"Groups:\t",
-		p->files ? p->files->max_fds : 0);
+		fdt ? fdt->max_fds : 0);
 
 	group_info = p->group_info;
 	get_group_info(group_info);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 84751f3f52d..d0087a0b024 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1039,6 +1039,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 	int retval;
 	char buf[NUMBUF];
 	struct files_struct * files;
+	struct fdtable *fdt;
 
 	retval = -ENOENT;
 	if (!pid_alive(p))
@@ -1062,8 +1063,9 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 			if (!files)
 				goto out;
 			spin_lock(&files->file_lock);
+			fdt = files_fdtable(files);
 			for (fd = filp->f_pos-2;
-			     fd < files->max_fds;
+			     fd < fdt->max_fds;
 			     fd++, filp->f_pos++) {
 				unsigned int i,j;
 
diff --git a/fs/select.c b/fs/select.c
index b80e7eb0ac0..2e56325c73c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -132,11 +132,13 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
 	unsigned long *open_fds;
 	unsigned long set;
 	int max;
+	struct fdtable *fdt;
 
 	/* handle last in-complete long-word first */
 	set = ~(~0UL << (n & (__NFDBITS-1)));
 	n /= __NFDBITS;
-	open_fds = current->files->open_fds->fds_bits+n;
+	fdt = files_fdtable(current->files);
+	open_fds = fdt->open_fds->fds_bits+n;
 	max = 0;
 	if (set) {
 		set &= BITS(fds, n);
@@ -299,6 +301,7 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
 	char *bits;
 	long timeout;
 	int ret, size, max_fdset;
+	struct fdtable *fdt;
 
 	timeout = MAX_SCHEDULE_TIMEOUT;
 	if (tvp) {
@@ -326,7 +329,8 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
 		goto out_nofds;
 
 	/* max_fdset can increase, so grab it once to avoid race */
-	max_fdset = current->files->max_fdset;
+	fdt = files_fdtable(current->files);
+	max_fdset = fdt->max_fdset;
 	if (n > max_fdset)
 		n = max_fdset;
 
@@ -464,9 +468,11 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
  	unsigned int i;
 	struct poll_list *head;
  	struct poll_list *walk;
+	struct fdtable *fdt;
 
 	/* Do a sanity check on nfds ... */
-	if (nfds > current->files->max_fdset && nfds > OPEN_MAX)
+	fdt = files_fdtable(current->files);
+	if (nfds > fdt->max_fdset && nfds > OPEN_MAX)
 		return -EINVAL;
 
 	if (timeout) {
diff --git a/include/linux/file.h b/include/linux/file.h
index 5206beb9a80..db372230848 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -16,23 +16,29 @@
  */
 #define NR_OPEN_DEFAULT BITS_PER_LONG
 
+struct fdtable {
+	unsigned int max_fds;
+	int max_fdset;
+	int next_fd;
+	struct file ** fd;      /* current fd array */
+	fd_set *close_on_exec;
+	fd_set *open_fds;
+};
+
 /*
  * Open file table structure
  */
 struct files_struct {
         atomic_t count;
         spinlock_t file_lock;     /* Protects all the below members.  Nests inside tsk->alloc_lock */
-        int max_fds;
-        int max_fdset;
-        int next_fd;
-        struct file ** fd;      /* current fd array */
-        fd_set *close_on_exec;
-        fd_set *open_fds;
+	struct fdtable fdtab;
         fd_set close_on_exec_init;
         fd_set open_fds_init;
         struct file * fd_array[NR_OPEN_DEFAULT];
 };
 
+#define files_fdtable(files) (&(files)->fdtab)
+
 extern void FASTCALL(__fput(struct file *));
 extern void FASTCALL(fput(struct file *));
 
@@ -63,9 +69,10 @@ extern int expand_files(struct files_struct *, int nr);
 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
 {
 	struct file * file = NULL;
+	struct fdtable *fdt = files_fdtable(files);
 
-	if (fd < files->max_fds)
-		file = files->fd[fd];
+	if (fd < fdt->max_fds)
+		file = fdt->fd[fd];
 	return file;
 }
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index c727c195a91..94aefa54a1b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -3,16 +3,21 @@
 
 #include <linux/file.h>
 
-#define INIT_FILES \
-{ 							\
-	.count		= ATOMIC_INIT(1), 		\
-	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
+#define INIT_FDTABLE \
+{							\
 	.max_fds	= NR_OPEN_DEFAULT, 		\
 	.max_fdset	= __FD_SETSIZE, 		\
 	.next_fd	= 0, 				\
 	.fd		= &init_files.fd_array[0], 	\
 	.close_on_exec	= &init_files.close_on_exec_init, \
 	.open_fds	= &init_files.open_fds_init, 	\
+}
+
+#define INIT_FILES \
+{ 							\
+	.count		= ATOMIC_INIT(1), 		\
+	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
+	.fdtab		= INIT_FDTABLE,			\
 	.close_on_exec_init = { { 0, } }, 		\
 	.open_fds_init	= { { 0, } }, 			\
 	.fd_array	= { NULL, } 			\
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b0fb9f09f2..83beb1e93b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize);
 static inline void close_files(struct files_struct * files)
 {
 	int i, j;
+	struct fdtable *fdt;
 
 	j = 0;
+	fdt = files_fdtable(files);
 	for (;;) {
 		unsigned long set;
 		i = j * __NFDBITS;
-		if (i >= files->max_fdset || i >= files->max_fds)
+		if (i >= fdt->max_fdset || i >= fdt->max_fds)
 			break;
-		set = files->open_fds->fds_bits[j++];
+		set = fdt->open_fds->fds_bits[j++];
 		while (set) {
 			if (set & 1) {
-				struct file * file = xchg(&files->fd[i], NULL);
+				struct file * file = xchg(&fdt->fd[i], NULL);
 				if (file)
 					filp_close(file, files);
 			}
@@ -403,16 +405,19 @@ struct files_struct *get_files_struct(struct task_struct *task)
 
 void fastcall put_files_struct(struct files_struct *files)
 {
+	struct fdtable *fdt;
+
 	if (atomic_dec_and_test(&files->count)) {
 		close_files(files);
 		/*
 		 * Free the fd and fdset arrays if we expanded them.
 		 */
-		if (files->fd != &files->fd_array[0])
-			free_fd_array(files->fd, files->max_fds);
-		if (files->max_fdset > __FD_SETSIZE) {
-			free_fdset(files->open_fds, files->max_fdset);
-			free_fdset(files->close_on_exec, files->max_fdset);
+		fdt = files_fdtable(files);
+		if (fdt->fd != &files->fd_array[0])
+			free_fd_array(fdt->fd, fdt->max_fds);
+		if (fdt->max_fdset > __FD_SETSIZE) {
+			free_fdset(fdt->open_fds, fdt->max_fdset);
+			free_fdset(fdt->close_on_exec, fdt->max_fdset);
 		}
 		kmem_cache_free(files_cachep, files);
 	}
diff --git a/kernel/fork.c b/kernel/fork.c
index b2580206503..ecc694debb5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -568,21 +568,47 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 static int count_open_files(struct files_struct *files, int size)
 {
 	int i;
+	struct fdtable *fdt;
 
 	/* Find the last open fd */
+	fdt = files_fdtable(files);
 	for (i = size/(8*sizeof(long)); i > 0; ) {
-		if (files->open_fds->fds_bits[--i])
+		if (fdt->open_fds->fds_bits[--i])
 			break;
 	}
 	i = (i+1) * 8 * sizeof(long);
 	return i;
 }
 
+static struct files_struct *alloc_files(void)
+{
+	struct files_struct *newf;
+	struct fdtable *fdt;
+
+	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	if (!newf)
+		goto out;
+
+	atomic_set(&newf->count, 1);
+
+	spin_lock_init(&newf->file_lock);
+	fdt = files_fdtable(newf);
+	fdt->next_fd = 0;
+	fdt->max_fds = NR_OPEN_DEFAULT;
+	fdt->max_fdset = __FD_SETSIZE;
+	fdt->close_on_exec = &newf->close_on_exec_init;
+	fdt->open_fds = &newf->open_fds_init;
+	fdt->fd = &newf->fd_array[0];
+out:
+	return newf;
+}
+
 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct files_struct *oldf, *newf;
 	struct file **old_fds, **new_fds;
 	int open_files, size, i, error = 0, expand;
+	struct fdtable *old_fdt, *new_fdt;
 
 	/*
 	 * A background process may not have any files ...
@@ -603,35 +629,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 	 */
 	tsk->files = NULL;
 	error = -ENOMEM;
-	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
-	if (!newf) 
+	newf = alloc_files();
+	if (!newf)
 		goto out;
 
-	atomic_set(&newf->count, 1);
-
-	spin_lock_init(&newf->file_lock);
-	newf->next_fd	    = 0;
-	newf->max_fds	    = NR_OPEN_DEFAULT;
-	newf->max_fdset	    = __FD_SETSIZE;
-	newf->close_on_exec = &newf->close_on_exec_init;
-	newf->open_fds	    = &newf->open_fds_init;
-	newf->fd	    = &newf->fd_array[0];
-
 	spin_lock(&oldf->file_lock);
-
-	open_files = count_open_files(oldf, oldf->max_fdset);
+	old_fdt = files_fdtable(oldf);
+	new_fdt = files_fdtable(newf);
+	size = old_fdt->max_fdset;
+	open_files = count_open_files(oldf, old_fdt->max_fdset);
 	expand = 0;
 
 	/*
 	 * Check whether we need to allocate a larger fd array or fd set.
 	 * Note: we're not a clone task, so the open count won't  change.
 	 */
-	if (open_files > newf->max_fdset) {
-		newf->max_fdset = 0;
+	if (open_files > new_fdt->max_fdset) {
+		new_fdt->max_fdset = 0;
 		expand = 1;
 	}
-	if (open_files > newf->max_fds) {
-		newf->max_fds = 0;
+	if (open_files > new_fdt->max_fds) {
+		new_fdt->max_fds = 0;
 		expand = 1;
 	}
 
@@ -646,11 +664,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 		spin_lock(&oldf->file_lock);
 	}
 
-	old_fds = oldf->fd;
-	new_fds = newf->fd;
+	old_fds = old_fdt->fd;
+	new_fds = new_fdt->fd;
 
-	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
-	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
+	memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
+	memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
 
 	for (i = open_files; i != 0; i--) {
 		struct file *f = *old_fds++;
@@ -663,24 +681,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 			 * is partway through open().  So make sure that this
 			 * fd is available to the new process.
 			 */
-			FD_CLR(open_files - i, newf->open_fds);
+			FD_CLR(open_files - i, new_fdt->open_fds);
 		}
 		*new_fds++ = f;
 	}
 	spin_unlock(&oldf->file_lock);
 
 	/* compute the remainder to be cleared */
-	size = (newf->max_fds - open_files) * sizeof(struct file *);
+	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
 
 	/* This is long word aligned thus could use a optimized version */ 
 	memset(new_fds, 0, size); 
 
-	if (newf->max_fdset > open_files) {
-		int left = (newf->max_fdset-open_files)/8;
+	if (new_fdt->max_fdset > open_files) {
+		int left = (new_fdt->max_fdset-open_files)/8;
 		int start = open_files / (8 * sizeof(unsigned long));
 
-		memset(&newf->open_fds->fds_bits[start], 0, left);
-		memset(&newf->close_on_exec->fds_bits[start], 0, left);
+		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
+		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 	}
 
 	tsk->files = newf;
@@ -689,9 +707,9 @@ out:
 	return error;
 
 out_release:
-	free_fdset (newf->close_on_exec, newf->max_fdset);
-	free_fdset (newf->open_fds, newf->max_fdset);
-	free_fd_array(newf->fd, newf->max_fds);
+	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
+	free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
+	free_fd_array(new_fdt->fd, new_fdt->max_fds);
 	kmem_cache_free(files_cachep, newf);
 	goto out;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 3f0b533be92..acb5a495a90 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1594,6 +1594,7 @@ static inline void flush_unauthorized_files(struct files_struct * files)
 	struct avc_audit_data ad;
 	struct file *file, *devnull = NULL;
 	struct tty_struct *tty = current->signal->tty;
+	struct fdtable *fdt;
 	long j = -1;
 
 	if (tty) {
@@ -1627,9 +1628,10 @@ static inline void flush_unauthorized_files(struct files_struct * files)
 
 		j++;
 		i = j * __NFDBITS;
-		if (i >= files->max_fds || i >= files->max_fdset)
+		fdt = files_fdtable(files);
+		if (i >= fdt->max_fds || i >= fdt->max_fdset)
 			break;
-		set = files->open_fds->fds_bits[j];
+		set = fdt->open_fds->fds_bits[j];
 		if (!set)
 			continue;
 		spin_unlock(&files->file_lock);
-- 
cgit v1.2.3-70-g09d2


From ab2af1f5005069321c5d130f09cce577b03f43ef Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Fri, 9 Sep 2005 13:04:13 -0700
Subject: [PATCH] files: files struct with RCU

Patch to eliminate struct files_struct.file_lock spinlock on the reader side
and use rcu refcounting rcuref_xxx api for the f_count refcounter.  The
updates to the fdtable are done by allocating a new fdtable structure and
setting files->fdt to point to the new structure.  The fdtable structure is
protected by RCU thereby allowing lock-free lookup.  For fd arrays/sets that
are vmalloced, we use keventd to free them since RCU callbacks can't sleep.  A
global list of fdtable to be freed is not scalable, so we use a per-cpu list.
If keventd is already handling the current cpu's work, we use a timer to defer
queueing of that work.

Since the last publication, this patch has been re-written to avoid using
explicit memory barriers and use rcu_assign_pointer(), rcu_dereference()
premitives instead.  This required that the fd information is kept in a
separate structure (fdtable) and updated atomically.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c                  |   3 +-
 fs/fcntl.c                |  13 +-
 fs/file.c                 | 389 ++++++++++++++++++++++++++++++----------------
 fs/file_table.c           |  40 +++--
 fs/open.c                 |   8 +-
 include/linux/file.h      |  11 +-
 include/linux/fs.h        |   4 +-
 include/linux/init_task.h |   5 +
 kernel/exit.c             |  15 +-
 kernel/fork.c             |  23 ++-
 10 files changed, 345 insertions(+), 166 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 201c1847fa0..38f62680fd6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,6 +29,7 @@
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
+#include <linux/rcuref.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -499,7 +500,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	/* Must be done under the lock to serialise against cancellation.
 	 * Call this aio_fput as it duplicates fput via the fput_work.
 	 */
-	if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfecc623808..d2f3ed8acd9 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
+#include <linux/rcupdate.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -64,8 +65,8 @@ static int locate_fd(struct files_struct *files,
 	if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
 		goto out;
 
-	fdt = files_fdtable(files);
 repeat:
+	fdt = files_fdtable(files);
 	/*
 	 * Someone might have closed fd's in the range
 	 * orig_start..fdt->next_fd
@@ -95,9 +96,15 @@ repeat:
 	if (error)
 		goto repeat;
 
+	/*
+	 * We reacquired files_lock, so we are safe as long as
+	 * we reacquire the fdtable pointer and use it while holding
+	 * the lock, no one can free it during that time.
+	 */
+	fdt = files_fdtable(files);
 	if (start <= fdt->next_fd)
 		fdt->next_fd = newfd + 1;
-	
+
 	error = newfd;
 	
 out:
@@ -163,7 +170,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
 	if (!tofree && FD_ISSET(newfd, fdt->open_fds))
 		goto out_fput;
 
-	fdt->fd[newfd] = file;
+	rcu_assign_pointer(fdt->fd[newfd], file);
 	FD_SET(newfd, fdt->open_fds);
 	FD_CLR(newfd, fdt->close_on_exec);
 	spin_unlock(&files->file_lock);
diff --git a/fs/file.c b/fs/file.c
index f5926ce73f3..2127a7b9dc3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -13,6 +13,25 @@
 #include <linux/vmalloc.h>
 #include <linux/file.h>
 #include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+
+struct fdtable_defer {
+	spinlock_t lock;
+	struct work_struct wq;
+	struct timer_list timer;
+	struct fdtable *next;
+};
+
+/*
+ * We use this list to defer free fdtables that have vmalloced
+ * sets/arrays. By keeping a per-cpu list, we avoid having to embed
+ * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
+ * this per-task structure.
+ */
+static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
 
 
 /*
@@ -48,85 +67,143 @@ void free_fd_array(struct file **array, int num)
 		vfree(array);
 }
 
-/*
- * Expand the fd array in the files_struct.  Called with the files
- * spinlock held for write.
- */
-
-static int expand_fd_array(struct files_struct *files, int nr)
-	__releases(files->file_lock)
-	__acquires(files->file_lock)
+static void __free_fdtable(struct fdtable *fdt)
 {
-	struct file **new_fds;
-	int error, nfds;
-	struct fdtable *fdt;
+	int fdset_size, fdarray_size;
 
-	
-	error = -EMFILE;
-	fdt = files_fdtable(files);
-	if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN)
-		goto out;
+	fdset_size = fdt->max_fdset / 8;
+	fdarray_size = fdt->max_fds * sizeof(struct file *);
+	free_fdset(fdt->open_fds, fdset_size);
+	free_fdset(fdt->close_on_exec, fdset_size);
+	free_fd_array(fdt->fd, fdarray_size);
+	kfree(fdt);
+}
 
-	nfds = fdt->max_fds;
-	spin_unlock(&files->file_lock);
+static void fdtable_timer(unsigned long data)
+{
+	struct fdtable_defer *fddef = (struct fdtable_defer *)data;
 
-	/* 
-	 * Expand to the max in easy steps, and keep expanding it until
-	 * we have enough for the requested fd array size. 
+	spin_lock(&fddef->lock);
+	/*
+	 * If someone already emptied the queue return.
 	 */
+	if (!fddef->next)
+		goto out;
+	if (!schedule_work(&fddef->wq))
+		mod_timer(&fddef->timer, 5);
+out:
+	spin_unlock(&fddef->lock);
+}
 
-	do {
-#if NR_OPEN_DEFAULT < 256
-		if (nfds < 256)
-			nfds = 256;
-		else 
-#endif
-		if (nfds < (PAGE_SIZE / sizeof(struct file *)))
-			nfds = PAGE_SIZE / sizeof(struct file *);
-		else {
-			nfds = nfds * 2;
-			if (nfds > NR_OPEN)
-				nfds = NR_OPEN;
-		}
-	} while (nfds <= nr);
+static void free_fdtable_work(struct fdtable_defer *f)
+{
+	struct fdtable *fdt;
 
-	error = -ENOMEM;
-	new_fds = alloc_fd_array(nfds);
-	spin_lock(&files->file_lock);
-	if (!new_fds)
-		goto out;
+	spin_lock_bh(&f->lock);
+	fdt = f->next;
+	f->next = NULL;
+	spin_unlock_bh(&f->lock);
+	while(fdt) {
+		struct fdtable *next = fdt->next;
+		__free_fdtable(fdt);
+		fdt = next;
+	}
+}
 
-	/* Copy the existing array and install the new pointer */
-	fdt = files_fdtable(files);
+static void free_fdtable_rcu(struct rcu_head *rcu)
+{
+	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
+	int fdset_size, fdarray_size;
+	struct fdtable_defer *fddef;
 
-	if (nfds > fdt->max_fds) {
-		struct file **old_fds;
-		int i;
-		
-		old_fds = xchg(&fdt->fd, new_fds);
-		i = xchg(&fdt->max_fds, nfds);
-
-		/* Don't copy/clear the array if we are creating a new
-		   fd array for fork() */
-		if (i) {
-			memcpy(new_fds, old_fds, i * sizeof(struct file *));
-			/* clear the remainder of the array */
-			memset(&new_fds[i], 0,
-			       (nfds-i) * sizeof(struct file *)); 
-
-			spin_unlock(&files->file_lock);
-			free_fd_array(old_fds, i);
-			spin_lock(&files->file_lock);
-		}
+	BUG_ON(!fdt);
+	fdset_size = fdt->max_fdset / 8;
+	fdarray_size = fdt->max_fds * sizeof(struct file *);
+
+	if (fdt->free_files) {
+		/*
+		 * The this fdtable was embedded in the files structure
+		 * and the files structure itself was getting destroyed.
+		 * It is now safe to free the files structure.
+		 */
+		kmem_cache_free(files_cachep, fdt->free_files);
+		return;
+	}
+	if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
+		/*
+		 * The fdtable was embedded
+		 */
+		return;
+	}
+	if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
+		kfree(fdt->open_fds);
+		kfree(fdt->close_on_exec);
+		kfree(fdt->fd);
+		kfree(fdt);
 	} else {
-		/* Somebody expanded the array while we slept ... */
-		spin_unlock(&files->file_lock);
-		free_fd_array(new_fds, nfds);
-		spin_lock(&files->file_lock);
+		fddef = &get_cpu_var(fdtable_defer_list);
+		spin_lock(&fddef->lock);
+		fdt->next = fddef->next;
+		fddef->next = fdt;
+		/*
+		 * vmallocs are handled from the workqueue context.
+		 * If the per-cpu workqueue is running, then we
+		 * defer work scheduling through a timer.
+		 */
+		if (!schedule_work(&fddef->wq))
+			mod_timer(&fddef->timer, 5);
+		spin_unlock(&fddef->lock);
+		put_cpu_var(fdtable_defer_list);
 	}
-	error = 0;
-out:
-	return error;
+}
+
+void free_fdtable(struct fdtable *fdt)
+{
+	if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
+					fdt->max_fds > NR_OPEN_DEFAULT)
+		call_rcu(&fdt->rcu, free_fdtable_rcu);
+}
+
+/*
+ * Expand the fdset in the files_struct.  Called with the files spinlock
+ * held for write.
+ */
+static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
+{
+	int i;
+	int count;
+
+	BUG_ON(nfdt->max_fdset < fdt->max_fdset);
+	BUG_ON(nfdt->max_fds < fdt->max_fds);
+	/* Copy the existing tables and install the new pointers */
+
+	i = fdt->max_fdset / (sizeof(unsigned long) * 8);
+	count = (nfdt->max_fdset - fdt->max_fdset) / 8;
+
+	/*
+	 * Don't copy the entire array if the current fdset is
+	 * not yet initialised.
+	 */
+	if (i) {
+		memcpy (nfdt->open_fds, fdt->open_fds,
+						fdt->max_fdset/8);
+		memcpy (nfdt->close_on_exec, fdt->close_on_exec,
+						fdt->max_fdset/8);
+		memset (&nfdt->open_fds->fds_bits[i], 0, count);
+		memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
+	}
+
+	/* Don't copy/clear the array if we are creating a new
+	   fd array for fork() */
+	if (fdt->max_fds) {
+		memcpy(nfdt->fd, fdt->fd,
+			fdt->max_fds * sizeof(struct file *));
+		/* clear the remainder of the array */
+		memset(&nfdt->fd[fdt->max_fds], 0,
+		       (nfdt->max_fds - fdt->max_fds) *
+					sizeof(struct file *));
+	}
+	nfdt->next_fd = fdt->next_fd;
 }
 
 /*
@@ -157,28 +234,21 @@ void free_fdset(fd_set *array, int num)
 		vfree(array);
 }
 
-/*
- * Expand the fdset in the files_struct.  Called with the files spinlock
- * held for write.
- */
-static int expand_fdset(struct files_struct *files, int nr)
-	__releases(file->file_lock)
-	__acquires(file->file_lock)
+static struct fdtable *alloc_fdtable(int nr)
 {
-	fd_set *new_openset = NULL, *new_execset = NULL;
-	int error, nfds = 0;
-	struct fdtable *fdt;
-
-	error = -EMFILE;
-	fdt = files_fdtable(files);
-	if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN)
-		goto out;
+	struct fdtable *fdt = NULL;
+	int nfds = 0;
+  	fd_set *new_openset = NULL, *new_execset = NULL;
+	struct file **new_fds;
 
-	nfds = fdt->max_fdset;
-	spin_unlock(&files->file_lock);
+	fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
+	if (!fdt)
+  		goto out;
+	memset(fdt, 0, sizeof(*fdt));
 
-	/* Expand to the max in easy steps */
-	do {
+	nfds = __FD_SETSIZE;
+  	/* Expand to the max in easy steps */
+  	do {
 		if (nfds < (PAGE_SIZE * 8))
 			nfds = PAGE_SIZE * 8;
 		else {
@@ -188,50 +258,88 @@ static int expand_fdset(struct files_struct *files, int nr)
 		}
 	} while (nfds <= nr);
 
-	error = -ENOMEM;
-	new_openset = alloc_fdset(nfds);
-	new_execset = alloc_fdset(nfds);
-	spin_lock(&files->file_lock);
-	if (!new_openset || !new_execset)
+  	new_openset = alloc_fdset(nfds);
+  	new_execset = alloc_fdset(nfds);
+  	if (!new_openset || !new_execset)
+  		goto out;
+	fdt->open_fds = new_openset;
+	fdt->close_on_exec = new_execset;
+	fdt->max_fdset = nfds;
+
+	nfds = NR_OPEN_DEFAULT;
+	/*
+	 * Expand to the max in easy steps, and keep expanding it until
+	 * we have enough for the requested fd array size.
+	 */
+	do {
+#if NR_OPEN_DEFAULT < 256
+		if (nfds < 256)
+			nfds = 256;
+		else
+#endif
+		if (nfds < (PAGE_SIZE / sizeof(struct file *)))
+			nfds = PAGE_SIZE / sizeof(struct file *);
+		else {
+			nfds = nfds * 2;
+			if (nfds > NR_OPEN)
+				nfds = NR_OPEN;
+  		}
+	} while (nfds <= nr);
+	new_fds = alloc_fd_array(nfds);
+	if (!new_fds)
 		goto out;
+	fdt->fd = new_fds;
+	fdt->max_fds = nfds;
+	fdt->free_files = NULL;
+	return fdt;
+out:
+  	if (new_openset)
+  		free_fdset(new_openset, nfds);
+  	if (new_execset)
+  		free_fdset(new_execset, nfds);
+	kfree(fdt);
+	return NULL;
+}
 
-	error = 0;
-	
-	/* Copy the existing tables and install the new pointers */
+/*
+ * Expands the file descriptor table - it will allocate a new fdtable and
+ * both fd array and fdset. It is expected to be called with the
+ * files_lock held.
+ */
+static int expand_fdtable(struct files_struct *files, int nr)
+	__releases(files->file_lock)
+	__acquires(files->file_lock)
+{
+	int error = 0;
+	struct fdtable *fdt;
+	struct fdtable *nfdt = NULL;
+
+	spin_unlock(&files->file_lock);
+	nfdt = alloc_fdtable(nr);
+	if (!nfdt) {
+		error = -ENOMEM;
+		spin_lock(&files->file_lock);
+		goto out;
+	}
+
+	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
-	if (nfds > fdt->max_fdset) {
-		int i = fdt->max_fdset / (sizeof(unsigned long) * 8);
-		int count = (nfds - fdt->max_fdset) / 8;
-		
-		/* 
-		 * Don't copy the entire array if the current fdset is
-		 * not yet initialised.  
-		 */
-		if (i) {
-			memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8);
-			memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8);
-			memset (&new_openset->fds_bits[i], 0, count);
-			memset (&new_execset->fds_bits[i], 0, count);
-		}
-		
-		nfds = xchg(&fdt->max_fdset, nfds);
-		new_openset = xchg(&fdt->open_fds, new_openset);
-		new_execset = xchg(&fdt->close_on_exec, new_execset);
+	/*
+	 * Check again since another task may have expanded the
+	 * fd table while we dropped the lock
+	 */
+	if (nr >= fdt->max_fds || nr >= fdt->max_fdset) {
+		copy_fdtable(nfdt, fdt);
+	} else {
+		/* Somebody expanded while we dropped file_lock */
 		spin_unlock(&files->file_lock);
-		free_fdset (new_openset, nfds);
-		free_fdset (new_execset, nfds);
+		__free_fdtable(nfdt);
 		spin_lock(&files->file_lock);
-		return 0;
-	} 
-	/* Somebody expanded the array while we slept ... */
-
+		goto out;
+	}
+	rcu_assign_pointer(files->fdt, nfdt);
+	free_fdtable(fdt);
 out:
-	spin_unlock(&files->file_lock);
-	if (new_openset)
-		free_fdset(new_openset, nfds);
-	if (new_execset)
-		free_fdset(new_execset, nfds);
-	spin_lock(&files->file_lock);
 	return error;
 }
 
@@ -246,17 +354,36 @@ int expand_files(struct files_struct *files, int nr)
 	struct fdtable *fdt;
 
 	fdt = files_fdtable(files);
-	if (nr >= fdt->max_fdset) {
-		expand = 1;
-		if ((err = expand_fdset(files, nr)))
+	if (nr >= fdt->max_fdset || nr >= fdt->max_fds) {
+		if (fdt->max_fdset >= NR_OPEN ||
+			fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) {
+			err = -EMFILE;
 			goto out;
-	}
-	if (nr >= fdt->max_fds) {
+		}
 		expand = 1;
-		if ((err = expand_fd_array(files, nr)))
+		if ((err = expand_fdtable(files, nr)))
 			goto out;
 	}
 	err = expand;
 out:
 	return err;
 }
+
+static void __devinit fdtable_defer_list_init(int cpu)
+{
+	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
+	spin_lock_init(&fddef->lock);
+	INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
+	init_timer(&fddef->timer);
+	fddef->timer.data = (unsigned long)fddef;
+	fddef->timer.function = fdtable_timer;
+	fddef->next = NULL;
+}
+
+void __init files_defer_init(void)
+{
+	int i;
+	/* Really early - can't use for_each_cpu */
+	for (i = 0; i < NR_CPUS; i++)
+		fdtable_defer_list_init(i);
+}
diff --git a/fs/file_table.c b/fs/file_table.c
index 43e9e1737de..86ec8ae985b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/security.h>
 #include <linux/eventpoll.h>
+#include <linux/rcupdate.h>
 #include <linux/mount.h>
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
@@ -53,11 +54,17 @@ void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
 	spin_unlock_irqrestore(&filp_count_lock, flags);
 }
 
-static inline void file_free(struct file *f)
+static inline void file_free_rcu(struct rcu_head *head)
 {
+	struct file *f =  container_of(head, struct file, f_rcuhead);
 	kmem_cache_free(filp_cachep, f);
 }
 
+static inline void file_free(struct file *f)
+{
+	call_rcu(&f->f_rcuhead, file_free_rcu);
+}
+
 /* Find an unused file structure and return a pointer to it.
  * Returns NULL, if there are no more free file structures or
  * we run out of memory.
@@ -110,7 +117,7 @@ EXPORT_SYMBOL(get_empty_filp);
 
 void fastcall fput(struct file *file)
 {
-	if (atomic_dec_and_test(&file->f_count))
+	if (rcuref_dec_and_test(&file->f_count))
 		__fput(file);
 }
 
@@ -156,11 +163,17 @@ struct file fastcall *fget(unsigned int fd)
 	struct file *file;
 	struct files_struct *files = current->files;
 
-	spin_lock(&files->file_lock);
+	rcu_read_lock();
 	file = fcheck_files(files, fd);
-	if (file)
-		get_file(file);
-	spin_unlock(&files->file_lock);
+	if (file) {
+		if (!rcuref_inc_lf(&file->f_count)) {
+			/* File object ref couldn't be taken */
+			rcu_read_unlock();
+			return NULL;
+		}
+	}
+	rcu_read_unlock();
+
 	return file;
 }
 
@@ -182,21 +195,25 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
 	if (likely((atomic_read(&files->count) == 1))) {
 		file = fcheck_files(files, fd);
 	} else {
-		spin_lock(&files->file_lock);
+		rcu_read_lock();
 		file = fcheck_files(files, fd);
 		if (file) {
-			get_file(file);
-			*fput_needed = 1;
+			if (rcuref_inc_lf(&file->f_count))
+				*fput_needed = 1;
+			else
+				/* Didn't get the reference, someone's freed */
+				file = NULL;
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 	}
+
 	return file;
 }
 
 
 void put_filp(struct file *file)
 {
-	if (atomic_dec_and_test(&file->f_count)) {
+	if (rcuref_dec_and_test(&file->f_count)) {
 		security_file_free(file);
 		file_kill(file);
 		file_free(file);
@@ -257,4 +274,5 @@ void __init files_init(unsigned long mempages)
 	files_stat.max_files = n; 
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
+	files_defer_init();
 } 
diff --git a/fs/open.c b/fs/open.c
index b6542516a0c..2fac58c5191 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -24,6 +24,7 @@
 #include <linux/personality.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
+#include <linux/rcupdate.h>
 
 #include <asm/unistd.h>
 
@@ -930,9 +931,8 @@ void fastcall fd_install(unsigned int fd, struct file * file)
 	struct fdtable *fdt;
 	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
-	if (unlikely(fdt->fd[fd] != NULL))
-		BUG();
-	fdt->fd[fd] = file;
+	BUG_ON(fdt->fd[fd] != NULL);
+	rcu_assign_pointer(fdt->fd[fd], file);
 	spin_unlock(&files->file_lock);
 }
 
@@ -1024,7 +1024,7 @@ asmlinkage long sys_close(unsigned int fd)
 	filp = fdt->fd[fd];
 	if (!filp)
 		goto out_unlock;
-	fdt->fd[fd] = NULL;
+	rcu_assign_pointer(fdt->fd[fd], NULL);
 	FD_CLR(fd, fdt->close_on_exec);
 	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
diff --git a/include/linux/file.h b/include/linux/file.h
index db372230848..f5bbd4c508b 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include <linux/posix_types.h>
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 
 /*
  * The default fd array needs to be at least BITS_PER_LONG,
@@ -23,6 +24,9 @@ struct fdtable {
 	struct file ** fd;      /* current fd array */
 	fd_set *close_on_exec;
 	fd_set *open_fds;
+	struct rcu_head rcu;
+	struct files_struct *free_files;
+	struct fdtable *next;
 };
 
 /*
@@ -31,13 +35,14 @@ struct fdtable {
 struct files_struct {
         atomic_t count;
         spinlock_t file_lock;     /* Protects all the below members.  Nests inside tsk->alloc_lock */
+	struct fdtable *fdt;
 	struct fdtable fdtab;
         fd_set close_on_exec_init;
         fd_set open_fds_init;
         struct file * fd_array[NR_OPEN_DEFAULT];
 };
 
-#define files_fdtable(files) (&(files)->fdtab)
+#define files_fdtable(files) (rcu_dereference((files)->fdt))
 
 extern void FASTCALL(__fput(struct file *));
 extern void FASTCALL(fput(struct file *));
@@ -65,6 +70,8 @@ extern fd_set *alloc_fdset(int);
 extern void free_fdset(fd_set *, int);
 
 extern int expand_files(struct files_struct *, int nr);
+extern void free_fdtable(struct fdtable *fdt);
+extern void __init files_defer_init(void);
 
 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
 {
@@ -72,7 +79,7 @@ static inline struct file * fcheck_files(struct files_struct *files, unsigned in
 	struct fdtable *fdt = files_fdtable(files);
 
 	if (fd < fdt->max_fds)
-		file = fdt->fd[fd];
+		file = rcu_dereference(fdt->fd[fd]);
 	return file;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd93ab7da90..7f61227827d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -9,6 +9,7 @@
 #include <linux/config.h>
 #include <linux/limits.h>
 #include <linux/ioctl.h>
+#include <linux/rcuref.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -597,12 +598,13 @@ struct file {
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
 	struct address_space	*f_mapping;
+	struct rcu_head 	f_rcuhead;
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
 #define file_list_unlock() spin_unlock(&files_lock);
 
-#define get_file(x)	atomic_inc(&(x)->f_count)
+#define get_file(x)	rcuref_inc(&(x)->f_count)
 #define file_count(x)	atomic_read(&(x)->f_count)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 94aefa54a1b..68ab5f2ab9c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -2,6 +2,7 @@
 #define _LINUX__INIT_TASK_H
 
 #include <linux/file.h>
+#include <linux/rcupdate.h>
 
 #define INIT_FDTABLE \
 {							\
@@ -11,12 +12,16 @@
 	.fd		= &init_files.fd_array[0], 	\
 	.close_on_exec	= &init_files.close_on_exec_init, \
 	.open_fds	= &init_files.open_fds_init, 	\
+	.rcu		= RCU_HEAD_INIT, 		\
+	.free_files	= NULL,		 		\
+	.next		= NULL,		 		\
 }
 
 #define INIT_FILES \
 { 							\
 	.count		= ATOMIC_INIT(1), 		\
 	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
+	.fdt		= &init_files.fdtab, 		\
 	.fdtab		= INIT_FDTABLE,			\
 	.close_on_exec_init = { { 0, } }, 		\
 	.open_fds_init	= { { 0, } }, 			\
diff --git a/kernel/exit.c b/kernel/exit.c
index 83beb1e93b1..6d2089a1bce 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -411,15 +411,16 @@ void fastcall put_files_struct(struct files_struct *files)
 		close_files(files);
 		/*
 		 * Free the fd and fdset arrays if we expanded them.
+		 * If the fdtable was embedded, pass files for freeing
+		 * at the end of the RCU grace period. Otherwise,
+		 * you can free files immediately.
 		 */
 		fdt = files_fdtable(files);
-		if (fdt->fd != &files->fd_array[0])
-			free_fd_array(fdt->fd, fdt->max_fds);
-		if (fdt->max_fdset > __FD_SETSIZE) {
-			free_fdset(fdt->open_fds, fdt->max_fdset);
-			free_fdset(fdt->close_on_exec, fdt->max_fdset);
-		}
-		kmem_cache_free(files_cachep, files);
+		if (fdt == &files->fdtab)
+			fdt->free_files = files;
+		else
+			kmem_cache_free(files_cachep, files);
+		free_fdtable(fdt);
 	}
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index ecc694debb5..8149f360288 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/futex.h>
+#include <linux/rcupdate.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
@@ -565,13 +566,12 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 	return 0;
 }
 
-static int count_open_files(struct files_struct *files, int size)
+static int count_open_files(struct fdtable *fdt)
 {
+	int size = fdt->max_fdset;
 	int i;
-	struct fdtable *fdt;
 
 	/* Find the last open fd */
-	fdt = files_fdtable(files);
 	for (i = size/(8*sizeof(long)); i > 0; ) {
 		if (fdt->open_fds->fds_bits[--i])
 			break;
@@ -592,13 +592,17 @@ static struct files_struct *alloc_files(void)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
-	fdt = files_fdtable(newf);
+	fdt = &newf->fdtab;
 	fdt->next_fd = 0;
 	fdt->max_fds = NR_OPEN_DEFAULT;
 	fdt->max_fdset = __FD_SETSIZE;
 	fdt->close_on_exec = &newf->close_on_exec_init;
 	fdt->open_fds = &newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
+	INIT_RCU_HEAD(&fdt->rcu);
+	fdt->free_files = NULL;
+	fdt->next = NULL;
+	rcu_assign_pointer(newf->fdt, fdt);
 out:
 	return newf;
 }
@@ -637,7 +641,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 	old_fdt = files_fdtable(oldf);
 	new_fdt = files_fdtable(newf);
 	size = old_fdt->max_fdset;
-	open_files = count_open_files(oldf, old_fdt->max_fdset);
+	open_files = count_open_files(old_fdt);
 	expand = 0;
 
 	/*
@@ -661,7 +665,14 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 		spin_unlock(&newf->file_lock);
 		if (error < 0)
 			goto out_release;
+		new_fdt = files_fdtable(newf);
+		/*
+		 * Reacquire the oldf lock and a pointer to its fd table
+		 * who knows it may have a new bigger fd table. We need
+		 * the latest pointer.
+		 */
 		spin_lock(&oldf->file_lock);
+		old_fdt = files_fdtable(oldf);
 	}
 
 	old_fds = old_fdt->fd;
@@ -683,7 +694,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 			 */
 			FD_CLR(open_files - i, new_fdt->open_fds);
 		}
-		*new_fds++ = f;
+		rcu_assign_pointer(*new_fds++, f);
 	}
 	spin_unlock(&oldf->file_lock);
 
-- 
cgit v1.2.3-70-g09d2


From b835996f628eadb55c5fb222ba46fe9395bf73c7 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Fri, 9 Sep 2005 13:04:14 -0700
Subject: [PATCH] files: lock-free fd look-up

With the use of RCU in files structure, the look-up of files using fds can now
be lock-free.  The lookup is protected by rcu_read_lock()/rcu_read_unlock().
This patch changes the readers to use lock-free lookup.

Signed-off-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Ravikiran Thirumalai <kiran_th@gmail.com>
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/kernel/irixioctl.c    |  5 +++--
 arch/sparc64/solaris/ioctl.c    |  7 ++++---
 drivers/char/tty_io.c           |  4 ++--
 fs/fcntl.c                      |  4 ++--
 fs/proc/base.c                  | 29 +++++++++++++++--------------
 fs/select.c                     | 13 ++++++++++---
 net/ipv4/netfilter/ipt_owner.c  |  1 +
 net/ipv6/netfilter/ip6t_owner.c |  1 +
 security/selinux/hooks.c        |  2 +-
 9 files changed, 39 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/arch/mips/kernel/irixioctl.c b/arch/mips/kernel/irixioctl.c
index 4cd3d38a22c..3cdc22346f4 100644
--- a/arch/mips/kernel/irixioctl.c
+++ b/arch/mips/kernel/irixioctl.c
@@ -14,6 +14,7 @@
 #include <linux/syscalls.h>
 #include <linux/tty.h>
 #include <linux/file.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctl.h>
@@ -33,7 +34,7 @@ static struct tty_struct *get_tty(int fd)
 	struct file *filp;
 	struct tty_struct *ttyp = NULL;
 
-	spin_lock(&current->files->file_lock);
+	rcu_read_lock();
 	filp = fcheck(fd);
 	if(filp && filp->private_data) {
 		ttyp = (struct tty_struct *) filp->private_data;
@@ -41,7 +42,7 @@ static struct tty_struct *get_tty(int fd)
 		if(ttyp->magic != TTY_MAGIC)
 			ttyp =NULL;
 	}
-	spin_unlock(&current->files->file_lock);
+	rcu_read_unlock();
 	return ttyp;
 }
 
diff --git a/arch/sparc64/solaris/ioctl.c b/arch/sparc64/solaris/ioctl.c
index 374766455f5..be0a054e3ed 100644
--- a/arch/sparc64/solaris/ioctl.c
+++ b/arch/sparc64/solaris/ioctl.c
@@ -24,6 +24,7 @@
 #include <linux/netdevice.h>
 #include <linux/mtio.h>
 #include <linux/time.h>
+#include <linux/rcupdate.h>
 #include <linux/compat.h>
 
 #include <net/sock.h>
@@ -295,16 +296,16 @@ static inline int solaris_sockmod(unsigned int fd, unsigned int cmd, u32 arg)
 	struct inode *ino;
 	struct fdtable *fdt;
 	/* I wonder which of these tests are superfluous... --patrik */
-	spin_lock(&current->files->file_lock);
+	rcu_read_lock();
 	fdt = files_fdtable(current->files);
 	if (! fdt->fd[fd] ||
 	    ! fdt->fd[fd]->f_dentry ||
 	    ! (ino = fdt->fd[fd]->f_dentry->d_inode) ||
 	    ! S_ISSOCK(ino->i_mode)) {
-		spin_unlock(&current->files->file_lock);
+		rcu_read_unlock();
 		return TBADF;
 	}
-	spin_unlock(&current->files->file_lock);
+	rcu_read_unlock();
 	
 	switch (cmd & 0xff) {
 	case 109: /* SI_SOCKPARAMS */
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 0bfc7af6891..e5953f3433f 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2480,7 +2480,7 @@ static void __do_SAK(void *arg)
 		}
 		task_lock(p);
 		if (p->files) {
-			spin_lock(&p->files->file_lock);
+			rcu_read_lock();
 			fdt = files_fdtable(p->files);
 			for (i=0; i < fdt->max_fds; i++) {
 				filp = fcheck_files(p->files, i);
@@ -2495,7 +2495,7 @@ static void __do_SAK(void *arg)
 					break;
 				}
 			}
-			spin_unlock(&p->files->file_lock);
+			rcu_read_unlock();
 		}
 		task_unlock(p);
 	} while_each_task_pid(session, PIDTYPE_SID, p);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d2f3ed8acd9..863b46e0d78 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -40,10 +40,10 @@ static inline int get_close_on_exec(unsigned int fd)
 	struct files_struct *files = current->files;
 	struct fdtable *fdt;
 	int res;
-	spin_lock(&files->file_lock);
+	rcu_read_lock();
 	fdt = files_fdtable(files);
 	res = FD_ISSET(fd, fdt->close_on_exec);
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock();
 	return res;
 }
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d0087a0b024..23db452ab42 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -62,6 +62,7 @@
 #include <linux/namespace.h>
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
+#include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -283,16 +284,16 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
 
 	files = get_files_struct(task);
 	if (files) {
-		spin_lock(&files->file_lock);
+		rcu_read_lock();
 		file = fcheck_files(files, fd);
 		if (file) {
 			*mnt = mntget(file->f_vfsmnt);
 			*dentry = dget(file->f_dentry);
-			spin_unlock(&files->file_lock);
+			rcu_read_unlock();
 			put_files_struct(files);
 			return 0;
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 		put_files_struct(files);
 	}
 	return -ENOENT;
@@ -1062,7 +1063,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 			files = get_files_struct(p);
 			if (!files)
 				goto out;
-			spin_lock(&files->file_lock);
+			rcu_read_lock();
 			fdt = files_fdtable(files);
 			for (fd = filp->f_pos-2;
 			     fd < fdt->max_fds;
@@ -1071,7 +1072,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 
 				if (!fcheck_files(files, fd))
 					continue;
-				spin_unlock(&files->file_lock);
+				rcu_read_unlock();
 
 				j = NUMBUF;
 				i = fd;
@@ -1083,12 +1084,12 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 
 				ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
 				if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
-					spin_lock(&files->file_lock);
+					rcu_read_lock();
 					break;
 				}
-				spin_lock(&files->file_lock);
+				rcu_read_lock();
 			}
-			spin_unlock(&files->file_lock);
+			rcu_read_unlock();
 			put_files_struct(files);
 	}
 out:
@@ -1263,9 +1264,9 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 	files = get_files_struct(task);
 	if (files) {
-		spin_lock(&files->file_lock);
+		rcu_read_lock();
 		if (fcheck_files(files, fd)) {
-			spin_unlock(&files->file_lock);
+			rcu_read_unlock();
 			put_files_struct(files);
 			if (task_dumpable(task)) {
 				inode->i_uid = task->euid;
@@ -1277,7 +1278,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 			security_task_to_inode(task, inode);
 			return 1;
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 		put_files_struct(files);
 	}
 	d_drop(dentry);
@@ -1369,7 +1370,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (!files)
 		goto out_unlock;
 	inode->i_mode = S_IFLNK;
-	spin_lock(&files->file_lock);
+	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (!file)
 		goto out_unlock2;
@@ -1377,7 +1378,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 		inode->i_mode |= S_IRUSR | S_IXUSR;
 	if (file->f_mode & 2)
 		inode->i_mode |= S_IWUSR | S_IXUSR;
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock();
 	put_files_struct(files);
 	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
@@ -1387,7 +1388,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	return NULL;
 
 out_unlock2:
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock();
 	put_files_struct(files);
 out_unlock:
 	iput(inode);
diff --git a/fs/select.c b/fs/select.c
index 2e56325c73c..f10a10317d5 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -22,6 +22,7 @@
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 
@@ -185,9 +186,9 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
 	int retval, i;
 	long __timeout = *timeout;
 
- 	spin_lock(&current->files->file_lock);
+	rcu_read_lock();
 	retval = max_select_fd(n, fds);
-	spin_unlock(&current->files->file_lock);
+	rcu_read_unlock();
 
 	if (retval < 0)
 		return retval;
@@ -329,8 +330,10 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
 		goto out_nofds;
 
 	/* max_fdset can increase, so grab it once to avoid race */
+	rcu_read_lock();
 	fdt = files_fdtable(current->files);
 	max_fdset = fdt->max_fdset;
+	rcu_read_unlock();
 	if (n > max_fdset)
 		n = max_fdset;
 
@@ -469,10 +472,14 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
 	struct poll_list *head;
  	struct poll_list *walk;
 	struct fdtable *fdt;
+	int max_fdset;
 
 	/* Do a sanity check on nfds ... */
+	rcu_read_lock();
 	fdt = files_fdtable(current->files);
-	if (nfds > fdt->max_fdset && nfds > OPEN_MAX)
+	max_fdset = fdt->max_fdset;
+	rcu_read_unlock();
+	if (nfds > max_fdset && nfds > OPEN_MAX)
 		return -EINVAL;
 
 	if (timeout) {
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index c1889f88262..0cee2862ed8 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/file.h>
+#include <linux/rcupdate.h>
 #include <net/sock.h>
 
 #include <linux/netfilter_ipv4/ipt_owner.h>
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index 9b91decbfdd..4de4cdad4b7 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/file.h>
+#include <linux/rcupdate.h>
 #include <net/sock.h>
 
 #include <linux/netfilter_ipv6/ip6t_owner.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index acb5a495a90..f40c8221ec1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1652,7 +1652,7 @@ static inline void flush_unauthorized_files(struct files_struct * files)
 						continue;
 					}
 					if (devnull) {
-						atomic_inc(&devnull->f_count);
+						rcuref_inc(&devnull->f_count);
 					} else {
 						devnull = dentry_open(dget(selinux_null), mntget(selinuxfs_mount), O_RDWR);
 						if (!devnull) {
-- 
cgit v1.2.3-70-g09d2


From 93fa58cb831337fdf5d36b3b913441100a484dae Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:18 -0700
Subject: [PATCH] v9fs: Documentation, Makefiles, Configuration

OVERVIEW

V9FS is a distributed file system for Linux which provides an
implementation of the Plan 9 resource sharing protocol 9P.  It can be
used to share all sorts of resources: static files, synthetic file servers
(such as /proc or /sys), devices, and application file servers (such as
FUSE).

BACKGROUND

Plan 9 (http://plan9.bell-labs.com/plan9) is a research operating
system and associated applications suite developed by the Computing
Science Research Center of AT&T Bell Laboratories (now a part of
Lucent Technologies), the same group that developed UNIX , C, and C++.
Plan 9 was initially released in 1993 to universities, and then made
generally available in 1995. Its core operating systems code laid the
foundation for the Inferno Operating System released as a product by
Lucent Bell-Labs in 1997. The Inferno venture was the only commercial
embodiment of Plan 9 and is currently maintained as a product by Vita
Nuova (http://www.vitanuova.com). After updated releases in 2000 and
2002, Plan 9 was open-sourced under the OSI approved Lucent Public
License in 2003.

The Plan 9 project was started by Ken Thompson and Rob Pike in 1985.
Their intent was to explore potential solutions to some of the
shortcomings of UNIX in the face of the widespread use of high-speed
networks to connect machines. In UNIX, networking was an afterthought
and UNIX clusters became little more than a network of stand-alone
systems. Plan 9 was designed from first principles as a seamless
distributed system with integrated secure network resource sharing.
Applications and services were architected in such a way as to allow
for implicit distribution across a cluster of systems. Configuring an
environment to use remote application components or services in place
of their local equivalent could be achieved with a few simple command
line instructions. For the most part, application implementations
operated independent of the location of their actual resources.

Commercial operating systems haven't changed much in the 20 years
since Plan 9 was conceived. Network and distributed systems support is
provided by a patchwork of middle-ware, with an endless number of
packages supplying pieces of the puzzle. Matters are complicated by
the use of different complicated protocols for individual services,
and separate implementations for kernel and application resources.
The V9FS project (http://v9fs.sourceforge.net) is an attempt to bring
Plan 9's unified approach to resource sharing to Linux and other
operating systems via support for the 9P2000 resource sharing
protocol.

V9FS HISTORY

V9FS was originally developed by Ron Minnich and Maya Gokhale at Los
Alamos National Labs (LANL) in 1997.  In November of 2001, Greg Watson
setup a SourceForge project as a public repository for the code which
supported the Linux 2.4 kernel.

About a year ago, I picked up the initial attempt Ron Minnich had
made to provide 2.6 support and got the code integrated into a 2.6.5
kernel.   I then went through a line-for-line re-write attempting to
clean-up the code while more closely following the Linux Kernel style
guidelines.  I co-authored a paper with Ron Minnich on the V9FS Linux
support including performance comparisons to NFSv3 using Bonnie and
PostMark - this paper appeared at the USENIX/FREENIX 2005
conference in April 2005:
( http://www.usenix.org/events/usenix05/tech/freenix/hensbergen.html ).

CALL FOR PARTICIPATION/REQUEST FOR COMMENTS

Our 2.6 kernel support is stabilizing and we'd like to begin pursuing
its integration into the official kernel tree.  We would appreciate any
review, comments, critiques, and additions from this community and are
actively seeking people to join our project and help us produce
something that would be acceptable and useful to the Linux community.

STATUS

The code is reasonably stable, although there are no doubt corner cases
our regression tests haven't discovered yet.  It is in regular use by several
of the developers and has been tested on x86 and PowerPC
(32-bit and 64-bit) in both small and large (LANL cluster) deployments.
Our current regression tests include fsx, bonnie, and postmark.

It was our intention to keep things as simple as possible for this
release -- trying to focus on correctness within the core of the
protocol support versus a rich set of features.  For example: a more
complete security model and cache layer are in the road map, but
excluded from this release.   Additionally, we have removed support for
mmap operations at Al Viro's request.

PERFORMANCE

Detailed performance numbers and analysis are included in the FREENIX
paper, but we show comparable performance to NFSv3 for large file
operations based on the Bonnie benchmark, and superior performance for
many small file operations based on the PostMark benchmark.   Somewhat
preliminary graphs (from the FREENIX paper) are available
(http://v9fs.sourceforge.net/perf/index.html).

RESOURCES

The source code is available in a few different forms:

tarballs: http://v9fs.sf.net
CVSweb: http://cvs.sourceforge.net/viewcvs.py/v9fs/linux-9p/
CVS: :pserver:anonymous@cvs.sourceforge.net:/cvsroot/v9fs/linux-9p
Git: rsync://v9fs.graverobber.org/v9fs (webgit: http://v9fs.graverobber.org)
9P: tcp!v9fs.graverobber.org!6564

The user-level server is available from either the Plan 9 distribution
or from http://v9fs.sf.net
Other support applications are still being developed, but preliminary
version can be downloaded from sourceforge.

Documentation on the protocol has historically been the Plan 9 Man
pages (http://plan9.bell-labs.com/sys/man/5/INDEX.html), but there is
an effort under way to write a more complete Internet-Draft style
specification (http://v9fs.sf.net/rfc).

There are a couple of mailing lists supporting v9fs, but the most used
is v9fs-developer@lists.sourceforge.net -- please direct/cc your
comments there so the other v9fs contibutors can participate in the
conversation.  There is also an IRC channel: irc://freenode.net/#v9fs

This part of the patch contains Documentation, Makefiles, and configuration
file changes.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/v9fs.txt | 95 ++++++++++++++++++++++++++++++++++++++
 MAINTAINERS                        | 11 +++++
 fs/9p/Makefile                     | 17 +++++++
 fs/Kconfig                         | 11 +++++
 fs/Makefile                        |  1 +
 5 files changed, 135 insertions(+)
 create mode 100644 Documentation/filesystems/v9fs.txt
 create mode 100644 fs/9p/Makefile

(limited to 'fs')

diff --git a/Documentation/filesystems/v9fs.txt b/Documentation/filesystems/v9fs.txt
new file mode 100644
index 00000000000..4e92feb6b50
--- /dev/null
+++ b/Documentation/filesystems/v9fs.txt
@@ -0,0 +1,95 @@
+			V9FS: 9P2000 for Linux
+			======================
+
+ABOUT
+=====
+
+v9fs is a Unix implementation of the Plan 9 9p remote filesystem protocol.
+
+This software was originally developed by Ron Minnich <rminnich@lanl.gov>
+and Maya Gokhale <maya@lanl.gov>.  Additional development by Greg Watson
+<gwatson@lanl.gov> and most recently Eric Van Hensbergen
+<ericvh@gmail.com> and Latchesar Ionkov <lucho@ionkov.net>.
+
+USAGE
+=====
+
+For remote file server:
+
+	mount -t 9P 10.10.1.2 /mnt/9
+
+For Plan 9 From User Space applications (http://swtch.com/plan9)
+
+	mount -t 9P `namespace`/acme /mnt/9 -o proto=unix,name=$USER
+
+OPTIONS
+=======
+
+  proto=name	select an alternative transport.  Valid options are
+  		currently:
+ 			unix - specifying a named pipe mount point
+ 			tcp  - specifying a normal TCP/IP connection
+ 			fd   - used passed file descriptors for connection
+                                (see rfdno and wfdno)
+
+  name=name	user name to attempt mount as on the remote server.  The
+  		server may override or ignore this value.  Certain user
+		names may require authentication.
+
+  aname=name	aname specifies the file tree to access when the server is
+  		offering several exported file systems.
+
+  debug=n	specifies debug level.  The debug level is a bitmask.
+  			0x01 = display verbose error messages
+			0x02 = developer debug (DEBUG_CURRENT)
+			0x04 = display 9P trace
+			0x08 = display VFS trace
+			0x10 = display Marshalling debug
+			0x20 = display RPC debug
+			0x40 = display transport debug
+			0x80 = display allocation debug
+
+  rfdno=n	the file descriptor for reading with proto=fd
+
+  wfdno=n	the file descriptor for writing with proto=fd
+
+  maxdata=n	the number of bytes to use for 9P packet payload (msize)
+
+  port=n	port to connect to on the remote server
+
+  timeout=n	request timeouts (in ms) (default 60000ms)
+
+  noextend	force legacy mode (no 9P2000.u semantics)
+
+  uid		attempt to mount as a particular uid
+
+  gid		attempt to mount with a particular gid
+
+  afid		security channel - used by Plan 9 authentication protocols
+
+  nodevmap	do not map special files - represent them as normal files.
+  		This can be used to share devices/named pipes/sockets between
+		hosts.  This functionality will be expanded in later versions.
+
+RESOURCES
+=========
+
+The Linux version of the 9P server, along with some client-side utilities
+can be found at http://v9fs.sf.net (along with a CVS repository of the
+development branch of this module).  There are user and developer mailing
+lists here, as well as a bug-tracker.
+
+For more information on the Plan 9 Operating System check out
+http://plan9.bell-labs.com/plan9
+
+For information on Plan 9 from User Space (Plan 9 applications and libraries
+ported to Linux/BSD/OSX/etc) check out http://swtch.com/plan9
+
+
+STATUS
+======
+
+The 2.6 kernel support is working on PPC and x86.
+
+PLEASE USE THE SOURCEFORGE BUG-TRACKER TO REPORT PROBLEMS.
+
diff --git a/MAINTAINERS b/MAINTAINERS
index eaa46594f02..8429bdb1d2a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2684,6 +2684,17 @@ L:	rio500-users@lists.sourceforge.net
 W:	http://rio500.sourceforge.net
 S:	Maintained
 
+V9FS FILE SYSTEM
+P:      Eric Van Hensbergen
+M:      ericvh@gmail.com
+P:      Ron Minnich
+M:      rminnich@lanl.gov
+P:      Latchesar Ionkov
+M:      lucho@ionkov.net
+L:      v9fs-developer@lists.sourceforge.net
+W:      http://v9fs.sf.net
+S:      Maintained
+
 VIDEO FOR LINUX
 P:	Mauro Carvalho Chehab
 M:	mchehab@brturbo.com.br
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
new file mode 100644
index 00000000000..e4e4ffe5a7d
--- /dev/null
+++ b/fs/9p/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_9P_FS) := 9p2000.o
+
+9p2000-objs := \
+	vfs_super.o \
+	vfs_inode.o \
+	vfs_file.o \
+	vfs_dir.o \
+	vfs_dentry.o \
+	error.o \
+	mux.o \
+	trans_fd.o \
+	trans_sock.o \
+	9p.o \
+	conv.o \
+	v9fs.o \
+	fid.o
+
diff --git a/fs/Kconfig b/fs/Kconfig
index 5e817902cb3..443aed4e206 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1703,6 +1703,17 @@ config AFS_FS
 config RXRPC
 	tristate
 
+config 9P_FS
+	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
+	depends on INET && EXPERIMENTAL
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See <http://v9fs.sf.net> for more information.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Partition Types"
diff --git a/fs/Makefile b/fs/Makefile
index 15158309dee..d646502c1ef 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_RELAYFS_FS)	+= relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
+obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
-- 
cgit v1.2.3-70-g09d2


From e69e7fe5b0c86b7271045444a3a681136234c659 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:18 -0700
Subject: [PATCH] v9fs: VFS file, dentry, and directory operations

This part of the patch contains the VFS file, dentry & directory interfaces.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_dentry.c | 126 +++++++++++++++++
 fs/9p/vfs_dir.c    | 226 ++++++++++++++++++++++++++++++
 fs/9p/vfs_file.c   | 401 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 753 insertions(+)
 create mode 100644 fs/9p/vfs_dentry.c
 create mode 100644 fs/9p/vfs_dir.c
 create mode 100644 fs/9p/vfs_file.c

(limited to 'fs')

diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
new file mode 100644
index 00000000000..306c96741f8
--- /dev/null
+++ b/fs/9p/vfs_dentry.c
@@ -0,0 +1,126 @@
+/*
+ *  linux/fs/9p/vfs_dentry.c
+ *
+ * This file contians vfs dentry ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+
+/**
+ * v9fs_dentry_validate - VFS dcache hook to validate cache
+ * @dentry:  dentry that is being validated
+ * @nd: path data
+ *
+ * dcache really shouldn't be used for 9P2000 as at all due to
+ * potential attached semantics to directory traversal (walk).
+ *
+ * FUTURE: look into how to use dcache to allow multi-stage
+ * walks in Plan 9 & potential for better dcache operation which
+ * would remain valid for Plan 9 semantics.  Older versions
+ * had validation via stat for those interested.  However, since
+ * stat has the same approximate overhead as walk there really
+ * is no difference.  The only improvement would be from a
+ * time-decay cache like NFS has and that undermines the
+ * synchronous nature of 9P2000.
+ *
+ */
+
+static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct dentry *dc = current->fs->pwd;
+
+	dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry);
+	if (v9fs_fid_lookup(dentry, FID_OP)) {
+		dprintk(DEBUG_VFS, "VALID\n");
+		return 1;
+	}
+
+	while (dc != NULL) {
+		if (dc == dentry) {
+			dprintk(DEBUG_VFS, "VALID\n");
+			return 1;
+		}
+		if (dc == dc->d_parent)
+			break;
+
+		dc = dc->d_parent;
+	}
+
+	dprintk(DEBUG_VFS, "INVALID\n");
+	return 0;
+}
+
+/**
+ * v9fs_dentry_release - called when dentry is going to be freed
+ * @dentry:  dentry that is being release
+ *
+ */
+
+void v9fs_dentry_release(struct dentry *dentry)
+{
+	dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+
+	if (dentry->d_fsdata != NULL) {
+		struct list_head *fid_list = dentry->d_fsdata;
+		struct v9fs_fid *temp = NULL;
+		struct v9fs_fid *current_fid = NULL;
+		struct v9fs_fcall *fcall = NULL;
+
+		list_for_each_entry_safe(current_fid, temp, fid_list, list) {
+			if (v9fs_t_clunk
+			    (current_fid->v9ses, current_fid->fid, &fcall))
+				dprintk(DEBUG_ERROR, "clunk failed: %s\n",
+					FCALL_ERROR(fcall));
+
+			v9fs_put_idpool(current_fid->fid,
+					&current_fid->v9ses->fidpool);
+
+			kfree(fcall);
+			v9fs_fid_destroy(current_fid);
+		}
+
+		kfree(dentry->d_fsdata);	/* free the list_head */
+	}
+}
+
+struct dentry_operations v9fs_dentry_operations = {
+	.d_revalidate = v9fs_dentry_validate,
+	.d_release = v9fs_dentry_release,
+};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
new file mode 100644
index 00000000000..c478a738418
--- /dev/null
+++ b/fs/9p/vfs_dir.c
@@ -0,0 +1,226 @@
+/*
+ * linux/fs/9p/vfs_dir.c
+ *
+ * This file contains vfs directory ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+
+/**
+ * dt_type - return file type
+ * @mistat: mistat structure
+ *
+ */
+
+static inline int dt_type(struct v9fs_stat *mistat)
+{
+	unsigned long perm = mistat->mode;
+	int rettype = DT_REG;
+
+	if (perm & V9FS_DMDIR)
+		rettype = DT_DIR;
+	if (perm & V9FS_DMSYMLINK)
+		rettype = DT_LNK;
+
+	return rettype;
+}
+
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filep: opened file structure
+ * @dirent: directory structure ???
+ * @filldir: function to populate directory structure ???
+ *
+ */
+
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct v9fs_fcall *fcall = NULL;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *file = filp->private_data;
+	unsigned int i, n;
+	int fid = -1;
+	int ret = 0;
+	struct v9fs_stat *mi = NULL;
+	int over = 0;
+
+	dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name);
+
+	fid = file->fid;
+
+	mi = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	if (!mi)
+		return -ENOMEM;
+
+	if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) {
+		kfree(file->rdir_fcall);
+		file->rdir_fcall = NULL;
+	}
+
+	if (file->rdir_fcall) {
+		n = file->rdir_fcall->params.rread.count;
+		i = file->rdir_fpos;
+		while (i < n) {
+			int s = v9fs_deserialize_stat(v9ses,
+				  file->rdir_fcall->params.rread.data + i,
+			          n - i, mi, v9ses->maxdata);
+
+			if (s == 0) {
+				dprintk(DEBUG_ERROR,
+					"error while deserializing mistat\n");
+				ret = -EIO;
+				goto FreeStructs;
+			}
+
+			over = filldir(dirent, mi->name, strlen(mi->name),
+				    filp->f_pos, v9fs_qid2ino(&mi->qid),
+				    dt_type(mi));
+
+			if (over) {
+				file->rdir_fpos = i;
+				file->rdir_pos = filp->f_pos;
+				break;
+			}
+
+			i += s;
+			filp->f_pos += s;
+		}
+
+		if (!over) {
+			kfree(file->rdir_fcall);
+			file->rdir_fcall = NULL;
+		}
+	}
+
+	while (!over) {
+		ret = v9fs_t_read(v9ses, fid, filp->f_pos,
+					    v9ses->maxdata-V9FS_IOHDRSZ, &fcall);
+		if (ret < 0) {
+			dprintk(DEBUG_ERROR, "error while reading: %d: %p\n",
+				ret, fcall);
+			goto FreeStructs;
+		} else if (ret == 0)
+			break;
+
+		n = ret;
+		i = 0;
+		while (i < n) {
+			int s = v9fs_deserialize_stat(v9ses,
+			          fcall->params.rread.data + i, n - i, mi,
+			          v9ses->maxdata);
+
+			if (s == 0) {
+				dprintk(DEBUG_ERROR,
+					"error while deserializing mistat\n");
+				return -EIO;
+			}
+
+			over = filldir(dirent, mi->name, strlen(mi->name),
+				    filp->f_pos, v9fs_qid2ino(&mi->qid),
+				    dt_type(mi));
+
+			if (over) {
+				file->rdir_fcall = fcall;
+				file->rdir_fpos = i;
+				file->rdir_pos = filp->f_pos;
+				fcall = NULL;
+				break;
+			}
+
+			i += s;
+			filp->f_pos += s;
+		}
+
+		kfree(fcall);
+	}
+
+      FreeStructs:
+	kfree(fcall);
+	kfree(mi);
+	return ret;
+}
+
+/**
+ * v9fs_dir_release - close a directory
+ * @inode: inode of the directory
+ * @filp: file pointer to a directory
+ *
+ */
+
+int v9fs_dir_release(struct inode *inode, struct file *filp)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *fid = filp->private_data;
+	int fidnum = -1;
+
+	dprintk(DEBUG_VFS, "inode: %p filp: %p fid: %d\n", inode, filp,
+		fid->fid);
+	fidnum = fid->fid;
+
+	filemap_fdatawrite(inode->i_mapping);
+	filemap_fdatawait(inode->i_mapping);
+
+	if (fidnum >= 0) {
+		fid->fidopen--;
+		dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen,
+			fid->fid);
+
+		if (fid->fidopen == 0) {
+			if (v9fs_t_clunk(v9ses, fidnum, NULL))
+				dprintk(DEBUG_ERROR, "clunk failed\n");
+
+			v9fs_put_idpool(fid->fid, &v9ses->fidpool);
+		}
+
+		kfree(fid->rdir_fcall);
+
+		filp->private_data = NULL;
+		v9fs_fid_destroy(fid);
+	}
+
+	d_drop(filp->f_dentry);
+	return 0;
+}
+
+struct file_operations v9fs_dir_operations = {
+	.read = generic_read_dir,
+	.readdir = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
new file mode 100644
index 00000000000..1f8ae7d580a
--- /dev/null
+++ b/fs/9p/vfs_file.c
@@ -0,0 +1,401 @@
+/*
+ *  linux/fs/9p/vfs_file.c
+ *
+ * This file contians vfs file ops for 9P2000.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/version.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+
+/**
+ * v9fs_file_open - open a file (or directory)
+ * @inode: inode to be opened
+ * @file: file being opened
+ *
+ */
+
+int v9fs_file_open(struct inode *inode, struct file *file)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *v9fid = v9fs_fid_lookup(file->f_dentry, FID_WALK);
+	struct v9fs_fid *v9newfid = NULL;
+	struct v9fs_fcall *fcall = NULL;
+	int open_mode = 0;
+	unsigned int iounit = 0;
+	int newfid = -1;
+	long result = -1;
+
+	dprintk(DEBUG_VFS, "inode: %p file: %p v9fid= %p\n", inode, file,
+		v9fid);
+
+	if (!v9fid) {
+		struct dentry *dentry = file->f_dentry;
+		dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n");
+
+		/* XXX - some duplication from lookup, generalize later */
+		/* basically vfs_lookup is too heavy weight */
+		v9fid = v9fs_fid_lookup(file->f_dentry, FID_OP);
+		if (!v9fid)
+			return -EBADF;
+
+		v9fid = v9fs_fid_lookup(dentry->d_parent, FID_WALK);
+		if (!v9fid)
+			return -EBADF;
+
+		newfid = v9fs_get_idpool(&v9ses->fidpool);
+		if (newfid < 0) {
+			eprintk(KERN_WARNING, "newfid fails!\n");
+			return -ENOSPC;
+		}
+
+		result =
+		    v9fs_t_walk(v9ses, v9fid->fid, newfid,
+				(char *)file->f_dentry->d_name.name, NULL);
+		if (result < 0) {
+			v9fs_put_idpool(newfid, &v9ses->fidpool);
+			dprintk(DEBUG_ERROR, "rewalk didn't work\n");
+			return -EBADF;
+		}
+
+		v9fid = v9fs_fid_create(dentry);
+		if (v9fid == NULL) {
+			dprintk(DEBUG_ERROR, "couldn't insert\n");
+			return -ENOMEM;
+		}
+		v9fid->fid = newfid;
+	}
+
+	if (v9fid->fidcreate) {
+		/* create case */
+		newfid = v9fid->fid;
+		iounit = v9fid->iounit;
+		v9fid->fidcreate = 0;
+	} else {
+		if (!S_ISDIR(inode->i_mode))
+			newfid = v9fid->fid;
+		else {
+			newfid = v9fs_get_idpool(&v9ses->fidpool);
+			if (newfid < 0) {
+				eprintk(KERN_WARNING, "allocation failed\n");
+				return -ENOSPC;
+			}
+			/* This would be a somewhat critical clone */
+			result =
+			    v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL,
+					&fcall);
+			if (result < 0) {
+				dprintk(DEBUG_ERROR, "clone error: %s\n",
+					FCALL_ERROR(fcall));
+				kfree(fcall);
+				return result;
+			}
+
+			v9newfid = v9fs_fid_create(file->f_dentry);
+			v9newfid->fid = newfid;
+			v9newfid->qid = v9fid->qid;
+			v9newfid->iounit = v9fid->iounit;
+			v9newfid->fidopen = 0;
+			v9newfid->fidclunked = 0;
+			v9newfid->v9ses = v9ses;
+			v9fid = v9newfid;
+			kfree(fcall);
+		}
+
+		/* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */
+		/* translate open mode appropriately */
+		open_mode = file->f_flags & 0x3;
+
+		if (file->f_flags & O_EXCL)
+			open_mode |= V9FS_OEXCL;
+
+		if (v9ses->extended) {
+			if (file->f_flags & O_TRUNC)
+				open_mode |= V9FS_OTRUNC;
+
+			if (file->f_flags & O_APPEND)
+				open_mode |= V9FS_OAPPEND;
+		}
+
+		result = v9fs_t_open(v9ses, newfid, open_mode, &fcall);
+		if (result < 0) {
+			dprintk(DEBUG_ERROR,
+				"open failed, open_mode 0x%x: %s\n", open_mode,
+				FCALL_ERROR(fcall));
+			kfree(fcall);
+			return result;
+		}
+
+		iounit = fcall->params.ropen.iounit;
+		kfree(fcall);
+	}
+
+
+	file->private_data = v9fid;
+
+	v9fid->rdir_pos = 0;
+	v9fid->rdir_fcall = NULL;
+	v9fid->fidopen = 1;
+	v9fid->filp = file;
+	v9fid->iounit = iounit;
+
+	return 0;
+}
+
+/**
+ * v9fs_file_lock - lock a file (or directory)
+ * @inode: inode to be opened
+ * @file: file being opened
+ *
+ * XXX - this looks like a local only lock, we should extend into 9P
+ *       by using open exclusive
+ */
+
+static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	int res = 0;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	dprintk(DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+
+	/* No mandatory locks */
+	if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+		return -ENOLCK;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_fdatawrite(inode->i_mapping);
+		filemap_fdatawait(inode->i_mapping);
+		invalidate_inode_pages(&inode->i_data);
+	}
+
+	return res;
+}
+
+/**
+ * v9fs_read - read from a file (internal)
+ * @filep: file pointer to read
+ * @data: data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+
+static ssize_t
+v9fs_read(struct file *filp, char *buffer, size_t count, loff_t * offset)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *v9f = filp->private_data;
+	struct v9fs_fcall *fcall = NULL;
+	int fid = v9f->fid;
+	int rsize = 0;
+	int result = 0;
+	int total = 0;
+
+	dprintk(DEBUG_VFS, "\n");
+
+	rsize = v9ses->maxdata - V9FS_IOHDRSZ;
+	if (v9f->iounit != 0 && rsize > v9f->iounit)
+		rsize = v9f->iounit;
+
+	do {
+		if (count < rsize)
+			rsize = count;
+
+		result = v9fs_t_read(v9ses, fid, *offset, rsize, &fcall);
+
+		if (result < 0) {
+			printk(KERN_ERR "9P2000: v9fs_t_read returned %d\n",
+			       result);
+
+			kfree(fcall);
+			return total;
+		} else
+			*offset += result;
+
+		/* XXX - extra copy */
+		memcpy(buffer, fcall->params.rread.data, result);
+		count -= result;
+		buffer += result;
+		total += result;
+
+		kfree(fcall);
+
+		if (result < rsize)
+			break;
+	} while (count);
+
+	return total;
+}
+
+/**
+ * v9fs_file_read - read from a file
+ * @filep: file pointer to read
+ * @data: data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+
+static ssize_t
+v9fs_file_read(struct file *filp, char __user * data, size_t count,
+	       loff_t * offset)
+{
+	int retval = -1;
+	int ret = 0;
+	char *buffer;
+
+	buffer = kmalloc(count, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	retval = v9fs_read(filp, buffer, count, offset);
+	if (retval > 0) {
+		if ((ret = copy_to_user(data, buffer, retval)) != 0) {
+			dprintk(DEBUG_ERROR, "Problem copying to user %d\n",
+				ret);
+			retval = ret;
+		}
+	}
+
+	kfree(buffer);
+
+	return retval;
+}
+
+/**
+ * v9fs_write - write to a file
+ * @filep: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+
+static ssize_t
+v9fs_write(struct file *filp, char *buffer, size_t count, loff_t * offset)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *v9fid = filp->private_data;
+	struct v9fs_fcall *fcall;
+	int fid = v9fid->fid;
+	int result = -EIO;
+	int rsize = 0;
+	int total = 0;
+
+	dprintk(DEBUG_VFS, "data %p count %d offset %x\n", buffer, (int)count,
+		(int)*offset);
+	rsize = v9ses->maxdata - V9FS_IOHDRSZ;
+	if (v9fid->iounit != 0 && rsize > v9fid->iounit)
+		rsize = v9fid->iounit;
+
+	dump_data(buffer, count);
+
+	do {
+		if (count < rsize)
+			rsize = count;
+
+		result =
+		    v9fs_t_write(v9ses, fid, *offset, rsize, buffer, &fcall);
+		if (result < 0) {
+			eprintk(KERN_ERR, "error while writing: %s(%d)\n",
+				FCALL_ERROR(fcall), result);
+			kfree(fcall);
+			return result;
+		} else
+			*offset += result;
+
+		kfree(fcall);
+
+		if (result != rsize) {
+			eprintk(KERN_ERR,
+				"short write: v9fs_t_write returned %d\n",
+				result);
+			break;
+		}
+
+		count -= result;
+		buffer += result;
+		total += result;
+	} while (count);
+
+	return total;
+}
+
+/**
+ * v9fs_file_write - write to a file
+ * @filep: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+
+static ssize_t
+v9fs_file_write(struct file *filp, const char __user * data,
+		size_t count, loff_t * offset)
+{
+	int ret = -1;
+	char *buffer;
+
+	buffer = kmalloc(count, GFP_KERNEL);
+	if (buffer == NULL)
+		return -ENOMEM;
+
+	ret = copy_from_user(buffer, data, count);
+	if (ret) {
+		dprintk(DEBUG_ERROR, "Problem copying from user\n");
+		ret = -EFAULT;
+	} else {
+		ret = v9fs_write(filp, buffer, count, offset);
+	}
+
+	kfree(buffer);
+
+	return ret;
+}
+
+struct file_operations v9fs_file_operations = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+};
-- 
cgit v1.2.3-70-g09d2


From 2bad8471511ce5cc3ea90d0940622bd4b56b9cce Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:19 -0700
Subject: [PATCH] v9fs: VFS inode operations

This part of the patch contains the VFS inode interfaces.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_inode.c | 1371 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1371 insertions(+)
 create mode 100644 fs/9p/vfs_inode.c

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
new file mode 100644
index 00000000000..ef78af7ef04
--- /dev/null
+++ b/fs/9p/vfs_inode.c
@@ -0,0 +1,1371 @@
+/*
+ *  linux/fs/9p/vfs_inode.c
+ *
+ * This file contians vfs inode ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+
+static struct inode_operations v9fs_dir_inode_operations;
+static struct inode_operations v9fs_file_inode_operations;
+static struct inode_operations v9fs_symlink_inode_operations;
+
+/**
+ * unixmode2p9mode - convert unix mode bits to plan 9
+ * @v9ses: v9fs session information
+ * @mode: mode to convert
+ *
+ */
+
+static inline int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
+{
+	int res;
+	res = mode & 0777;
+	if (S_ISDIR(mode))
+		res |= V9FS_DMDIR;
+	if (v9ses->extended) {
+		if (S_ISLNK(mode))
+			res |= V9FS_DMSYMLINK;
+		if (v9ses->nodev == 0) {
+			if (S_ISSOCK(mode))
+				res |= V9FS_DMSOCKET;
+			if (S_ISFIFO(mode))
+				res |= V9FS_DMNAMEDPIPE;
+			if (S_ISBLK(mode))
+				res |= V9FS_DMDEVICE;
+			if (S_ISCHR(mode))
+				res |= V9FS_DMDEVICE;
+		}
+
+		if ((mode & S_ISUID) == S_ISUID)
+			res |= V9FS_DMSETUID;
+		if ((mode & S_ISGID) == S_ISGID)
+			res |= V9FS_DMSETGID;
+		if ((mode & V9FS_DMLINK))
+			res |= V9FS_DMLINK;
+	}
+
+	return res;
+}
+
+/**
+ * p9mode2unixmode- convert plan9 mode bits to unix mode bits
+ * @v9ses: v9fs session information
+ * @mode: mode to convert
+ *
+ */
+
+static inline int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+{
+	int res;
+
+	res = mode & 0777;
+
+	if ((mode & V9FS_DMDIR) == V9FS_DMDIR)
+		res |= S_IFDIR;
+	else if ((mode & V9FS_DMSYMLINK) && (v9ses->extended))
+		res |= S_IFLNK;
+	else if ((mode & V9FS_DMSOCKET) && (v9ses->extended)
+		 && (v9ses->nodev == 0))
+		res |= S_IFSOCK;
+	else if ((mode & V9FS_DMNAMEDPIPE) && (v9ses->extended)
+		 && (v9ses->nodev == 0))
+		res |= S_IFIFO;
+	else if ((mode & V9FS_DMDEVICE) && (v9ses->extended)
+		 && (v9ses->nodev == 0))
+		res |= S_IFBLK;
+	else
+		res |= S_IFREG;
+
+	if (v9ses->extended) {
+		if ((mode & V9FS_DMSETUID) == V9FS_DMSETUID)
+			res |= S_ISUID;
+
+		if ((mode & V9FS_DMSETGID) == V9FS_DMSETGID)
+			res |= S_ISGID;
+	}
+
+	return res;
+}
+
+/**
+ * v9fs_blank_mistat - helper function to setup a 9P stat structure
+ * @v9ses: 9P session info (for determining extended mode)
+ * @mistat: structure to initialize
+ *
+ */
+
+static inline void
+v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat)
+{
+	mistat->type = ~0;
+	mistat->dev = ~0;
+	mistat->qid.type = ~0;
+	mistat->qid.version = ~0;
+	*((long long *)&mistat->qid.path) = ~0;
+	mistat->mode = ~0;
+	mistat->atime = ~0;
+	mistat->mtime = ~0;
+	mistat->length = ~0;
+	mistat->name = mistat->data;
+	mistat->uid = mistat->data;
+	mistat->gid = mistat->data;
+	mistat->muid = mistat->data;
+	if (v9ses->extended) {
+		mistat->n_uid = ~0;
+		mistat->n_gid = ~0;
+		mistat->n_muid = ~0;
+		mistat->extension = mistat->data;
+	}
+	*mistat->data = 0;
+}
+
+/**
+ * v9fs_mistat2unix - convert mistat to unix stat
+ * @mistat: Plan 9 metadata (mistat) structure
+ * @stat: unix metadata (stat) structure to populate
+ * @sb: superblock
+ *
+ */
+
+static void
+v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf,
+		 struct super_block *sb)
+{
+	struct v9fs_session_info *v9ses = sb ? sb->s_fs_info : NULL;
+
+	buf->st_nlink = 1;
+
+	buf->st_atime = mistat->atime;
+	buf->st_mtime = mistat->mtime;
+	buf->st_ctime = mistat->mtime;
+
+	if (v9ses && v9ses->extended) {
+		/* TODO: string to uid mapping via user-space daemon */
+		buf->st_uid = mistat->n_uid;
+		buf->st_gid = mistat->n_gid;
+
+		sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid);
+		sscanf(mistat->gid, "%x", (unsigned int *)&buf->st_gid);
+	} else {
+		buf->st_uid = v9ses->uid;
+		buf->st_gid = v9ses->gid;
+	}
+
+	buf->st_uid = (unsigned short)-1;
+	buf->st_gid = (unsigned short)-1;
+
+	if (v9ses && v9ses->extended) {
+		if (mistat->n_uid != -1)
+			sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid);
+
+		if (mistat->n_gid != -1)
+			sscanf(mistat->gid, "%x", (unsigned int *)&buf->st_gid);
+	}
+
+	if (buf->st_uid == (unsigned short)-1)
+		buf->st_uid = v9ses->uid;
+	if (buf->st_gid == (unsigned short)-1)
+		buf->st_gid = v9ses->gid;
+
+	buf->st_mode = p9mode2unixmode(v9ses, mistat->mode);
+	if ((S_ISBLK(buf->st_mode)) || (S_ISCHR(buf->st_mode))) {
+		char type = 0;
+		int major = -1;
+		int minor = -1;
+		sscanf(mistat->extension, "%c %u %u", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			buf->st_mode &= ~S_IFBLK;
+			buf->st_mode |= S_IFCHR;
+			break;
+		case 'b':
+			break;
+		default:
+			dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n",
+				type, mistat->extension);
+		};
+		buf->st_rdev = MKDEV(major, minor);
+	} else
+		buf->st_rdev = 0;
+
+	buf->st_size = mistat->length;
+
+	buf->st_blksize = sb->s_blocksize;
+	buf->st_blocks =
+	    (buf->st_size + buf->st_blksize - 1) >> sb->s_blocksize_bits;
+}
+
+/**
+ * v9fs_get_inode - helper function to setup an inode
+ * @sb: superblock
+ * @mode: mode to setup inode with
+ *
+ */
+
+struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode = NULL;
+
+	dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
+
+	inode = new_inode(sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = sb->s_blocksize;
+		inode->i_blocks = 0;
+		inode->i_rdev = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+		switch (mode & S_IFMT) {
+		case S_IFIFO:
+		case S_IFBLK:
+		case S_IFCHR:
+		case S_IFSOCK:
+		case S_IFREG:
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_nlink++;
+			inode->i_op = &v9fs_dir_inode_operations;
+			inode->i_fop = &v9fs_dir_operations;
+			break;
+		case S_IFLNK:
+			inode->i_op = &v9fs_symlink_inode_operations;
+			break;
+		default:
+			dprintk(DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+				mode, mode & S_IFMT);
+			return ERR_PTR(-EINVAL);
+		}
+	} else {
+		eprintk(KERN_WARNING, "Problem allocating inode\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	return inode;
+}
+
+/**
+ * v9fs_create - helper function to create files and directories
+ * @dir: directory inode file is being created in
+ * @file_dentry: dentry file is being created in
+ * @perm: permissions file is being created with
+ * @open_mode: resulting open mode for file ???
+ *
+ */
+
+static int
+v9fs_create(struct inode *dir,
+	    struct dentry *file_dentry,
+	    unsigned int perm, unsigned int open_mode)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	struct super_block *sb = dir->i_sb;
+	struct v9fs_fid *dirfid =
+	    v9fs_fid_lookup(file_dentry->d_parent, FID_WALK);
+	struct v9fs_fid *fid = NULL;
+	struct inode *file_inode = NULL;
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_qid qid;
+	struct stat newstat;
+	int dirfidnum = -1;
+	long newfid = -1;
+	int result = 0;
+	unsigned int iounit = 0;
+
+	perm = unixmode2p9mode(v9ses, perm);
+
+	dprintk(DEBUG_VFS, "dir: %p dentry: %p perm: %o mode: %o\n", dir,
+		file_dentry, perm, open_mode);
+
+	if (!dirfid)
+		return -EBADF;
+
+	dirfidnum = dirfid->fid;
+	if (dirfidnum < 0) {
+		dprintk(DEBUG_ERROR, "No fid for the directory #%lu\n",
+			dir->i_ino);
+		return -EBADF;
+	}
+
+	if (file_dentry->d_inode) {
+		dprintk(DEBUG_ERROR,
+			"Odd. There is an inode for dir %lu, name :%s:\n",
+			dir->i_ino, file_dentry->d_name.name);
+		return -EEXIST;
+	}
+
+	newfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (newfid < 0) {
+		eprintk(KERN_WARNING, "no free fids available\n");
+		return -ENOSPC;
+	}
+
+	result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall));
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		newfid = 0;
+		goto CleanUpFid;
+	}
+
+	kfree(fcall);
+
+	result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name,
+			       perm, open_mode, &fcall);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "create fails: %s(%d)\n",
+			FCALL_ERROR(fcall), result);
+
+		goto CleanUpFid;
+	}
+
+	iounit = fcall->params.rcreate.iounit;
+	qid = fcall->params.rcreate.qid;
+	kfree(fcall);
+
+	fid = v9fs_fid_create(file_dentry);
+	if (!fid) {
+		result = -ENOMEM;
+		goto CleanUpFid;
+	}
+
+	fid->fid = newfid;
+	fid->fidopen = 0;
+	fid->fidcreate = 1;
+	fid->qid = qid;
+	fid->iounit = iounit;
+	fid->rdir_pos = 0;
+	fid->rdir_fcall = NULL;
+	fid->v9ses = v9ses;
+
+	if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) ||
+	    (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) ||
+	    (perm & V9FS_DMDEVICE))
+		return 0;
+
+	result = v9fs_t_stat(v9ses, newfid, &fcall);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall),
+			result);
+		goto CleanUpFid;
+	}
+
+	v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb);
+
+	file_inode = v9fs_get_inode(sb, newstat.st_mode);
+	if ((!file_inode) || IS_ERR(file_inode)) {
+		dprintk(DEBUG_ERROR, "create inode failed\n");
+		result = -EBADF;
+		goto CleanUpFid;
+	}
+
+	v9fs_mistat2inode(fcall->params.rstat.stat, file_inode, sb);
+	kfree(fcall);
+	d_instantiate(file_dentry, file_inode);
+
+	if (perm & V9FS_DMDIR) {
+		if (v9fs_t_clunk(v9ses, newfid, &fcall))
+			dprintk(DEBUG_ERROR, "clunk for mkdir failed: %s\n",
+				FCALL_ERROR(fcall));
+
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		kfree(fcall);
+		fid->fidopen = 0;
+		fid->fidcreate = 0;
+		d_drop(file_dentry);
+	}
+
+	return 0;
+
+      CleanUpFid:
+	kfree(fcall);
+
+	if (newfid) {
+		if (v9fs_t_clunk(v9ses, newfid, &fcall))
+			dprintk(DEBUG_ERROR, "clunk failed: %s\n",
+				FCALL_ERROR(fcall));
+
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		kfree(fcall);
+	}
+	return result;
+}
+
+/**
+ * v9fs_remove - helper function to remove files and directories
+ * @inode: directory inode that is being deleted
+ * @dentry:  dentry that is being deleted
+ * @rmdir: where we are a file or a directory
+ *
+ */
+
+static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
+{
+	struct v9fs_fcall *fcall = NULL;
+	struct super_block *sb = NULL;
+	struct v9fs_session_info *v9ses = NULL;
+	struct v9fs_fid *v9fid = NULL;
+	struct inode *file_inode = NULL;
+	int fid = -1;
+	int result = 0;
+
+	dprintk(DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
+		rmdir);
+
+	file_inode = file->d_inode;
+	sb = file_inode->i_sb;
+	v9ses = v9fs_inode2v9ses(file_inode);
+	v9fid = v9fs_fid_lookup(file, FID_OP);
+
+	if (!v9fid) {
+		dprintk(DEBUG_ERROR,
+			"no v9fs_fid\n");
+		return -EBADF;
+	}
+
+	fid = v9fid->fid;
+	if (fid < 0) {
+		dprintk(DEBUG_ERROR, "inode #%lu, no fid!\n",
+			file_inode->i_ino);
+		return -EBADF;
+	}
+
+	result = v9fs_t_remove(v9ses, fid, &fcall);
+	if (result < 0)
+		dprintk(DEBUG_ERROR, "remove of file fails: %s(%d)\n",
+			FCALL_ERROR(fcall), result);
+	else {
+		v9fs_put_idpool(fid, &v9ses->fidpool);
+		v9fs_fid_destroy(v9fid);
+	}
+
+	kfree(fcall);
+	return result;
+}
+
+/**
+ * v9fs_vfs_create - VFS hook to create files
+ * @inode: directory inode that is being deleted
+ * @dentry:  dentry that is being deleted
+ * @perm: create permissions
+ * @nd: path information
+ *
+ */
+
+static int
+v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
+		struct nameidata *nd)
+{
+	return v9fs_create(inode, dentry, perm, O_RDWR);
+}
+
+/**
+ * v9fs_vfs_mkdir - VFS mkdir hook to create a directory
+ * @i:  inode that is being unlinked
+ * @dentry: dentry that is being unlinked
+ * @mode: mode for new directory
+ *
+ */
+
+static int v9fs_vfs_mkdir(struct inode *inode, struct dentry *dentry, int mode)
+{
+	return v9fs_create(inode, dentry, mode | S_IFDIR, O_RDONLY);
+}
+
+/**
+ * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
+ * @dir:  inode that is being walked from
+ * @dentry: dentry that is being walked to?
+ * @nameidata: path data
+ *
+ */
+
+static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+				      struct nameidata *nameidata)
+{
+	struct super_block *sb;
+	struct v9fs_session_info *v9ses;
+	struct v9fs_fid *dirfid;
+	struct v9fs_fid *fid;
+	struct inode *inode;
+	struct v9fs_fcall *fcall = NULL;
+	struct stat newstat;
+	int dirfidnum = -1;
+	int newfid = -1;
+	int result = 0;
+
+	dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
+		dir, dentry->d_iname, dentry, nameidata);
+
+	sb = dir->i_sb;
+	v9ses = v9fs_inode2v9ses(dir);
+	dirfid = v9fs_fid_lookup(dentry->d_parent, FID_WALK);
+
+	if (!dirfid) {
+		dprintk(DEBUG_ERROR, "no dirfid\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	dirfidnum = dirfid->fid;
+
+	if (dirfidnum < 0) {
+		dprintk(DEBUG_ERROR, "no dirfid for inode %p, #%lu\n",
+			dir, dir->i_ino);
+		return ERR_PTR(-EBADF);
+	}
+
+	newfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (newfid < 0) {
+		eprintk(KERN_WARNING, "newfid fails!\n");
+		return ERR_PTR(-ENOSPC);
+	}
+
+	result =
+	    v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name,
+			NULL);
+	if (result < 0) {
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		if (result == -ENOENT) {
+			d_add(dentry, NULL);
+			dprintk(DEBUG_ERROR,
+				"Return negative dentry %p count %d\n",
+				dentry, atomic_read(&dentry->d_count));
+			return NULL;
+		}
+		dprintk(DEBUG_ERROR, "walk error:%d\n", result);
+		goto FreeFcall;
+	}
+
+	result = v9fs_t_stat(v9ses, newfid, &fcall);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "stat error\n");
+		goto FreeFcall;
+	}
+
+	v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb);
+	inode = v9fs_get_inode(sb, newstat.st_mode);
+
+	if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) {
+		eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n",
+			PTR_ERR(inode));
+
+		result = -ENOSPC;
+		goto FreeFcall;
+	}
+
+	inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat->qid);
+
+	fid = v9fs_fid_create(dentry);
+	if (fid == NULL) {
+		dprintk(DEBUG_ERROR, "couldn't insert\n");
+		result = -ENOMEM;
+		goto FreeFcall;
+	}
+
+	fid->fid = newfid;
+	fid->fidopen = 0;
+	fid->v9ses = v9ses;
+	fid->qid = fcall->params.rstat.stat->qid;
+
+	dentry->d_op = &v9fs_dentry_operations;
+	v9fs_mistat2inode(fcall->params.rstat.stat, inode, inode->i_sb);
+
+	d_add(dentry, inode);
+	kfree(fcall);
+
+	return NULL;
+
+      FreeFcall:
+	kfree(fcall);
+	return ERR_PTR(result);
+}
+
+/**
+ * v9fs_vfs_unlink - VFS unlink hook to delete an inode
+ * @i:  inode that is being unlinked
+ * @dentry: dentry that is being unlinked
+ *
+ */
+
+static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+{
+	return v9fs_remove(i, d, 0);
+}
+
+/**
+ * v9fs_vfs_rmdir - VFS unlink hook to delete a directory
+ * @i:  inode that is being unlinked
+ * @dentry: dentry that is being unlinked
+ *
+ */
+
+static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+{
+	return v9fs_remove(i, d, 1);
+}
+
+/**
+ * v9fs_vfs_rename - VFS hook to rename an inode
+ * @old_dir:  old dir inode
+ * @old_dentry: old dentry
+ * @new_dir: new dir inode
+ * @new_dentry: new dentry
+ *
+ */
+
+static int
+v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = old_dentry->d_inode;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(old_inode);
+	struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry, FID_WALK);
+	struct v9fs_fid *olddirfid =
+	    v9fs_fid_lookup(old_dentry->d_parent, FID_WALK);
+	struct v9fs_fid *newdirfid =
+	    v9fs_fid_lookup(new_dentry->d_parent, FID_WALK);
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	struct v9fs_fcall *fcall = NULL;
+	int fid = -1;
+	int olddirfidnum = -1;
+	int newdirfidnum = -1;
+	int retval = 0;
+
+	dprintk(DEBUG_VFS, "\n");
+
+	if ((!oldfid) || (!olddirfid) || (!newdirfid)) {
+		dprintk(DEBUG_ERROR, "problem with arguments\n");
+		return -EBADF;
+	}
+
+	/* 9P can only handle file rename in the same directory */
+	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
+		dprintk(DEBUG_ERROR, "old dir and new dir are different\n");
+		retval = -EPERM;
+		goto FreeFcallnBail;
+	}
+
+	fid = oldfid->fid;
+	olddirfidnum = olddirfid->fid;
+	newdirfidnum = newdirfid->fid;
+
+	if (fid < 0) {
+		dprintk(DEBUG_ERROR, "no fid for old file #%lu\n",
+			old_inode->i_ino);
+		retval = -EBADF;
+		goto FreeFcallnBail;
+	}
+
+	v9fs_blank_mistat(v9ses, mistat);
+
+	strcpy(mistat->data + 1, v9ses->name);
+	mistat->name = mistat->data + 1 + strlen(v9ses->name);
+
+	if (new_dentry->d_name.len >
+	    (v9ses->maxdata - strlen(v9ses->name) - sizeof(struct v9fs_stat))) {
+		dprintk(DEBUG_ERROR, "new name too long\n");
+		goto FreeFcallnBail;
+	}
+
+	strcpy(mistat->name, new_dentry->d_name.name);
+	retval = v9fs_t_wstat(v9ses, fid, mistat, &fcall);
+
+      FreeFcallnBail:
+	kfree(mistat);
+
+	if (retval < 0)
+		dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+			FCALL_ERROR(fcall));
+
+	kfree(fcall);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_getattr - retreive file metadata
+ * @mnt - mount information
+ * @dentry - file to get attributes on
+ * @stat - metadata structure to populate
+ *
+ */
+
+static int
+v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+	int err = -EPERM;
+
+	dprintk(DEBUG_VFS, "dentry: %p\n", dentry);
+	if (!fid) {
+		dprintk(DEBUG_ERROR,
+			"couldn't find fid associated with dentry\n");
+		return -EBADF;
+	}
+
+	err = v9fs_t_stat(v9ses, fid->fid, &fcall);
+
+	if (err < 0)
+		dprintk(DEBUG_ERROR, "stat error\n");
+	else {
+		v9fs_mistat2inode(fcall->params.rstat.stat, dentry->d_inode,
+				  dentry->d_inode->i_sb);
+		generic_fillattr(dentry->d_inode, stat);
+	}
+
+	kfree(fcall);
+	return err;
+}
+
+/**
+ * v9fs_vfs_setattr - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+
+static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	struct v9fs_fcall *fcall = NULL;
+	int res = -EPERM;
+
+	dprintk(DEBUG_VFS, "\n");
+	if (!fid) {
+		dprintk(DEBUG_ERROR,
+			"Couldn't find fid associated with dentry\n");
+		return -EBADF;
+	}
+
+	if (!mistat)
+		return -ENOMEM;
+
+	v9fs_blank_mistat(v9ses, mistat);
+	if (iattr->ia_valid & ATTR_MODE)
+		mistat->mode = unixmode2p9mode(v9ses, iattr->ia_mode);
+
+	if (iattr->ia_valid & ATTR_MTIME)
+		mistat->mtime = iattr->ia_mtime.tv_sec;
+
+	if (iattr->ia_valid & ATTR_ATIME)
+		mistat->atime = iattr->ia_atime.tv_sec;
+
+	if (iattr->ia_valid & ATTR_SIZE)
+		mistat->length = iattr->ia_size;
+
+	if (v9ses->extended) {
+		char *uid = kmalloc(strlen(mistat->uid), GFP_KERNEL);
+		char *gid = kmalloc(strlen(mistat->gid), GFP_KERNEL);
+		char *muid = kmalloc(strlen(mistat->muid), GFP_KERNEL);
+		char *name = kmalloc(strlen(mistat->name), GFP_KERNEL);
+		char *extension = kmalloc(strlen(mistat->extension),
+					  GFP_KERNEL);
+
+		if ((!uid) || (!gid) || (!muid) || (!name) || (!extension)) {
+			kfree(uid);
+			kfree(gid);
+			kfree(muid);
+			kfree(name);
+			kfree(extension);
+
+			return -ENOMEM;
+		}
+
+		strcpy(uid, mistat->uid);
+		strcpy(gid, mistat->gid);
+		strcpy(muid, mistat->muid);
+		strcpy(name, mistat->name);
+		strcpy(extension, mistat->extension);
+
+		if (iattr->ia_valid & ATTR_UID) {
+			if (strlen(uid) != 8) {
+				dprintk(DEBUG_ERROR, "uid strlen is %u not 8\n",
+					(unsigned int)strlen(uid));
+				sprintf(uid, "%08x", iattr->ia_uid);
+			} else {
+				kfree(uid);
+				uid = kmalloc(9, GFP_KERNEL);
+			}
+
+			sprintf(uid, "%08x", iattr->ia_uid);
+			mistat->n_uid = iattr->ia_uid;
+		}
+
+		if (iattr->ia_valid & ATTR_GID) {
+			if (strlen(gid) != 8)
+				dprintk(DEBUG_ERROR, "gid strlen is %u not 8\n",
+					(unsigned int)strlen(gid));
+			else {
+				kfree(gid);
+				gid = kmalloc(9, GFP_KERNEL);
+			}
+
+			sprintf(gid, "%08x", iattr->ia_gid);
+			mistat->n_gid = iattr->ia_gid;
+		}
+
+		mistat->uid = mistat->data;
+		strcpy(mistat->uid, uid);
+		mistat->gid = mistat->data + strlen(uid) + 1;
+		strcpy(mistat->gid, gid);
+		mistat->muid = mistat->gid + strlen(gid) + 1;
+		strcpy(mistat->muid, muid);
+		mistat->name = mistat->muid + strlen(muid) + 1;
+		strcpy(mistat->name, name);
+		mistat->extension = mistat->name + strlen(name) + 1;
+		strcpy(mistat->extension, extension);
+
+		kfree(uid);
+		kfree(gid);
+		kfree(muid);
+		kfree(name);
+		kfree(extension);
+	}
+
+	res = v9fs_t_wstat(v9ses, fid->fid, mistat, &fcall);
+
+	if (res < 0)
+		dprintk(DEBUG_ERROR, "wstat error: %s\n", FCALL_ERROR(fcall));
+
+	kfree(mistat);
+	kfree(fcall);
+
+	if (res >= 0)
+		res = inode_setattr(dentry->d_inode, iattr);
+
+	return res;
+}
+
+/**
+ * v9fs_mistat2inode - populate an inode structure with mistat info
+ * @mistat: Plan 9 metadata (mistat) structure
+ * @inode: inode to populate
+ * @sb: superblock of filesystem
+ *
+ */
+
+void
+v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode,
+		  struct super_block *sb)
+{
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	inode->i_nlink = 1;
+
+	inode->i_atime.tv_sec = mistat->atime;
+	inode->i_mtime.tv_sec = mistat->mtime;
+	inode->i_ctime.tv_sec = mistat->mtime;
+
+	inode->i_uid = -1;
+	inode->i_gid = -1;
+
+	if (v9ses->extended) {
+		/* TODO: string to uid mapping via user-space daemon */
+		inode->i_uid = mistat->n_uid;
+		inode->i_gid = mistat->n_gid;
+
+		if (mistat->n_uid == -1)
+			sscanf(mistat->uid, "%x", &inode->i_uid);
+
+		if (mistat->n_gid == -1)
+			sscanf(mistat->gid, "%x", &inode->i_gid);
+	}
+
+	if (inode->i_uid == -1)
+		inode->i_uid = v9ses->uid;
+	if (inode->i_gid == -1)
+		inode->i_gid = v9ses->gid;
+
+	inode->i_mode = p9mode2unixmode(v9ses, mistat->mode);
+	if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
+		char type = 0;
+		int major = -1;
+		int minor = -1;
+		sscanf(mistat->extension, "%c %u %u", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			inode->i_mode &= ~S_IFBLK;
+			inode->i_mode |= S_IFCHR;
+			break;
+		case 'b':
+			break;
+		default:
+			dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n",
+				type, mistat->extension);
+		};
+		inode->i_rdev = MKDEV(major, minor);
+	} else
+		inode->i_rdev = 0;
+
+	inode->i_size = mistat->length;
+
+	inode->i_blksize = sb->s_blocksize;
+	inode->i_blocks =
+	    (inode->i_size + inode->i_blksize - 1) >> sb->s_blocksize_bits;
+}
+
+/**
+ * v9fs_qid2ino - convert qid into inode number
+ * @qid: qid to hash
+ *
+ * BUG: potential for inode number collisions?
+ */
+
+ino_t v9fs_qid2ino(struct v9fs_qid *qid)
+{
+	u64 path = qid->path + 2;
+	ino_t i = 0;
+
+	if (sizeof(ino_t) == sizeof(path))
+		memcpy(&i, &path, sizeof(ino_t));
+	else
+		i = (ino_t) (path ^ (path >> 32));
+
+	return i;
+}
+
+/**
+ * v9fs_vfs_symlink - helper function to create symlinks
+ * @dir: directory inode containing symlink
+ * @dentry: dentry for symlink
+ * @symname: symlink data
+ *
+ * See 9P2000.u RFC for more information
+ *
+ */
+
+static int
+v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int retval = -EPERM;
+	struct v9fs_fid *newfid;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	struct super_block *sb = dir ? dir->i_sb : NULL;
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+
+	dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
+		symname);
+
+	if ((!dentry) || (!sb) || (!v9ses)) {
+		dprintk(DEBUG_ERROR, "problem with arguments\n");
+		return -EBADF;
+	}
+
+	if (!v9ses->extended) {
+		dprintk(DEBUG_ERROR, "not extended\n");
+		goto FreeFcall;
+	}
+
+	/* issue a create */
+	retval = v9fs_create(dir, dentry, S_IFLNK, 0);
+	if (retval != 0)
+		goto FreeFcall;
+
+	newfid = v9fs_fid_lookup(dentry, FID_OP);
+
+	/* issue a twstat */
+	v9fs_blank_mistat(v9ses, mistat);
+	strcpy(mistat->data + 1, symname);
+	mistat->extension = mistat->data + 1;
+	retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+	if (retval < 0) {
+		dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeFcall;
+	}
+
+	kfree(fcall);
+
+	if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+		dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeFcall;
+	}
+
+	d_drop(dentry);		/* FID - will this also clunk? */
+
+      FreeFcall:
+	kfree(mistat);
+	kfree(fcall);
+
+	return retval;
+}
+
+/**
+ * v9fs_readlink - read a symlink's location (internal version)
+ * @dentry: dentry for symlink
+ * @buf: buffer to load symlink location into
+ * @buflen: length of buffer
+ *
+ */
+
+static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+	int retval = -EPERM;
+
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+
+	if (!fid) {
+		dprintk(DEBUG_ERROR, "could not resolve fid from dentry\n");
+		retval = -EBADF;
+		goto FreeFcall;
+	}
+
+	if (!v9ses->extended) {
+		retval = -EBADF;
+		dprintk(DEBUG_ERROR, "not extended\n");
+		goto FreeFcall;
+	}
+
+	dprintk(DEBUG_VFS, " %s\n", dentry->d_name.name);
+	retval = v9fs_t_stat(v9ses, fid->fid, &fcall);
+
+	if (retval < 0) {
+		dprintk(DEBUG_ERROR, "stat error\n");
+		goto FreeFcall;
+	}
+
+	if (!fcall)
+		return -EIO;
+
+	if (!(fcall->params.rstat.stat->mode & V9FS_DMSYMLINK)) {
+		retval = -EINVAL;
+		goto FreeFcall;
+	}
+
+	/* copy extension buffer into buffer */
+	if (strlen(fcall->params.rstat.stat->extension) < buflen)
+		buflen = strlen(fcall->params.rstat.stat->extension);
+
+	memcpy(buffer, fcall->params.rstat.stat->extension, buflen + 1);
+
+	retval = buflen;
+
+      FreeFcall:
+	kfree(fcall);
+
+	return retval;
+}
+
+/**
+ * v9fs_vfs_readlink - read a symlink's location
+ * @dentry: dentry for symlink
+ * @buf: buffer to load symlink location into
+ * @buflen: length of buffer
+ *
+ */
+
+static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
+			     int buflen)
+{
+	int retval;
+	int ret;
+	char *link = __getname();
+
+	if (strlen(link) < buflen)
+		buflen = strlen(link);
+
+	dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+
+	retval = v9fs_readlink(dentry, link, buflen);
+
+	if (retval > 0) {
+		if ((ret = copy_to_user(buffer, link, retval)) != 0) {
+			dprintk(DEBUG_ERROR, "problem copying to user: %d\n",
+				ret);
+			retval = ret;
+		}
+	}
+
+	putname(link);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_follow_link - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int len = 0;
+	char *link = __getname();
+
+	dprintk(DEBUG_VFS, "%s n", dentry->d_name.name);
+
+	if (!link)
+		link = ERR_PTR(-ENOMEM);
+	else {
+		len = v9fs_readlink(dentry, link, strlen(link));
+
+		if (len < 0) {
+			putname(link);
+			link = ERR_PTR(len);
+		} else
+			link[len] = 0;
+	}
+	nd_set_link(nd, link);
+
+	return NULL;
+}
+
+/**
+ * v9fs_vfs_put_link - release a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+	char *s = nd_get_link(nd);
+
+	dprintk(DEBUG_VFS, " %s %s\n", dentry->d_name.name, s);
+	if (!IS_ERR(s))
+		putname(s);
+}
+
+/**
+ * v9fs_vfs_link - create a hardlink
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @new_dentry: dentry for link
+ *
+ */
+
+/* XXX - lots of code dup'd from symlink and creates,
+ * figure out a better reuse strategy
+ */
+
+static int
+v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
+	      struct dentry *dentry)
+{
+	int retval = -EPERM;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry, FID_OP);
+	struct v9fs_fid *newfid = NULL;
+	char *symname = __getname();
+
+	dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
+		old_dentry->d_name.name);
+
+	if (!v9ses->extended) {
+		dprintk(DEBUG_ERROR, "not extended\n");
+		goto FreeMem;
+	}
+
+	/* get fid of old_dentry */
+	sprintf(symname, "hardlink(%d)\n", oldfid->fid);
+
+	/* issue a create */
+	retval = v9fs_create(dir, dentry, V9FS_DMLINK, 0);
+	if (retval != 0)
+		goto FreeMem;
+
+	newfid = v9fs_fid_lookup(dentry, FID_OP);
+	if (!newfid) {
+		dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
+		goto FreeMem;
+	}
+
+	/* issue a twstat */
+	v9fs_blank_mistat(v9ses, mistat);
+	strcpy(mistat->data + 1, symname);
+	mistat->extension = mistat->data + 1;
+	retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+	if (retval < 0) {
+		dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeMem;
+	}
+
+	kfree(fcall);
+
+	if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+		dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeMem;
+	}
+
+	d_drop(dentry);		/* FID - will this also clunk? */
+
+	kfree(fcall);
+	fcall = NULL;
+
+      FreeMem:
+	kfree(mistat);
+	kfree(fcall);
+	putname(symname);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_mknod - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @mode: mode for creation
+ * @dev_t: device associated with special file
+ *
+ */
+
+static int
+v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+{
+	int retval = -EPERM;
+	struct v9fs_fid *newfid;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	char *symname = __getname();
+
+	dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+		dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+
+	if (!new_valid_dev(rdev)) {
+		retval = -EINVAL;
+		goto FreeMem;
+	}
+
+	if (!v9ses->extended) {
+		dprintk(DEBUG_ERROR, "not extended\n");
+		goto FreeMem;
+	}
+
+	/* issue a create */
+	retval = v9fs_create(dir, dentry, mode, 0);
+
+	if (retval != 0)
+		goto FreeMem;
+
+	newfid = v9fs_fid_lookup(dentry, FID_OP);
+	if (!newfid) {
+		dprintk(DEBUG_ERROR, "coudn't resove fid from dentry\n");
+		retval = -EINVAL;
+		goto FreeMem;
+	}
+
+	/* build extension */
+	if (S_ISBLK(mode))
+		sprintf(symname, "b %u %u", MAJOR(rdev), MINOR(rdev));
+	else if (S_ISCHR(mode))
+		sprintf(symname, "c %u %u", MAJOR(rdev), MINOR(rdev));
+	else if (S_ISFIFO(mode)) ;	/* DO NOTHING */
+	else {
+		retval = -EINVAL;
+		goto FreeMem;
+	}
+
+	if (!S_ISFIFO(mode)) {
+		/* issue a twstat */
+		v9fs_blank_mistat(v9ses, mistat);
+		strcpy(mistat->data + 1, symname);
+		mistat->extension = mistat->data + 1;
+		retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+		if (retval < 0) {
+			dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+				FCALL_ERROR(fcall));
+			goto FreeMem;
+		}
+
+		kfree(fcall);
+	}
+
+	/* need to update dcache so we show up */
+	kfree(fcall);
+
+	if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+		dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeMem;
+	}
+
+	d_drop(dentry);		/* FID - will this also clunk? */
+
+      FreeMem:
+	kfree(mistat);
+	kfree(fcall);
+	putname(symname);
+
+	return retval;
+}
+
+static struct inode_operations v9fs_dir_inode_operations = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.readlink = v9fs_vfs_readlink,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static struct inode_operations v9fs_file_inode_operations = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static struct inode_operations v9fs_symlink_inode_operations = {
+	.readlink = v9fs_vfs_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
-- 
cgit v1.2.3-70-g09d2


From 9e82cf6a802a72f0f447eb4c76d6a3fc8736a31d Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:20 -0700
Subject: [PATCH] v9fs: VFS superblock operations and glue

This part of the patch contains VFS superblock and mapping code.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/v9fs.c      | 448 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/9p/v9fs.h      | 105 +++++++++++++
 fs/9p/v9fs_vfs.h  |  53 +++++++
 fs/9p/vfs_super.c | 271 +++++++++++++++++++++++++++++++++
 4 files changed, 877 insertions(+)
 create mode 100644 fs/9p/v9fs.c
 create mode 100644 fs/9p/v9fs.h
 create mode 100644 fs/9p/v9fs_vfs.h
 create mode 100644 fs/9p/vfs_super.c

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
new file mode 100644
index 00000000000..14d663ebfcb
--- /dev/null
+++ b/fs/9p/v9fs.c
@@ -0,0 +1,448 @@
+/*
+ *  linux/fs/9p/v9fs.c
+ *
+ *  This file contains functions assisting in mapping VFS to 9P2000
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/parser.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "transport.h"
+#include "mux.h"
+#include "conv.h"
+
+/* TODO: sysfs or debugfs interface */
+int v9fs_debug_level = 0;	/* feature-rific global debug level  */
+
+/*
+  * Option Parsing (code inspired by NFS code)
+  *
+  */
+
+enum {
+	/* Options that take integer arguments */
+	Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid, Opt_debug,
+	Opt_rfdno, Opt_wfdno,
+	/* String options */
+	Opt_name, Opt_remotename,
+	/* Options that take no arguments */
+	Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd,
+	/* Error token */
+	Opt_err
+};
+
+static match_table_t tokens = {
+	{Opt_port, "port=%u"},
+	{Opt_msize, "msize=%u"},
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_afid, "afid=%u"},
+	{Opt_rfdno, "rfdno=%u"},
+	{Opt_wfdno, "wfdno=%u"},
+	{Opt_debug, "debug=%u"},
+	{Opt_name, "name=%s"},
+	{Opt_remotename, "aname=%s"},
+	{Opt_unix, "proto=unix"},
+	{Opt_tcp, "proto=tcp"},
+	{Opt_fd, "proto=fd"},
+	{Opt_tcp, "tcp"},
+	{Opt_unix, "unix"},
+	{Opt_fd, "fd"},
+	{Opt_legacy, "noextend"},
+	{Opt_nodevmap, "nodevmap"},
+	{Opt_err, NULL}
+};
+
+/*
+ *  Parse option string.
+ */
+
+/**
+ * v9fs_parse_options - parse mount options into session structure
+ * @options: options string passed from mount
+ * @v9ses: existing v9fs session information
+ *
+ */
+
+static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int ret;
+
+	/* setup defaults */
+	v9ses->port = V9FS_PORT;
+	v9ses->maxdata = 9000;
+	v9ses->proto = PROTO_TCP;
+	v9ses->extended = 1;
+	v9ses->afid = ~0;
+	v9ses->debug = 0;
+	v9ses->rfdno = ~0;
+	v9ses->wfdno = ~0;
+
+	if (!options)
+		return;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		if (token < Opt_name) {
+			if ((ret = match_int(&args[0], &option)) < 0) {
+				dprintk(DEBUG_ERROR,
+					"integer field, but no integer?\n");
+				continue;
+			}
+
+		}
+		switch (token) {
+		case Opt_port:
+			v9ses->port = option;
+			break;
+		case Opt_msize:
+			v9ses->maxdata = option;
+			break;
+		case Opt_uid:
+			v9ses->uid = option;
+			break;
+		case Opt_gid:
+			v9ses->gid = option;
+			break;
+		case Opt_afid:
+			v9ses->afid = option;
+			break;
+		case Opt_rfdno:
+			v9ses->rfdno = option;
+			break;
+		case Opt_wfdno:
+			v9ses->wfdno = option;
+			break;
+		case Opt_debug:
+			v9ses->debug = option;
+			break;
+		case Opt_tcp:
+			v9ses->proto = PROTO_TCP;
+			break;
+		case Opt_unix:
+			v9ses->proto = PROTO_UNIX;
+			break;
+		case Opt_fd:
+			v9ses->proto = PROTO_FD;
+			break;
+		case Opt_name:
+			match_strcpy(v9ses->name, &args[0]);
+			break;
+		case Opt_remotename:
+			match_strcpy(v9ses->remotename, &args[0]);
+			break;
+		case Opt_legacy:
+			v9ses->extended = 0;
+			break;
+		case Opt_nodevmap:
+			v9ses->nodev = 1;
+			break;
+		default:
+			continue;
+		}
+	}
+}
+
+/**
+ * v9fs_inode2v9ses - safely extract v9fs session info from super block
+ * @inode: inode to extract information from
+ *
+ * Paranoid function to extract v9ses information from superblock,
+ * if anything is missing it will report an error.
+ *
+ */
+
+struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
+{
+	return (inode->i_sb->s_fs_info);
+}
+
+/**
+ * v9fs_get_idpool - allocate numeric id from pool
+ * @p - pool to allocate from
+ *
+ * XXX - This seems to be an awful generic function, should it be in idr.c with
+ *            the lock included in struct idr?
+ */
+
+int v9fs_get_idpool(struct v9fs_idpool *p)
+{
+	int i = 0;
+	int error;
+
+retry:
+	if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
+		return 0;
+
+	if (down_interruptible(&p->lock) == -EINTR) {
+		eprintk(KERN_WARNING, "Interrupted while locking\n");
+		return -1;
+	}
+
+	error = idr_get_new(&p->pool, NULL, &i);
+	up(&p->lock);
+
+	if (error == -EAGAIN)
+		goto retry;
+	else if (error)
+		return -1;
+
+	return i;
+}
+
+/**
+ * v9fs_put_idpool - release numeric id from pool
+ * @p - pool to allocate from
+ *
+ * XXX - This seems to be an awful generic function, should it be in idr.c with
+ *            the lock included in struct idr?
+ */
+
+void v9fs_put_idpool(int id, struct v9fs_idpool *p)
+{
+	if (down_interruptible(&p->lock) == -EINTR) {
+		eprintk(KERN_WARNING, "Interrupted while locking\n");
+		return;
+	}
+	idr_remove(&p->pool, id);
+	up(&p->lock);
+}
+
+/**
+ * v9fs_session_init - initialize session
+ * @v9ses: session information structure
+ * @dev_name: device being mounted
+ * @data: options
+ *
+ */
+
+int
+v9fs_session_init(struct v9fs_session_info *v9ses,
+		  const char *dev_name, char *data)
+{
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_transport *trans_proto;
+	int n = 0;
+	int newfid = -1;
+	int retval = -EINVAL;
+
+	v9ses->name = __getname();
+	if (!v9ses->name)
+		return -ENOMEM;
+
+	v9ses->remotename = __getname();
+	if (!v9ses->remotename) {
+		putname(v9ses->name);
+		return -ENOMEM;
+	}
+
+	strcpy(v9ses->name, V9FS_DEFUSER);
+	strcpy(v9ses->remotename, V9FS_DEFANAME);
+
+	v9fs_parse_options(data, v9ses);
+
+	/* set global debug level */
+	v9fs_debug_level = v9ses->debug;
+
+	/* id pools that are session-dependent: FIDs and TIDs */
+	idr_init(&v9ses->fidpool.pool);
+	init_MUTEX(&v9ses->fidpool.lock);
+	idr_init(&v9ses->tidpool.pool);
+	init_MUTEX(&v9ses->tidpool.lock);
+
+
+	switch (v9ses->proto) {
+	case PROTO_TCP:
+		trans_proto = &v9fs_trans_tcp;
+		break;
+	case PROTO_UNIX:
+		trans_proto = &v9fs_trans_unix;
+		*v9ses->remotename = 0;
+		break;
+	case PROTO_FD:
+		trans_proto = &v9fs_trans_fd;
+		*v9ses->remotename = 0;
+		if((v9ses->wfdno == ~0) || (v9ses->rfdno == ~0)) {
+			printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n");
+			retval = -ENOPROTOOPT;
+			goto SessCleanUp;
+		}
+		break;
+	default:
+		printk(KERN_ERR "v9fs: Bad mount protocol %d\n", v9ses->proto);
+		retval = -ENOPROTOOPT;
+		goto SessCleanUp;
+	};
+
+	v9ses->transport = trans_proto;
+
+	if ((retval = v9ses->transport->init(v9ses, dev_name, data)) < 0) {
+		eprintk(KERN_ERR, "problem initializing transport\n");
+		goto SessCleanUp;
+	}
+
+	v9ses->inprogress = 0;
+	v9ses->shutdown = 0;
+	v9ses->session_hung = 0;
+
+	if ((retval = v9fs_mux_init(v9ses, dev_name)) < 0) {
+		dprintk(DEBUG_ERROR, "problem initializing mux\n");
+		goto SessCleanUp;
+	}
+
+	if (v9ses->afid == ~0) {
+		if (v9ses->extended)
+			retval =
+			    v9fs_t_version(v9ses, v9ses->maxdata, "9P2000.u",
+					   &fcall);
+		else
+			retval = v9fs_t_version(v9ses, v9ses->maxdata, "9P2000",
+						&fcall);
+
+		if (retval < 0) {
+			dprintk(DEBUG_ERROR, "v9fs_t_version failed\n");
+			goto FreeFcall;
+		}
+
+		/* Really should check for 9P1 and report error */
+		if (!strcmp(fcall->params.rversion.version, "9P2000.u")) {
+			dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n");
+			v9ses->extended = 1;
+		} else {
+			dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n");
+			v9ses->extended = 0;
+		}
+
+		n = fcall->params.rversion.msize;
+		kfree(fcall);
+
+		if (n < v9ses->maxdata)
+			v9ses->maxdata = n;
+	}
+
+	newfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (newfid < 0) {
+		eprintk(KERN_WARNING, "couldn't allocate FID\n");
+		retval = -ENOMEM;
+		goto SessCleanUp;
+	}
+	/* it is a little bit ugly, but we have to prevent newfid */
+	/* being the same as afid, so if it is, get a new fid     */
+	if (v9ses->afid != ~0 && newfid == v9ses->afid) {
+		newfid = v9fs_get_idpool(&v9ses->fidpool);
+		if (newfid < 0) {
+			eprintk(KERN_WARNING, "couldn't allocate FID\n");
+			retval = -ENOMEM;
+			goto SessCleanUp;
+		}
+	}
+
+	if ((retval =
+	     v9fs_t_attach(v9ses, v9ses->name, v9ses->remotename, newfid,
+			   v9ses->afid, NULL))
+	    < 0) {
+		dprintk(DEBUG_ERROR, "cannot attach\n");
+		goto SessCleanUp;
+	}
+
+	if (v9ses->afid != ~0) {
+		if (v9fs_t_clunk(v9ses, v9ses->afid, NULL))
+			dprintk(DEBUG_ERROR, "clunk failed\n");
+	}
+
+	return newfid;
+
+      FreeFcall:
+	kfree(fcall);
+
+      SessCleanUp:
+	v9fs_session_close(v9ses);
+	return retval;
+}
+
+/**
+ * v9fs_session_close - shutdown a session
+ * @v9ses: session information structure
+ *
+ */
+
+void v9fs_session_close(struct v9fs_session_info *v9ses)
+{
+	if (v9ses->recvproc) {
+		send_sig(SIGKILL, v9ses->recvproc, 1);
+		wait_for_completion(&v9ses->proccmpl);
+	}
+
+	if (v9ses->transport)
+		v9ses->transport->close(v9ses->transport);
+
+	putname(v9ses->name);
+	putname(v9ses->remotename);
+}
+
+extern int v9fs_error_init(void);
+
+/**
+ * v9fs_init - Initialize module
+ *
+ */
+
+static int __init init_v9fs(void)
+{
+	v9fs_error_init();
+
+	printk(KERN_INFO "Installing v9fs 9P2000 file system support\n");
+
+	return register_filesystem(&v9fs_fs_type);
+}
+
+/**
+ * v9fs_init - shutdown module
+ *
+ */
+
+static void __exit exit_v9fs(void)
+{
+	unregister_filesystem(&v9fs_fs_type);
+}
+
+module_init(init_v9fs)
+module_exit(exit_v9fs)
+
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
new file mode 100644
index 00000000000..52203027b15
--- /dev/null
+++ b/fs/9p/v9fs.h
@@ -0,0 +1,105 @@
+/*
+ * V9FS definitions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+/*
+  * Idpool structure provides lock and id management
+  *
+  */
+
+struct v9fs_idpool {
+	struct semaphore lock;
+	struct idr pool;
+};
+
+/*
+  * Session structure provides information for an opened session
+  *
+  */
+
+struct v9fs_session_info {
+	/* options */
+	unsigned int maxdata;
+	unsigned char extended;	/* set to 1 if we are using UNIX extensions */
+	unsigned char nodev;	/* set to 1 if no disable device mapping */
+	unsigned short port;	/* port to connect to */
+	unsigned short debug;	/* debug level */
+	unsigned short proto;	/* protocol to use */
+	unsigned int afid;	/* authentication fid */
+	unsigned int rfdno;	/* read file descriptor number */
+	unsigned int wfdno;	/* write file descriptor number */
+
+
+	char *name;		/* user name to mount as */
+	char *remotename;	/* name of remote hierarchy being mounted */
+	unsigned int uid;	/* default uid/muid for legacy support */
+	unsigned int gid;	/* default gid for legacy support */
+
+	/* book keeping */
+	struct v9fs_idpool fidpool;	/* The FID pool for file descriptors */
+	struct v9fs_idpool tidpool;	/* The TID pool for transactions ids */
+
+	/* transport information */
+	struct v9fs_transport *transport;
+
+	int inprogress;		/* session in progress => true */
+	int shutdown;		/* session shutting down. no more attaches. */
+	unsigned char session_hung;
+
+	/* mux private data */
+	struct v9fs_fcall *curfcall;
+	wait_queue_head_t read_wait;
+	struct completion fcread;
+	struct completion proccmpl;
+	struct task_struct *recvproc;
+
+	spinlock_t muxlock;
+	struct list_head mux_fcalls;
+};
+
+/* possible values of ->proto */
+enum {
+	PROTO_TCP,
+	PROTO_UNIX,
+	PROTO_FD,
+};
+
+int v9fs_session_init(struct v9fs_session_info *, const char *, char *);
+struct v9fs_session_info *v9fs_inode2v9ses(struct inode *);
+void v9fs_session_close(struct v9fs_session_info *v9ses);
+int v9fs_get_idpool(struct v9fs_idpool *p);
+void v9fs_put_idpool(int id, struct v9fs_idpool *p);
+int v9fs_get_option(char *opts, char *name, char *buf, int buflen);
+long long v9fs_get_int_option(char *opts, char *name, long long dflt);
+int v9fs_parse_tcp_devname(const char *devname, char **addr, char **remotename);
+
+#define V9FS_MAGIC 0x01021997
+
+/* other default globals */
+#define V9FS_PORT		564
+#define V9FS_DEFUSER	"nobody"
+#define V9FS_DEFANAME	""
+
+/* inital pool sizes for fids and tags */
+#define V9FS_START_FIDS 8192
+#define V9FS_START_TIDS 256
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
new file mode 100644
index 00000000000..2f2cea7ee3e
--- /dev/null
+++ b/fs/9p/v9fs_vfs.h
@@ -0,0 +1,53 @@
+/*
+ * V9FS VFS extensions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+/* plan9 semantics are that created files are implicitly opened.
+ * But linux semantics are that you call create, then open.
+ * the plan9 approach is superior as it provides an atomic
+ * open.
+ * we track the create fid here. When the file is opened, if fidopen is
+ * non-zero, we use the fid and can skip some steps.
+ * there may be a better way to do this, but I don't know it.
+ * one BAD way is to clunk the fid on create, then open it again:
+ * you lose the atomicity of file open
+ */
+
+/* special case:
+ * unlink calls remove, which is an implicit clunk. So we have to track
+ * that kind of thing so that we don't try to clunk a dead fid.
+ */
+
+extern struct file_system_type v9fs_fs_type;
+extern struct file_operations v9fs_file_operations;
+extern struct file_operations v9fs_dir_operations;
+extern struct dentry_operations v9fs_dentry_operations;
+
+struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+ino_t v9fs_qid2ino(struct v9fs_qid *qid);
+void v9fs_mistat2inode(struct v9fs_stat *, struct inode *,
+		       struct super_block *);
+int v9fs_dir_release(struct inode *inode, struct file *filp);
+int v9fs_file_open(struct inode *inode, struct file *file);
+void v9fs_inode2mistat(struct inode *inode, struct v9fs_stat *mistat);
+void v9fs_dentry_release(struct dentry *);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
new file mode 100644
index 00000000000..ce0778acc90
--- /dev/null
+++ b/fs/9p/vfs_super.c
@@ -0,0 +1,271 @@
+/*
+ *  linux/fs/9p/vfs_super.c
+ *
+ * This file contians superblock ops for 9P2000. It is intended that
+ * you mount this file system on directories.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+
+static void v9fs_clear_inode(struct inode *);
+static struct super_operations v9fs_super_ops;
+
+/**
+ * v9fs_clear_inode - release an inode
+ * @inode: inode to release
+ *
+ */
+
+static void v9fs_clear_inode(struct inode *inode)
+{
+	filemap_fdatawrite(inode->i_mapping);
+}
+
+/**
+ * v9fs_set_super - set the superblock
+ * @s: super block
+ * @data: file system specific data
+ *
+ */
+
+static int v9fs_set_super(struct super_block *s, void *data)
+{
+	s->s_fs_info = data;
+	return set_anon_super(s, data);
+}
+
+/**
+ * v9fs_fill_super - populate superblock with info
+ * @sb: superblock
+ * @v9ses: session information
+ *
+ */
+
+static void
+v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
+		int flags)
+{
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
+	sb->s_blocksize = 1 << sb->s_blocksize_bits;
+	sb->s_magic = V9FS_MAGIC;
+	sb->s_op = &v9fs_super_ops;
+
+	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
+	    MS_NODIRATIME | MS_NOATIME;
+}
+
+/**
+ * v9fs_get_sb - mount a superblock
+ * @fs_type: file system type
+ * @flags: mount flags
+ * @dev_name: device name that was mounted
+ * @data: mount options
+ *
+ */
+
+static struct super_block *v9fs_get_sb(struct file_system_type
+				       *fs_type, int flags,
+				       const char *dev_name, void *data)
+{
+	struct super_block *sb = NULL;
+	struct v9fs_fcall *fcall = NULL;
+	struct inode *inode = NULL;
+	struct dentry *root = NULL;
+	struct v9fs_session_info *v9ses = NULL;
+	struct v9fs_fid *root_fid = NULL;
+	int mode = S_IRWXUGO | S_ISVTX;
+	uid_t uid = current->fsuid;
+	gid_t gid = current->fsgid;
+	int stat_result = 0;
+	int newfid = 0;
+	int retval = 0;
+
+	dprintk(DEBUG_VFS, " \n");
+
+	v9ses = kcalloc(1, sizeof(struct v9fs_session_info), GFP_KERNEL);
+	if (!v9ses)
+		return ERR_PTR(-ENOMEM);
+
+	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
+		dprintk(DEBUG_ERROR, "problem initiating session\n");
+		retval = newfid;
+		goto free_session;
+	}
+
+	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
+
+	v9fs_fill_super(sb, v9ses, flags);
+
+	inode = v9fs_get_inode(sb, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		retval = PTR_ERR(inode);
+		goto put_back_sb;
+	}
+
+	inode->i_uid = uid;
+	inode->i_gid = gid;
+
+	root = d_alloc_root(inode);
+
+	if (!root) {
+		retval = -ENOMEM;
+		goto release_inode;
+	}
+
+	sb->s_root = root;
+
+	/* Setup the Root Inode */
+	root_fid = v9fs_fid_create(root);
+	if (root_fid == NULL) {
+		retval = -ENOMEM;
+		goto release_dentry;
+	}
+
+	root_fid->fidopen = 0;
+	root_fid->v9ses = v9ses;
+
+	stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
+	if (stat_result < 0) {
+		dprintk(DEBUG_ERROR, "stat error\n");
+		v9fs_t_clunk(v9ses, newfid, NULL);
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+	} else {
+		root_fid->fid = newfid;
+		root_fid->qid = fcall->params.rstat.stat->qid;
+		root->d_inode->i_ino =
+		    v9fs_qid2ino(&fcall->params.rstat.stat->qid);
+		v9fs_mistat2inode(fcall->params.rstat.stat, root->d_inode, sb);
+	}
+
+	kfree(fcall);
+
+	if (stat_result < 0) {
+		retval = stat_result;
+		goto release_dentry;
+	}
+
+	return sb;
+
+      release_dentry:
+	dput(sb->s_root);
+
+      release_inode:
+	iput(inode);
+
+      put_back_sb:
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+	v9fs_session_close(v9ses);
+
+      free_session:
+	kfree(v9ses);
+
+	return ERR_PTR(retval);
+}
+
+/**
+ * v9fs_kill_super - Kill Superblock
+ * @s: superblock
+ *
+ */
+
+static void v9fs_kill_super(struct super_block *s)
+{
+	struct v9fs_session_info *v9ses = s->s_fs_info;
+
+	dprintk(DEBUG_VFS, " %p\n", s);
+
+	v9fs_dentry_release(s->s_root);	/* clunk root */
+
+	kill_anon_super(s);
+
+	v9fs_session_close(v9ses);
+	kfree(v9ses);
+	dprintk(DEBUG_VFS, "exiting kill_super\n");
+}
+
+/**
+ * v9fs_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ *
+ */
+
+static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
+
+	if (v9ses->debug != 0)
+		seq_printf(m, ",debug=%u", v9ses->debug);
+	if (v9ses->port != V9FS_PORT)
+		seq_printf(m, ",port=%u", v9ses->port);
+	if (v9ses->maxdata != 9000)
+		seq_printf(m, ",msize=%u", v9ses->maxdata);
+	if (v9ses->afid != ~0)
+		seq_printf(m, ",afid=%u", v9ses->afid);
+	if (v9ses->proto == PROTO_UNIX)
+		seq_puts(m, ",proto=unix");
+	if (v9ses->extended == 0)
+		seq_puts(m, ",noextend");
+	if (v9ses->nodev == 1)
+		seq_puts(m, ",nodevmap");
+	seq_printf(m, ",name=%s", v9ses->name);
+	seq_printf(m, ",aname=%s", v9ses->remotename);
+	seq_printf(m, ",uid=%u", v9ses->uid);
+	seq_printf(m, ",gid=%u", v9ses->gid);
+	return 0;
+}
+
+static struct super_operations v9fs_super_ops = {
+	.statfs = simple_statfs,
+	.clear_inode = v9fs_clear_inode,
+	.show_options = v9fs_show_options,
+};
+
+struct file_system_type v9fs_fs_type = {
+	.name = "9P",
+	.get_sb = v9fs_get_sb,
+	.kill_sb = v9fs_kill_super,
+	.owner = THIS_MODULE,
+};
-- 
cgit v1.2.3-70-g09d2


From b8cf945b3166c4394386f162a527c9950f396ce2 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:21 -0700
Subject: [PATCH] v9fs: 9P protocol implementation

This part of the patch contains the 9P protocol functions.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/9p.c   | 359 +++++++++++++++++++++++++++++++
 fs/9p/9p.h   | 341 +++++++++++++++++++++++++++++
 fs/9p/conv.c | 693 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/9p/conv.h |  36 ++++
 4 files changed, 1429 insertions(+)
 create mode 100644 fs/9p/9p.c
 create mode 100644 fs/9p/9p.h
 create mode 100644 fs/9p/conv.c
 create mode 100644 fs/9p/conv.h

(limited to 'fs')

diff --git a/fs/9p/9p.c b/fs/9p/9p.c
new file mode 100644
index 00000000000..e847f504a47
--- /dev/null
+++ b/fs/9p/9p.c
@@ -0,0 +1,359 @@
+/*
+ *  linux/fs/9p/9p.c
+ *
+ *  This file contains functions 9P2000 functions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "mux.h"
+
+/**
+ * v9fs_t_version - negotiate protocol parameters with sever
+ * @v9ses: 9P2000 session information
+ * @msize: requested max size packet
+ * @version: requested version.extension string
+ * @fcall: pointer to response fcall pointer
+ *
+ */
+
+int
+v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
+	       char *version, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version);
+	msg.id = TVERSION;
+	msg.params.tversion.msize = msize;
+	msg.params.tversion.version = version;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_attach - mount the server
+ * @v9ses: 9P2000 session information
+ * @uname: user name doing the attach
+ * @aname: remote name being attached to
+ * @fid: mount fid to attatch to root node
+ * @afid: authentication fid (in this case result key)
+ * @fcall: pointer to response fcall pointer
+ *
+ */
+
+int
+v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
+	      u32 fid, u32 afid, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname,
+		aname, fid, afid);
+	msg.id = TATTACH;
+	msg.params.tattach.fid = fid;
+	msg.params.tattach.afid = afid;
+	msg.params.tattach.uname = uname;
+	msg.params.tattach.aname = aname;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_clunk - release a fid (finish a transaction)
+ * @v9ses: 9P2000 session information
+ * @fid: fid to release
+ * @fcall: pointer to response fcall pointer
+ *
+ */
+
+int
+v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
+	     struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d\n", fid);
+	msg.id = TCLUNK;
+	msg.params.tclunk.fid = fid;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_v9fs_t_flush - flush a pending transaction
+ * @v9ses: 9P2000 session information
+ * @tag: tid to release
+ *
+ */
+
+int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "oldtag %d\n", tag);
+	msg.id = TFLUSH;
+	msg.params.tflush.oldtag = tag;
+	return v9fs_mux_rpc(v9ses, &msg, NULL);
+}
+
+/**
+ * v9fs_t_stat - read a file's meta-data
+ * @v9ses: 9P2000 session information
+ * @fid: fid pointing to file or directory to get info about
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d\n", fid);
+	if (fcall)
+		*fcall = NULL;
+
+	msg.id = TSTAT;
+	msg.params.tstat.fid = fid;
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_wstat - write a file's meta-data
+ * @v9ses: 9P2000 session information
+ * @fid: fid pointing to file or directory to write info about
+ * @stat: metadata
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
+	     struct v9fs_stat *stat, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d length %d\n", fid, (int)stat->length);
+	msg.id = TWSTAT;
+	msg.params.twstat.fid = fid;
+	msg.params.twstat.stat = stat;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_walk - walk a fid to a new file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid to walk
+ * @newfid: new fid (for clone operations)
+ * @name: path to walk fid to
+ * @fcall: pointer to response fcall
+ *
+ */
+
+/* TODO: support multiple walk */
+
+int
+v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
+	    char *name, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name);
+	msg.id = TWALK;
+	msg.params.twalk.fid = fid;
+	msg.params.twalk.newfid = newfid;
+
+	if (name) {
+		msg.params.twalk.nwname = 1;
+		msg.params.twalk.wnames = &name;
+	} else {
+		msg.params.twalk.nwname = 0;
+	}
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_open - open a file
+ *
+ * @v9ses - 9P2000 session information
+ * @fid - fid to open
+ * @mode - mode to open file (R, RW, etc)
+ * @fcall - pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
+	    struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+	long errorno = -1;
+
+	dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode);
+	msg.id = TOPEN;
+	msg.params.topen.fid = fid;
+	msg.params.topen.mode = mode;
+
+	errorno = v9fs_mux_rpc(v9ses, &msg, fcall);
+
+	return errorno;
+}
+
+/**
+ * v9fs_t_remove - remove a file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid to remove
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
+	      struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d\n", fid);
+	msg.id = TREMOVE;
+	msg.params.tremove.fid = fid;
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_create - create a file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid to create
+ * @name: name of the file or directory to create
+ * @perm: permissions to create with
+ * @mode: mode to open file (R, RW, etc)
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
+	      u32 perm, u8 mode, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n",
+		fid, name, perm, mode);
+
+	msg.id = TCREATE;
+	msg.params.tcreate.fid = fid;
+	msg.params.tcreate.name = name;
+	msg.params.tcreate.perm = perm;
+	msg.params.tcreate.mode = mode;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_read - read data
+ * @v9ses: 9P2000 session information
+ * @fid: fid to read from
+ * @offset: offset to start read at
+ * @count: how many bytes to read
+ * @fcall: pointer to response fcall (with data)
+ *
+ */
+
+int
+v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
+	    u32 count, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+	struct v9fs_fcall *rc = NULL;
+	long errorno = -1;
+
+	dprintk(DEBUG_9P, "fid %d offset 0x%lx count 0x%x\n", fid,
+		(long unsigned int)offset, count);
+	msg.id = TREAD;
+	msg.params.tread.fid = fid;
+	msg.params.tread.offset = offset;
+	msg.params.tread.count = count;
+	errorno = v9fs_mux_rpc(v9ses, &msg, &rc);
+
+	if (!errorno) {
+		errorno = rc->params.rread.count;
+		dump_data(rc->params.rread.data, rc->params.rread.count);
+	}
+
+	if (fcall)
+		*fcall = rc;
+	else
+		kfree(rc);
+
+	return errorno;
+}
+
+/**
+ * v9fs_t_write - write data
+ * @v9ses: 9P2000 session information
+ * @fid: fid to write to
+ * @offset: offset to start write at
+ * @count: how many bytes to write
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid,
+	     u64 offset, u32 count, void *data, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+	struct v9fs_fcall *rc = NULL;
+	long errorno = -1;
+
+	dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid,
+		(unsigned long long)offset, count);
+	dump_data(data, count);
+
+	msg.id = TWRITE;
+	msg.params.twrite.fid = fid;
+	msg.params.twrite.offset = offset;
+	msg.params.twrite.count = count;
+	msg.params.twrite.data = data;
+
+	errorno = v9fs_mux_rpc(v9ses, &msg, &rc);
+
+	if (!errorno)
+		errorno = rc->params.rwrite.count;
+
+	if (fcall)
+		*fcall = rc;
+	else
+		kfree(rc);
+
+	return errorno;
+}
diff --git a/fs/9p/9p.h b/fs/9p/9p.h
new file mode 100644
index 00000000000..f55424216be
--- /dev/null
+++ b/fs/9p/9p.h
@@ -0,0 +1,341 @@
+/*
+ * linux/fs/9p/9p.h
+ *
+ * 9P protocol definitions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+/* Message Types */
+enum {
+	TVERSION = 100,
+	RVERSION,
+	TAUTH = 102,
+	RAUTH,
+	TATTACH = 104,
+	RATTACH,
+	TERROR = 106,
+	RERROR,
+	TFLUSH = 108,
+	RFLUSH,
+	TWALK = 110,
+	RWALK,
+	TOPEN = 112,
+	ROPEN,
+	TCREATE = 114,
+	RCREATE,
+	TREAD = 116,
+	RREAD,
+	TWRITE = 118,
+	RWRITE,
+	TCLUNK = 120,
+	RCLUNK,
+	TREMOVE = 122,
+	RREMOVE,
+	TSTAT = 124,
+	RSTAT,
+	TWSTAT = 126,
+	RWSTAT,
+};
+
+/* modes */
+enum {
+	V9FS_OREAD = 0x00,
+	V9FS_OWRITE = 0x01,
+	V9FS_ORDWR = 0x02,
+	V9FS_OEXEC = 0x03,
+	V9FS_OEXCL = 0x04,
+	V9FS_OTRUNC = 0x10,
+	V9FS_OREXEC = 0x20,
+	V9FS_ORCLOSE = 0x40,
+	V9FS_OAPPEND = 0x80,
+};
+
+/* permissions */
+enum {
+	V9FS_DMDIR = 0x80000000,
+	V9FS_DMAPPEND = 0x40000000,
+	V9FS_DMEXCL = 0x20000000,
+	V9FS_DMMOUNT = 0x10000000,
+	V9FS_DMAUTH = 0x08000000,
+	V9FS_DMTMP = 0x04000000,
+	V9FS_DMSYMLINK = 0x02000000,
+	V9FS_DMLINK = 0x01000000,
+	/* 9P2000.u extensions */
+	V9FS_DMDEVICE = 0x00800000,
+	V9FS_DMNAMEDPIPE = 0x00200000,
+	V9FS_DMSOCKET = 0x00100000,
+	V9FS_DMSETUID = 0x00080000,
+	V9FS_DMSETGID = 0x00040000,
+};
+
+/* qid.types */
+enum {
+	V9FS_QTDIR = 0x80,
+	V9FS_QTAPPEND = 0x40,
+	V9FS_QTEXCL = 0x20,
+	V9FS_QTMOUNT = 0x10,
+	V9FS_QTAUTH = 0x08,
+	V9FS_QTTMP = 0x04,
+	V9FS_QTSYMLINK = 0x02,
+	V9FS_QTLINK = 0x01,
+	V9FS_QTFILE = 0x00,
+};
+
+/* ample room for Twrite/Rread header (iounit) */
+#define V9FS_IOHDRSZ	24
+
+/* qids are the unique ID for a file (like an inode */
+struct v9fs_qid {
+	u8 type;
+	u32 version;
+	u64 path;
+};
+
+/* Plan 9 file metadata (stat) structure */
+struct v9fs_stat {
+	u16 size;
+	u16 type;
+	u32 dev;
+	struct v9fs_qid qid;
+	u32 mode;
+	u32 atime;
+	u32 mtime;
+	u64 length;
+	char *name;
+	char *uid;
+	char *gid;
+	char *muid;
+	char *extension;	/* 9p2000.u extensions */
+	u32 n_uid;		/* 9p2000.u extensions */
+	u32 n_gid;		/* 9p2000.u extensions */
+	u32 n_muid;		/* 9p2000.u extensions */
+	char data[0];
+};
+
+/* Structures for Protocol Operations */
+
+struct Tversion {
+	u32 msize;
+	char *version;
+};
+
+struct Rversion {
+	u32 msize;
+	char *version;
+};
+
+struct Tauth {
+	u32 afid;
+	char *uname;
+	char *aname;
+};
+
+struct Rauth {
+	struct v9fs_qid qid;
+};
+
+struct Rerror {
+	char *error;
+	u32 errno;		/* 9p2000.u extension */
+};
+
+struct Tflush {
+	u32 oldtag;
+};
+
+struct Rflush {
+};
+
+struct Tattach {
+	u32 fid;
+	u32 afid;
+	char *uname;
+	char *aname;
+};
+
+struct Rattach {
+	struct v9fs_qid qid;
+};
+
+struct Twalk {
+	u32 fid;
+	u32 newfid;
+	u32 nwname;
+	char **wnames;
+};
+
+struct Rwalk {
+	u32 nwqid;
+	struct v9fs_qid *wqids;
+};
+
+struct Topen {
+	u32 fid;
+	u8 mode;
+};
+
+struct Ropen {
+	struct v9fs_qid qid;
+	u32 iounit;
+};
+
+struct Tcreate {
+	u32 fid;
+	char *name;
+	u32 perm;
+	u8 mode;
+};
+
+struct Rcreate {
+	struct v9fs_qid qid;
+	u32 iounit;
+};
+
+struct Tread {
+	u32 fid;
+	u64 offset;
+	u32 count;
+};
+
+struct Rread {
+	u32 count;
+	u8 *data;
+};
+
+struct Twrite {
+	u32 fid;
+	u64 offset;
+	u32 count;
+	u8 *data;
+};
+
+struct Rwrite {
+	u32 count;
+};
+
+struct Tclunk {
+	u32 fid;
+};
+
+struct Rclunk {
+};
+
+struct Tremove {
+	u32 fid;
+};
+
+struct Rremove {
+};
+
+struct Tstat {
+	u32 fid;
+};
+
+struct Rstat {
+	struct v9fs_stat *stat;
+};
+
+struct Twstat {
+	u32 fid;
+	struct v9fs_stat *stat;
+};
+
+struct Rwstat {
+};
+
+/*
+  * fcall is the primary packet structure
+  *
+  */
+
+struct v9fs_fcall {
+	u32 size;
+	u8 id;
+	u16 tag;
+
+	union {
+		struct Tversion tversion;
+		struct Rversion rversion;
+		struct Tauth tauth;
+		struct Rauth rauth;
+		struct Rerror rerror;
+		struct Tflush tflush;
+		struct Rflush rflush;
+		struct Tattach tattach;
+		struct Rattach rattach;
+		struct Twalk twalk;
+		struct Rwalk rwalk;
+		struct Topen topen;
+		struct Ropen ropen;
+		struct Tcreate tcreate;
+		struct Rcreate rcreate;
+		struct Tread tread;
+		struct Rread rread;
+		struct Twrite twrite;
+		struct Rwrite rwrite;
+		struct Tclunk tclunk;
+		struct Rclunk rclunk;
+		struct Tremove tremove;
+		struct Rremove rremove;
+		struct Tstat tstat;
+		struct Rstat rstat;
+		struct Twstat twstat;
+		struct Rwstat rwstat;
+	} params;
+};
+
+#define FCALL_ERROR(fcall) (fcall ? fcall->params.rerror.error : "")
+
+int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
+		   char *version, struct v9fs_fcall **rcall);
+
+int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
+		  u32 fid, u32 afid, struct v9fs_fcall **rcall);
+
+int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
+		 struct v9fs_fcall **rcall);
+
+int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag);
+
+int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid,
+		struct v9fs_fcall **rcall);
+
+int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
+		 struct v9fs_stat *stat, struct v9fs_fcall **rcall);
+
+int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
+		char *name, struct v9fs_fcall **rcall);
+
+int v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
+		struct v9fs_fcall **rcall);
+
+int v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
+		  struct v9fs_fcall **rcall);
+
+int v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
+		  u32 perm, u8 mode, struct v9fs_fcall **rcall);
+
+int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid,
+		u64 offset, u32 count, struct v9fs_fcall **rcall);
+
+int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
+		 u32 count, void *data, struct v9fs_fcall **rcall);
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
new file mode 100644
index 00000000000..1554731bd65
--- /dev/null
+++ b/fs/9p/conv.c
@@ -0,0 +1,693 @@
+/*
+ * linux/fs/9p/conv.c
+ *
+ * 9P protocol conversion functions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "conv.h"
+
+/*
+ * Buffer to help with string parsing
+ */
+struct cbuf {
+	unsigned char *sp;
+	unsigned char *p;
+	unsigned char *ep;
+};
+
+static inline void buf_init(struct cbuf *buf, void *data, int datalen)
+{
+	buf->sp = buf->p = data;
+	buf->ep = data + datalen;
+}
+
+static inline int buf_check_overflow(struct cbuf *buf)
+{
+	return buf->p > buf->ep;
+}
+
+static inline void buf_check_size(struct cbuf *buf, int len)
+{
+	if (buf->p+len > buf->ep) {
+		if (buf->p < buf->ep) {
+			eprintk(KERN_ERR, "buffer overflow\n");
+			buf->p = buf->ep + 1;
+		}
+	}
+}
+
+static inline void *buf_alloc(struct cbuf *buf, int len)
+{
+	void *ret = NULL;
+
+	buf_check_size(buf, len);
+	ret = buf->p;
+	buf->p += len;
+
+	return ret;
+}
+
+static inline void buf_put_int8(struct cbuf *buf, u8 val)
+{
+	buf_check_size(buf, 1);
+
+	buf->p[0] = val;
+	buf->p++;
+}
+
+static inline void buf_put_int16(struct cbuf *buf, u16 val)
+{
+	buf_check_size(buf, 2);
+
+	*(__le16 *) buf->p = cpu_to_le16(val);
+	buf->p += 2;
+}
+
+static inline void buf_put_int32(struct cbuf *buf, u32 val)
+{
+	buf_check_size(buf, 4);
+
+	*(__le32 *)buf->p = cpu_to_le32(val);
+	buf->p += 4;
+}
+
+static inline void buf_put_int64(struct cbuf *buf, u64 val)
+{
+	buf_check_size(buf, 8);
+
+	*(__le64 *)buf->p = cpu_to_le64(val);
+	buf->p += 8;
+}
+
+static inline void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
+{
+	buf_check_size(buf, slen + 2);
+
+	buf_put_int16(buf, slen);
+	memcpy(buf->p, s, slen);
+	buf->p += slen;
+}
+
+static inline void buf_put_string(struct cbuf *buf, const char *s)
+{
+	buf_put_stringn(buf, s, strlen(s));
+}
+
+static inline void buf_put_data(struct cbuf *buf, void *data, u32 datalen)
+{
+	buf_check_size(buf, datalen);
+
+	memcpy(buf->p, data, datalen);
+	buf->p += datalen;
+}
+
+static inline u8 buf_get_int8(struct cbuf *buf)
+{
+	u8 ret = 0;
+
+	buf_check_size(buf, 1);
+	ret = buf->p[0];
+
+	buf->p++;
+
+	return ret;
+}
+
+static inline u16 buf_get_int16(struct cbuf *buf)
+{
+	u16 ret = 0;
+
+	buf_check_size(buf, 2);
+	ret = le16_to_cpu(*(__le16 *)buf->p);
+
+	buf->p += 2;
+
+	return ret;
+}
+
+static inline u32 buf_get_int32(struct cbuf *buf)
+{
+	u32 ret = 0;
+
+	buf_check_size(buf, 4);
+	ret = le32_to_cpu(*(__le32 *)buf->p);
+
+	buf->p += 4;
+
+	return ret;
+}
+
+static inline u64 buf_get_int64(struct cbuf *buf)
+{
+	u64 ret = 0;
+
+	buf_check_size(buf, 8);
+	ret = le64_to_cpu(*(__le64 *)buf->p);
+
+	buf->p += 8;
+
+	return ret;
+}
+
+static inline int
+buf_get_string(struct cbuf *buf, char *data, unsigned int datalen)
+{
+
+	u16 len = buf_get_int16(buf);
+	buf_check_size(buf, len);
+	if (len + 1 > datalen)
+		return 0;
+
+	memcpy(data, buf->p, len);
+	data[len] = 0;
+	buf->p += len;
+
+	return len + 1;
+}
+
+static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf)
+{
+	char *ret = NULL;
+	int n = buf_get_string(buf, sbuf->p, sbuf->ep - sbuf->p);
+
+	if (n > 0) {
+		ret = sbuf->p;
+		sbuf->p += n;
+	}
+
+	return ret;
+}
+
+static inline int buf_get_data(struct cbuf *buf, void *data, int datalen)
+{
+	buf_check_size(buf, datalen);
+
+	memcpy(data, buf->p, datalen);
+	buf->p += datalen;
+
+	return datalen;
+}
+
+static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf,
+				  int datalen)
+{
+	char *ret = NULL;
+	int n = 0;
+
+	buf_check_size(dbuf, datalen);
+
+	n = buf_get_data(buf, dbuf->p, datalen);
+
+	if (n > 0) {
+		ret = dbuf->p;
+		dbuf->p += n;
+	}
+
+	return ret;
+}
+
+/**
+ * v9fs_size_stat - calculate the size of a variable length stat struct
+ * @v9ses: session information
+ * @stat: metadata (stat) structure
+ *
+ */
+
+static int v9fs_size_stat(struct v9fs_session_info *v9ses,
+			  struct v9fs_stat *stat)
+{
+	int size = 0;
+
+	if (stat == NULL) {
+		eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n");
+		return 0;
+	}
+
+	size =			/* 2 + *//* size[2] */
+	    2 +			/* type[2] */
+	    4 +			/* dev[4] */
+	    1 +			/* qid.type[1] */
+	    4 +			/* qid.vers[4] */
+	    8 +			/* qid.path[8] */
+	    4 +			/* mode[4] */
+	    4 +			/* atime[4] */
+	    4 +			/* mtime[4] */
+	    8 +			/* length[8] */
+	    8;			/* minimum sum of string lengths */
+
+	if (stat->name)
+		size += strlen(stat->name);
+	if (stat->uid)
+		size += strlen(stat->uid);
+	if (stat->gid)
+		size += strlen(stat->gid);
+	if (stat->muid)
+		size += strlen(stat->muid);
+
+	if (v9ses->extended) {
+		size += 4 +	/* n_uid[4] */
+		    4 +		/* n_gid[4] */
+		    4 +		/* n_muid[4] */
+		    2;		/* string length of extension[4] */
+		if (stat->extension)
+			size += strlen(stat->extension);
+	}
+
+	return size;
+}
+
+/**
+ * serialize_stat - safely format a stat structure for transmission
+ * @v9ses: session info
+ * @stat: metadata (stat) structure
+ * @bufp: buffer to serialize structure into
+ *
+ */
+
+static int
+serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat,
+	       struct cbuf *bufp)
+{
+	buf_put_int16(bufp, stat->size);
+	buf_put_int16(bufp, stat->type);
+	buf_put_int32(bufp, stat->dev);
+	buf_put_int8(bufp, stat->qid.type);
+	buf_put_int32(bufp, stat->qid.version);
+	buf_put_int64(bufp, stat->qid.path);
+	buf_put_int32(bufp, stat->mode);
+	buf_put_int32(bufp, stat->atime);
+	buf_put_int32(bufp, stat->mtime);
+	buf_put_int64(bufp, stat->length);
+
+	buf_put_string(bufp, stat->name);
+	buf_put_string(bufp, stat->uid);
+	buf_put_string(bufp, stat->gid);
+	buf_put_string(bufp, stat->muid);
+
+	if (v9ses->extended) {
+		buf_put_string(bufp, stat->extension);
+		buf_put_int32(bufp, stat->n_uid);
+		buf_put_int32(bufp, stat->n_gid);
+		buf_put_int32(bufp, stat->n_muid);
+	}
+
+	if (buf_check_overflow(bufp))
+		return 0;
+
+	return stat->size;
+}
+
+/**
+ * deserialize_stat - safely decode a recieved metadata (stat) structure
+ * @v9ses: session info
+ * @bufp: buffer to deserialize
+ * @stat: metadata (stat) structure
+ * @dbufp: buffer to deserialize variable strings into
+ *
+ */
+
+static inline int
+deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp,
+		 struct v9fs_stat *stat, struct cbuf *dbufp)
+{
+
+	stat->size = buf_get_int16(bufp);
+	stat->type = buf_get_int16(bufp);
+	stat->dev = buf_get_int32(bufp);
+	stat->qid.type = buf_get_int8(bufp);
+	stat->qid.version = buf_get_int32(bufp);
+	stat->qid.path = buf_get_int64(bufp);
+	stat->mode = buf_get_int32(bufp);
+	stat->atime = buf_get_int32(bufp);
+	stat->mtime = buf_get_int32(bufp);
+	stat->length = buf_get_int64(bufp);
+	stat->name = buf_get_stringb(bufp, dbufp);
+	stat->uid = buf_get_stringb(bufp, dbufp);
+	stat->gid = buf_get_stringb(bufp, dbufp);
+	stat->muid = buf_get_stringb(bufp, dbufp);
+
+	if (v9ses->extended) {
+		stat->extension = buf_get_stringb(bufp, dbufp);
+		stat->n_uid = buf_get_int32(bufp);
+		stat->n_gid = buf_get_int32(bufp);
+		stat->n_muid = buf_get_int32(bufp);
+	}
+
+	if (buf_check_overflow(bufp) || buf_check_overflow(dbufp))
+		return 0;
+
+	return stat->size + 2;
+}
+
+/**
+ * deserialize_statb - wrapper for decoding a received metadata structure
+ * @v9ses: session info
+ * @bufp: buffer to deserialize
+ * @dbufp: buffer to deserialize variable strings into
+ *
+ */
+
+static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info
+						  *v9ses, struct cbuf *bufp,
+						  struct cbuf *dbufp)
+{
+	struct v9fs_stat *ret = buf_alloc(dbufp, sizeof(struct v9fs_stat));
+
+	if (ret) {
+		int n = deserialize_stat(v9ses, bufp, ret, dbufp);
+		if (n <= 0)
+			return NULL;
+	}
+
+	return ret;
+}
+
+/**
+ * v9fs_deserialize_stat - decode a received metadata structure
+ * @v9ses: session info
+ * @buf: buffer to deserialize
+ * @buflen: length of received buffer
+ * @stat: metadata structure to decode into
+ * @statlen: length of destination metadata structure
+ *
+ */
+
+int
+v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf,
+		      u32 buflen, struct v9fs_stat *stat, u32 statlen)
+{
+	struct cbuf buffer;
+	struct cbuf *bufp = &buffer;
+	struct cbuf dbuffer;
+	struct cbuf *dbufp = &dbuffer;
+
+	buf_init(bufp, buf, buflen);
+	buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat),
+		 statlen - sizeof(struct v9fs_stat));
+
+	return deserialize_stat(v9ses, bufp, stat, dbufp);
+}
+
+static inline int
+v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall)
+{
+	int size = 4 + 1 + 2;	/* size[4] msg[1] tag[2] */
+	int i = 0;
+
+	switch (fcall->id) {
+	default:
+		eprintk(KERN_ERR, "bad msg type %d\n", fcall->id);
+		return 0;
+	case TVERSION:		/* msize[4] version[s] */
+		size += 4 + 2 + strlen(fcall->params.tversion.version);
+		break;
+	case TAUTH:		/* afid[4] uname[s] aname[s] */
+		size += 4 + 2 + strlen(fcall->params.tauth.uname) +
+		    2 + strlen(fcall->params.tauth.aname);
+		break;
+	case TFLUSH:		/* oldtag[2] */
+		size += 2;
+		break;
+	case TATTACH:		/* fid[4] afid[4] uname[s] aname[s] */
+		size += 4 + 4 + 2 + strlen(fcall->params.tattach.uname) +
+		    2 + strlen(fcall->params.tattach.aname);
+		break;
+	case TWALK:		/* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
+		size += 4 + 4 + 2;
+		/* now compute total for the array of names */
+		for (i = 0; i < fcall->params.twalk.nwname; i++)
+			size += 2 + strlen(fcall->params.twalk.wnames[i]);
+		break;
+	case TOPEN:		/* fid[4] mode[1] */
+		size += 4 + 1;
+		break;
+	case TCREATE:		/* fid[4] name[s] perm[4] mode[1] */
+		size += 4 + 2 + strlen(fcall->params.tcreate.name) + 4 + 1;
+		break;
+	case TREAD:		/* fid[4] offset[8] count[4] */
+		size += 4 + 8 + 4;
+		break;
+	case TWRITE:		/* fid[4] offset[8] count[4] data[count] */
+		size += 4 + 8 + 4 + fcall->params.twrite.count;
+		break;
+	case TCLUNK:		/* fid[4] */
+		size += 4;
+		break;
+	case TREMOVE:		/* fid[4] */
+		size += 4;
+		break;
+	case TSTAT:		/* fid[4] */
+		size += 4;
+		break;
+	case TWSTAT:		/* fid[4] stat[n] */
+		fcall->params.twstat.stat->size =
+		    v9fs_size_stat(v9ses, fcall->params.twstat.stat);
+		size += 4 + 2 + 2 + fcall->params.twstat.stat->size;
+	}
+	return size;
+}
+
+/*
+ * v9fs_serialize_fcall - marshall fcall struct into a packet
+ * @v9ses: session information
+ * @fcall: structure to convert
+ * @data: buffer to serialize fcall into
+ * @datalen: length of buffer to serialize fcall into
+ *
+ */
+
+int
+v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall,
+		     void *data, u32 datalen)
+{
+	int i = 0;
+	struct v9fs_stat *stat = NULL;
+	struct cbuf buffer;
+	struct cbuf *bufp = &buffer;
+
+	buf_init(bufp, data, datalen);
+
+	if (!fcall) {
+		eprintk(KERN_ERR, "no fcall\n");
+		return -EINVAL;
+	}
+
+	fcall->size = v9fs_size_fcall(v9ses, fcall);
+
+	buf_put_int32(bufp, fcall->size);
+	buf_put_int8(bufp, fcall->id);
+	buf_put_int16(bufp, fcall->tag);
+
+	dprintk(DEBUG_CONV, "size %d id %d tag %d\n", fcall->size, fcall->id,
+		fcall->tag);
+
+	/* now encode it */
+	switch (fcall->id) {
+	default:
+		eprintk(KERN_ERR, "bad msg type: %d\n", fcall->id);
+		return -EPROTO;
+	case TVERSION:
+		buf_put_int32(bufp, fcall->params.tversion.msize);
+		buf_put_string(bufp, fcall->params.tversion.version);
+		break;
+	case TAUTH:
+		buf_put_int32(bufp, fcall->params.tauth.afid);
+		buf_put_string(bufp, fcall->params.tauth.uname);
+		buf_put_string(bufp, fcall->params.tauth.aname);
+		break;
+	case TFLUSH:
+		buf_put_int16(bufp, fcall->params.tflush.oldtag);
+		break;
+	case TATTACH:
+		buf_put_int32(bufp, fcall->params.tattach.fid);
+		buf_put_int32(bufp, fcall->params.tattach.afid);
+		buf_put_string(bufp, fcall->params.tattach.uname);
+		buf_put_string(bufp, fcall->params.tattach.aname);
+		break;
+	case TWALK:
+		buf_put_int32(bufp, fcall->params.twalk.fid);
+		buf_put_int32(bufp, fcall->params.twalk.newfid);
+		buf_put_int16(bufp, fcall->params.twalk.nwname);
+		for (i = 0; i < fcall->params.twalk.nwname; i++)
+			buf_put_string(bufp, fcall->params.twalk.wnames[i]);
+		break;
+	case TOPEN:
+		buf_put_int32(bufp, fcall->params.topen.fid);
+		buf_put_int8(bufp, fcall->params.topen.mode);
+		break;
+	case TCREATE:
+		buf_put_int32(bufp, fcall->params.tcreate.fid);
+		buf_put_string(bufp, fcall->params.tcreate.name);
+		buf_put_int32(bufp, fcall->params.tcreate.perm);
+		buf_put_int8(bufp, fcall->params.tcreate.mode);
+		break;
+	case TREAD:
+		buf_put_int32(bufp, fcall->params.tread.fid);
+		buf_put_int64(bufp, fcall->params.tread.offset);
+		buf_put_int32(bufp, fcall->params.tread.count);
+		break;
+	case TWRITE:
+		buf_put_int32(bufp, fcall->params.twrite.fid);
+		buf_put_int64(bufp, fcall->params.twrite.offset);
+		buf_put_int32(bufp, fcall->params.twrite.count);
+		buf_put_data(bufp, fcall->params.twrite.data,
+			     fcall->params.twrite.count);
+		break;
+	case TCLUNK:
+		buf_put_int32(bufp, fcall->params.tclunk.fid);
+		break;
+	case TREMOVE:
+		buf_put_int32(bufp, fcall->params.tremove.fid);
+		break;
+	case TSTAT:
+		buf_put_int32(bufp, fcall->params.tstat.fid);
+		break;
+	case TWSTAT:
+		buf_put_int32(bufp, fcall->params.twstat.fid);
+		stat = fcall->params.twstat.stat;
+
+		buf_put_int16(bufp, stat->size + 2);
+		serialize_stat(v9ses, stat, bufp);
+		break;
+	}
+
+	if (buf_check_overflow(bufp))
+		return -EIO;
+
+	return fcall->size;
+}
+
+/**
+ * deserialize_fcall - unmarshal a response
+ * @v9ses: session information
+ * @msgsize: size of rcall message
+ * @buf: recieved buffer
+ * @buflen: length of received buffer
+ * @rcall: fcall structure to populate
+ * @rcalllen: length of fcall structure to populate
+ *
+ */
+
+int
+v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize,
+		       void *buf, u32 buflen, struct v9fs_fcall *rcall,
+		       int rcalllen)
+{
+
+	struct cbuf buffer;
+	struct cbuf *bufp = &buffer;
+	struct cbuf dbuffer;
+	struct cbuf *dbufp = &dbuffer;
+	int i = 0;
+
+	buf_init(bufp, buf, buflen);
+	buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall),
+		 rcalllen - sizeof(struct v9fs_fcall));
+
+	rcall->size = msgsize;
+	rcall->id = buf_get_int8(bufp);
+	rcall->tag = buf_get_int16(bufp);
+
+	dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id,
+		rcall->tag);
+	switch (rcall->id) {
+	default:
+		eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id);
+		return -EPROTO;
+	case RVERSION:
+		rcall->params.rversion.msize = buf_get_int32(bufp);
+		rcall->params.rversion.version = buf_get_stringb(bufp, dbufp);
+		break;
+	case RFLUSH:
+		break;
+	case RATTACH:
+		rcall->params.rattach.qid.type = buf_get_int8(bufp);
+		rcall->params.rattach.qid.version = buf_get_int32(bufp);
+		rcall->params.rattach.qid.path = buf_get_int64(bufp);
+		break;
+	case RWALK:
+		rcall->params.rwalk.nwqid = buf_get_int16(bufp);
+		rcall->params.rwalk.wqids = buf_alloc(bufp,
+		      rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid));
+		if (rcall->params.rwalk.wqids)
+			for (i = 0; i < rcall->params.rwalk.nwqid; i++) {
+				rcall->params.rwalk.wqids[i].type =
+				    buf_get_int8(bufp);
+				rcall->params.rwalk.wqids[i].version =
+				    buf_get_int16(bufp);
+				rcall->params.rwalk.wqids[i].path =
+				    buf_get_int64(bufp);
+			}
+		break;
+	case ROPEN:
+		rcall->params.ropen.qid.type = buf_get_int8(bufp);
+		rcall->params.ropen.qid.version = buf_get_int32(bufp);
+		rcall->params.ropen.qid.path = buf_get_int64(bufp);
+		rcall->params.ropen.iounit = buf_get_int32(bufp);
+		break;
+	case RCREATE:
+		rcall->params.rcreate.qid.type = buf_get_int8(bufp);
+		rcall->params.rcreate.qid.version = buf_get_int32(bufp);
+		rcall->params.rcreate.qid.path = buf_get_int64(bufp);
+		rcall->params.rcreate.iounit = buf_get_int32(bufp);
+		break;
+	case RREAD:
+		rcall->params.rread.count = buf_get_int32(bufp);
+		rcall->params.rread.data = buf_get_datab(bufp, dbufp,
+			rcall->params.rread.count);
+		break;
+	case RWRITE:
+		rcall->params.rwrite.count = buf_get_int32(bufp);
+		break;
+	case RCLUNK:
+		break;
+	case RREMOVE:
+		break;
+	case RSTAT:
+		buf_get_int16(bufp);
+		rcall->params.rstat.stat =
+		    deserialize_statb(v9ses, bufp, dbufp);
+		break;
+	case RWSTAT:
+		break;
+	case RERROR:
+		rcall->params.rerror.error = buf_get_stringb(bufp, dbufp);
+		if (v9ses->extended)
+			rcall->params.rerror.errno = buf_get_int16(bufp);
+		break;
+	}
+
+	if (buf_check_overflow(bufp) || buf_check_overflow(dbufp))
+		return -EIO;
+
+	return rcall->size;
+}
diff --git a/fs/9p/conv.h b/fs/9p/conv.h
new file mode 100644
index 00000000000..ee849613c61
--- /dev/null
+++ b/fs/9p/conv.h
@@ -0,0 +1,36 @@
+/*
+ * linux/fs/9p/conv.h
+ *
+ * 9P protocol conversion definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+int v9fs_deserialize_stat(struct v9fs_session_info *, void *buf,
+			  u32 buflen, struct v9fs_stat *stat, u32 statlen);
+int v9fs_serialize_fcall(struct v9fs_session_info *, struct v9fs_fcall *tcall,
+			 void *buf, u32 buflen);
+int v9fs_deserialize_fcall(struct v9fs_session_info *, u32 msglen,
+			   void *buf, u32 buflen, struct v9fs_fcall *rcall,
+			   int rcalllen);
+
+/* this one is actually in error.c right now */
+int v9fs_errstr2errno(char *errstr);
-- 
cgit v1.2.3-70-g09d2


From 426cc91aa651b50713d06d45e5c3c3e90cfd40d9 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:22 -0700
Subject: [PATCH] v9fs: transport modules

This part of the patch contains transport routines.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/mux.c        | 440 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/9p/mux.h        |  39 +++++
 fs/9p/trans_fd.c   | 172 +++++++++++++++++++++
 fs/9p/trans_sock.c | 282 ++++++++++++++++++++++++++++++++++
 fs/9p/transport.h  |  46 ++++++
 fs/9p/v9fs.c       |   5 -
 6 files changed, 979 insertions(+), 5 deletions(-)
 create mode 100644 fs/9p/mux.c
 create mode 100644 fs/9p/mux.h
 create mode 100644 fs/9p/trans_fd.c
 create mode 100644 fs/9p/trans_sock.c
 create mode 100644 fs/9p/transport.h

(limited to 'fs')

diff --git a/fs/9p/mux.c b/fs/9p/mux.c
new file mode 100644
index 00000000000..8ebc1af2c24
--- /dev/null
+++ b/fs/9p/mux.c
@@ -0,0 +1,440 @@
+/*
+ * linux/fs/9p/mux.c
+ *
+ * Protocol Multiplexer
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2004 by Latchesar Ionkov <lucho@ionkov.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kthread.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "transport.h"
+#include "conv.h"
+#include "mux.h"
+
+/**
+ * dprintcond - print condition of session info
+ * @v9ses: session info structure
+ * @req: RPC request structure
+ *
+ */
+
+static inline int
+dprintcond(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+	dprintk(DEBUG_MUX, "condition: %d, %p\n", v9ses->transport->status,
+		req->rcall);
+	return 0;
+}
+
+/**
+ * xread - force read of a certain number of bytes
+ * @v9ses: session info structure
+ * @ptr: pointer to buffer
+ * @sz: number of bytes to read
+ *
+ * Chuck Cranor CS-533 project1
+ */
+
+static int xread(struct v9fs_session_info *v9ses, void *ptr, unsigned long sz)
+{
+	int rd = 0;
+	int ret = 0;
+	while (rd < sz) {
+		ret = v9ses->transport->read(v9ses->transport, ptr, sz - rd);
+		if (ret <= 0) {
+			dprintk(DEBUG_ERROR, "xread errno %d\n", ret);
+			return ret;
+		}
+		rd += ret;
+		ptr += ret;
+	}
+	return (rd);
+}
+
+/**
+ * read_message - read a full 9P2000 fcall packet
+ * @v9ses: session info structure
+ * @rcall: fcall structure to read into
+ * @rcalllen: size of fcall buffer
+ *
+ */
+
+static int
+read_message(struct v9fs_session_info *v9ses,
+	     struct v9fs_fcall *rcall, int rcalllen)
+{
+	unsigned char buf[4];
+	void *data;
+	int size = 0;
+	int res = 0;
+
+	res = xread(v9ses, buf, sizeof(buf));
+	if (res < 0) {
+		dprintk(DEBUG_ERROR,
+			"Reading of count field failed returned: %d\n", res);
+		return res;
+	}
+
+	if (res < 4) {
+		dprintk(DEBUG_ERROR,
+			"Reading of count field failed returned: %d\n", res);
+		return -EIO;
+	}
+
+	size = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
+	dprintk(DEBUG_MUX, "got a packet count: %d\n", size);
+
+	/* adjust for the four bytes of size */
+	size -= 4;
+
+	if (size > v9ses->maxdata) {
+		dprintk(DEBUG_ERROR, "packet too big: %d\n", size);
+		return -E2BIG;
+	}
+
+	data = kmalloc(size, GFP_KERNEL);
+	if (!data) {
+		eprintk(KERN_WARNING, "out of memory\n");
+		return -ENOMEM;
+	}
+
+	res = xread(v9ses, data, size);
+	if (res < size) {
+		dprintk(DEBUG_ERROR, "Reading of fcall failed returned: %d\n",
+			res);
+		kfree(data);
+		return res;
+	}
+
+	/* we now have an in-memory string that is the reply.
+	 * deserialize it. There is very little to go wrong at this point
+	 * save for v9fs_alloc errors.
+	 */
+	res = v9fs_deserialize_fcall(v9ses, size, data, v9ses->maxdata,
+				     rcall, rcalllen);
+
+	kfree(data);
+
+	if (res < 0)
+		return res;
+
+	return 0;
+}
+
+/**
+ * v9fs_recv - receive an RPC response for a particular tag
+ * @v9ses: session info structure
+ * @req: RPC request structure
+ *
+ */
+
+static int v9fs_recv(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+	int ret = 0;
+
+	dprintk(DEBUG_MUX, "waiting for response: %d\n", req->tcall->tag);
+	ret = wait_event_interruptible(v9ses->read_wait,
+		       ((v9ses->transport->status != Connected) ||
+			(req->rcall != 0) || dprintcond(v9ses, req)));
+
+	dprintk(DEBUG_MUX, "got it: rcall %p\n", req->rcall);
+	if (v9ses->transport->status == Disconnected)
+		return -ECONNRESET;
+
+	if (ret == 0) {
+		spin_lock(&v9ses->muxlock);
+		list_del(&req->next);
+		spin_unlock(&v9ses->muxlock);
+	}
+
+	return ret;
+}
+
+/**
+ * v9fs_send - send a 9P request
+ * @v9ses: session info structure
+ * @req: RPC request to send
+ *
+ */
+
+static int v9fs_send(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+	int ret = -1;
+	void *data = NULL;
+	struct v9fs_fcall *tcall = req->tcall;
+
+	data = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	tcall->size = 0;	/* enforce size recalculation */
+	ret =
+	    v9fs_serialize_fcall(v9ses, tcall, data,
+				 v9ses->maxdata + V9FS_IOHDRSZ);
+	if (ret < 0)
+		goto free_data;
+
+	spin_lock(&v9ses->muxlock);
+	list_add(&req->next, &v9ses->mux_fcalls);
+	spin_unlock(&v9ses->muxlock);
+
+	dprintk(DEBUG_MUX, "sending message: tag %d size %d\n", tcall->tag,
+		tcall->size);
+	ret = v9ses->transport->write(v9ses->transport, data, tcall->size);
+
+	if (ret != tcall->size) {
+		spin_lock(&v9ses->muxlock);
+		list_del(&req->next);
+		kfree(req->rcall);
+
+		spin_unlock(&v9ses->muxlock);
+		if (ret >= 0)
+			ret = -EREMOTEIO;
+	} else
+		ret = 0;
+
+      free_data:
+	kfree(data);
+	return ret;
+}
+
+/**
+ * v9fs_mux_rpc - send a request, receive a response
+ * @v9ses: session info structure
+ * @tcall: fcall to send
+ * @rcall: buffer to place response into
+ *
+ */
+
+long
+v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall,
+	     struct v9fs_fcall **rcall)
+{
+	int tid = -1;
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_rpcreq req;
+	int ret = -1;
+
+	if (!v9ses)
+		return -EINVAL;
+
+	if (rcall)
+		*rcall = NULL;
+
+	if (tcall->id != TVERSION) {
+		tid = v9fs_get_idpool(&v9ses->tidpool);
+		if (tid < 0)
+			return -ENOMEM;
+	}
+
+	tcall->tag = tid;
+
+	req.tcall = tcall;
+	req.rcall = NULL;
+
+	ret = v9fs_send(v9ses, &req);
+
+	if (ret < 0) {
+		if (tcall->id != TVERSION)
+			v9fs_put_idpool(tid, &v9ses->tidpool);
+		dprintk(DEBUG_MUX, "error %d\n", ret);
+		return ret;
+	}
+
+	ret = v9fs_recv(v9ses, &req);
+
+	fcall = req.rcall;
+
+	dprintk(DEBUG_MUX, "received: tag=%x, ret=%d\n", tcall->tag, ret);
+	if (ret == -ERESTARTSYS) {
+		if (v9ses->transport->status != Disconnected
+		    && tcall->id != TFLUSH) {
+			unsigned long flags;
+
+			dprintk(DEBUG_MUX, "flushing the tag: %d\n",
+				tcall->tag);
+			clear_thread_flag(TIF_SIGPENDING);
+			v9fs_t_flush(v9ses, tcall->tag);
+			spin_lock_irqsave(&current->sighand->siglock, flags);
+			recalc_sigpending();
+			spin_unlock_irqrestore(&current->sighand->siglock,
+					       flags);
+			dprintk(DEBUG_MUX, "flushing done\n");
+		}
+
+		goto release_req;
+	} else if (ret < 0)
+		goto release_req;
+
+	if (!fcall)
+		ret = -EIO;
+	else {
+		if (fcall->id == RERROR) {
+			ret = v9fs_errstr2errno(fcall->params.rerror.error);
+			if (ret == 0) {	/* string match failed */
+				if (fcall->params.rerror.errno)
+					ret = -(fcall->params.rerror.errno);
+				else
+					ret = -ESERVERFAULT;
+			}
+		} else if (fcall->id != tcall->id + 1) {
+			dprintk(DEBUG_ERROR,
+				"fcall mismatch: expected %d, got %d\n",
+				tcall->id + 1, fcall->id);
+			ret = -EIO;
+		}
+	}
+
+      release_req:
+	if (tcall->id != TVERSION)
+		v9fs_put_idpool(tid, &v9ses->tidpool);
+	if (rcall)
+		*rcall = fcall;
+	else
+		kfree(fcall);
+
+	return ret;
+}
+
+/**
+ * v9fs_recvproc - kproc to handle demultiplexing responses
+ * @data: session info structure
+ *
+ */
+
+static int v9fs_recvproc(void *data)
+{
+	struct v9fs_session_info *v9ses = (struct v9fs_session_info *)data;
+	struct v9fs_fcall *rcall = NULL;
+	struct v9fs_rpcreq *rptr;
+	struct v9fs_rpcreq *req;
+	struct v9fs_rpcreq *rreq;
+	int err = 0;
+
+	allow_signal(SIGKILL);
+	set_current_state(TASK_INTERRUPTIBLE);
+	complete(&v9ses->proccmpl);
+	while (!kthread_should_stop() && err >= 0) {
+		req = rptr = rreq = NULL;
+
+		rcall = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL);
+		if (!rcall) {
+			eprintk(KERN_ERR, "no memory for buffers\n");
+			break;
+		}
+
+		err = read_message(v9ses, rcall, v9ses->maxdata + V9FS_IOHDRSZ);
+		if (err < 0) {
+			kfree(rcall);
+			break;
+		}
+		spin_lock(&v9ses->muxlock);
+		list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+			if (rreq->tcall->tag == rcall->tag) {
+				req = rreq;
+				req->rcall = rcall;
+				break;
+			}
+		}
+
+		if (req && (req->tcall->id == TFLUSH)) {
+			struct v9fs_rpcreq *treq = NULL;
+			list_for_each_entry_safe(treq, rptr, &v9ses->mux_fcalls, next) {
+				if (treq->tcall->tag ==
+				    req->tcall->params.tflush.oldtag) {
+					list_del(&rptr->next);
+					kfree(treq->rcall);
+					break;
+				}
+			}
+		}
+
+		spin_unlock(&v9ses->muxlock);
+
+		if (!req) {
+			dprintk(DEBUG_ERROR,
+				"unexpected response: id %d tag %d\n",
+				rcall->id, rcall->tag);
+
+			kfree(rcall);
+		}
+
+		wake_up_all(&v9ses->read_wait);
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+
+	/* Inform all pending processes about the failure */
+	wake_up_all(&v9ses->read_wait);
+
+	if (signal_pending(current))
+		complete(&v9ses->proccmpl);
+
+	dprintk(DEBUG_MUX, "recvproc: end\n");
+	v9ses->recvproc = NULL;
+
+	return err >= 0;
+}
+
+/**
+ * v9fs_mux_init - initialize multiplexer (spawn kproc)
+ * @v9ses: session info structure
+ * @dev_name: mount device information (to create unique kproc)
+ *
+ */
+
+int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name)
+{
+	char procname[60];
+
+	strncpy(procname, dev_name, sizeof(procname));
+	procname[sizeof(procname) - 1] = 0;
+
+	init_waitqueue_head(&v9ses->read_wait);
+	init_completion(&v9ses->fcread);
+	init_completion(&v9ses->proccmpl);
+	spin_lock_init(&v9ses->muxlock);
+	INIT_LIST_HEAD(&v9ses->mux_fcalls);
+	v9ses->recvproc = NULL;
+	v9ses->curfcall = NULL;
+
+	v9ses->recvproc = kthread_create(v9fs_recvproc, v9ses,
+					 "v9fs_recvproc %s", procname);
+
+	if (IS_ERR(v9ses->recvproc)) {
+		eprintk(KERN_ERR, "cannot create receiving thread\n");
+		v9fs_session_close(v9ses);
+		return -ECONNABORTED;
+	}
+
+	wake_up_process(v9ses->recvproc);
+	wait_for_completion(&v9ses->proccmpl);
+
+	return 0;
+}
diff --git a/fs/9p/mux.h b/fs/9p/mux.h
new file mode 100644
index 00000000000..d7d8fa1c152
--- /dev/null
+++ b/fs/9p/mux.h
@@ -0,0 +1,39 @@
+/*
+ * linux/fs/9p/mux.h
+ *
+ * Multiplexer Definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+/* structure to manage each RPC transaction */
+
+struct v9fs_rpcreq {
+	struct v9fs_fcall *tcall;
+	struct v9fs_fcall *rcall;
+
+	/* XXX - could we put scatter/gather buffers here? */
+
+	struct list_head next;
+};
+
+int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name);
+long v9fs_mux_rpc(struct v9fs_session_info *v9ses,
+		  struct v9fs_fcall *tcall, struct v9fs_fcall **rcall);
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
new file mode 100644
index 00000000000..63b58ce98ff
--- /dev/null
+++ b/fs/9p/trans_fd.c
@@ -0,0 +1,172 @@
+/*
+ * linux/fs/9p/trans_fd.c
+ *
+ * File Descriptor Transport Layer
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <asm/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "transport.h"
+
+struct v9fs_trans_fd {
+	struct file *in_file;
+	struct file *out_file;
+};
+
+/**
+ * v9fs_fd_recv - receive from a socket
+ * @v9ses: session information
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ */
+
+static int v9fs_fd_recv(struct v9fs_transport *trans, void *v, int len)
+{
+	struct v9fs_trans_fd *ts = trans ? trans->priv : NULL;
+
+	if (!trans || trans->status != Connected || !ts)
+		return -EIO;
+
+	return kernel_read(ts->in_file, ts->in_file->f_pos, v, len);
+}
+
+/**
+ * v9fs_fd_send - send to a socket
+ * @v9ses: session information
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ */
+
+static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len)
+{
+	struct v9fs_trans_fd *ts = trans ? trans->priv : NULL;
+	mm_segment_t oldfs = get_fs();
+	int ret = 0;
+
+	if (!trans || trans->status != Connected || !ts)
+		return -EIO;
+
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos);
+	set_fs(oldfs);
+
+	return ret;
+}
+
+/**
+ * v9fs_fd_init - initialize file descriptor transport
+ * @v9ses: session information
+ * @addr: address of server to mount
+ * @data: mount options
+ *
+ */
+
+static int
+v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
+{
+	struct v9fs_trans_fd *ts = NULL;
+	struct v9fs_transport *trans = v9ses->transport;
+
+	if((v9ses->wfdno == ~0) || (v9ses->rfdno == ~0)) {
+		printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n");
+		return -ENOPROTOOPT;
+	}
+
+	sema_init(&trans->writelock, 1);
+	sema_init(&trans->readlock, 1);
+
+	ts = kmalloc(sizeof(struct v9fs_trans_fd), GFP_KERNEL);
+
+	if (!ts)
+		return -ENOMEM;
+
+	ts->in_file = fget( v9ses->rfdno );
+	ts->out_file = fget( v9ses->wfdno );
+
+	if (!ts->in_file || !ts->out_file) {
+		if (ts->in_file)
+			fput(ts->in_file);
+
+		if (ts->out_file)
+			fput(ts->out_file);
+
+		kfree(ts);
+		return -EIO;
+	}
+
+	trans->priv = ts;
+	trans->status = Connected;
+
+	return 0;
+}
+
+
+/**
+ * v9fs_fd_close - shutdown file descriptor
+ * @trans: private socket structure
+ *
+ */
+
+static void v9fs_fd_close(struct v9fs_transport *trans)
+{
+	struct v9fs_trans_fd *ts;
+
+	if (!trans)
+		return;
+
+	trans->status = Disconnected;
+	ts = trans->priv;
+
+	if (!ts)
+		return;
+
+	if (ts->in_file)
+		fput(ts->in_file);
+
+	if (ts->out_file)
+		fput(ts->out_file);
+
+	kfree(ts);
+}
+
+struct v9fs_transport v9fs_trans_fd = {
+	.init = v9fs_fd_init,
+	.write = v9fs_fd_send,
+	.read = v9fs_fd_recv,
+	.close = v9fs_fd_close,
+};
+
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
new file mode 100644
index 00000000000..081d1c84780
--- /dev/null
+++ b/fs/9p/trans_sock.c
@@ -0,0 +1,282 @@
+/*
+ * linux/fs/9p/trans_socket.c
+ *
+ * Socket Transport Layer
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ *  Copyright (C) 1995, 1996 by Olaf Kirch <okir@monad.swb.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <asm/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "transport.h"
+
+#define V9FS_PORT 564
+
+struct v9fs_trans_sock {
+	struct socket *s;
+};
+
+/**
+ * v9fs_sock_recv - receive from a socket
+ * @v9ses: session information
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ */
+
+static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len)
+{
+	struct msghdr msg;
+	struct kvec iov;
+	int result;
+	mm_segment_t oldfs;
+	struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
+
+	if (trans->status == Disconnected)
+		return -EREMOTEIO;
+
+	result = -EINVAL;
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+
+	iov.iov_base = v;
+	iov.iov_len = len;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+	msg.msg_flags = MSG_NOSIGNAL;
+
+	result = kernel_recvmsg(ts->s, &msg, &iov, 1, len, 0);
+
+	dprintk(DEBUG_TRANS, "socket state %d\n", ts->s->state);
+	set_fs(oldfs);
+
+	if (result <= 0) {
+		if (result != -ERESTARTSYS)
+			trans->status = Disconnected;
+	}
+
+	return result;
+}
+
+/**
+ * v9fs_sock_send - send to a socket
+ * @v9ses: session information
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ */
+
+static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len)
+{
+	struct kvec iov;
+	struct msghdr msg;
+	int result = -1;
+	mm_segment_t oldfs;
+	struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
+
+	dprintk(DEBUG_TRANS, "Sending packet size %d (%x)\n", len, len);
+	dump_data(v, len);
+
+	down(&trans->writelock);
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	iov.iov_base = v;
+	iov.iov_len = len;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+	msg.msg_flags = MSG_NOSIGNAL;
+	result = kernel_sendmsg(ts->s, &msg, &iov, 1, len);
+	set_fs(oldfs);
+
+	if (result < 0) {
+		if (result != -ERESTARTSYS)
+			trans->status = Disconnected;
+	}
+
+	up(&trans->writelock);
+	return result;
+}
+
+/**
+ * v9fs_tcp_init - initialize TCP socket
+ * @v9ses: session information
+ * @addr: address of server to mount
+ * @data: mount options
+ *
+ */
+
+static int
+v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
+{
+	struct socket *csocket = NULL;
+	struct sockaddr_in sin_server;
+	int rc = 0;
+	struct v9fs_trans_sock *ts = NULL;
+	struct v9fs_transport *trans = v9ses->transport;
+
+	sema_init(&trans->writelock, 1);
+	sema_init(&trans->readlock, 1);
+
+	ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL);
+
+	if (!ts)
+		return -ENOMEM;
+
+	trans->priv = ts;
+	ts->s = NULL;
+
+	if (!addr)
+		return -EINVAL;
+
+	dprintk(DEBUG_TRANS, "Connecting to %s\n", addr);
+
+	sin_server.sin_family = AF_INET;
+	sin_server.sin_addr.s_addr = in_aton(addr);
+	sin_server.sin_port = htons(v9ses->port);
+	sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket);
+	rc = csocket->ops->connect(csocket,
+				   (struct sockaddr *)&sin_server,
+				   sizeof(struct sockaddr_in), 0);
+	if (rc < 0) {
+		eprintk(KERN_ERR,
+			"v9fs_trans_tcp: problem connecting socket to %s\n",
+			addr);
+		return rc;
+	}
+	csocket->sk->sk_allocation = GFP_NOIO;
+	ts->s = csocket;
+	trans->status = Connected;
+
+	return 0;
+}
+
+/**
+ * v9fs_unix_init - initialize UNIX domain socket
+ * @v9ses: session information
+ * @dev_name: path to named pipe
+ * @data: mount options
+ *
+ */
+
+static int
+v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name,
+	       char *data)
+{
+	int rc;
+	struct socket *csocket;
+	struct sockaddr_un sun_server;
+	struct v9fs_transport *trans;
+	struct v9fs_trans_sock *ts;
+
+	rc = 0;
+	csocket = NULL;
+	trans = v9ses->transport;
+
+	if (strlen(dev_name) > UNIX_PATH_MAX) {
+		eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n",
+			dev_name);
+		return -ENOMEM;
+	}
+
+	ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL);
+	if (!ts)
+		return -ENOMEM;
+
+	trans->priv = ts;
+	ts->s = NULL;
+
+	sema_init(&trans->writelock, 1);
+	sema_init(&trans->readlock, 1);
+
+	sun_server.sun_family = PF_UNIX;
+	strcpy(sun_server.sun_path, dev_name);
+	sock_create_kern(PF_UNIX, SOCK_STREAM, 0, &csocket);
+	rc = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server,
+		sizeof(struct sockaddr_un) - 1, 0);	/* -1 *is* important */
+	if (rc < 0) {
+		eprintk(KERN_ERR,
+			"v9fs_trans_unix: problem connecting socket: %s: %d\n",
+			dev_name, rc);
+		return rc;
+	}
+	csocket->sk->sk_allocation = GFP_NOIO;
+	ts->s = csocket;
+	trans->status = Connected;
+
+	return 0;
+}
+
+/**
+ * v9fs_sock_close - shutdown socket
+ * @trans: private socket structure
+ *
+ */
+
+static void v9fs_sock_close(struct v9fs_transport *trans)
+{
+	struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
+
+	if ((ts) && (ts->s)) {
+		dprintk(DEBUG_TRANS, "closing the socket %p\n", ts->s);
+		sock_release(ts->s);
+		ts->s = NULL;
+		trans->status = Disconnected;
+		dprintk(DEBUG_TRANS, "socket closed\n");
+	}
+
+	kfree(ts);
+}
+
+struct v9fs_transport v9fs_trans_tcp = {
+	.init = v9fs_tcp_init,
+	.write = v9fs_sock_send,
+	.read = v9fs_sock_recv,
+	.close = v9fs_sock_close,
+};
+
+struct v9fs_transport v9fs_trans_unix = {
+	.init = v9fs_unix_init,
+	.write = v9fs_sock_send,
+	.read = v9fs_sock_recv,
+	.close = v9fs_sock_close,
+};
diff --git a/fs/9p/transport.h b/fs/9p/transport.h
new file mode 100644
index 00000000000..9e9cd418efd
--- /dev/null
+++ b/fs/9p/transport.h
@@ -0,0 +1,46 @@
+/*
+ * linux/fs/9p/transport.h
+ *
+ * Transport Definition
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+enum v9fs_transport_status {
+	Connected,
+	Disconnected,
+	Hung,
+};
+
+struct v9fs_transport {
+	enum v9fs_transport_status status;
+	struct semaphore writelock;
+	struct semaphore readlock;
+	void *priv;
+
+	int (*init) (struct v9fs_session_info *, const char *, char *);
+	int (*write) (struct v9fs_transport *, void *, int);
+	int (*read) (struct v9fs_transport *, void *, int);
+	void (*close) (struct v9fs_transport *);
+};
+
+extern struct v9fs_transport v9fs_trans_tcp;
+extern struct v9fs_transport v9fs_trans_unix;
+extern struct v9fs_transport v9fs_trans_fd;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 14d663ebfcb..a573b751dd9 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -296,11 +296,6 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
 	case PROTO_FD:
 		trans_proto = &v9fs_trans_fd;
 		*v9ses->remotename = 0;
-		if((v9ses->wfdno == ~0) || (v9ses->rfdno == ~0)) {
-			printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n");
-			retval = -ENOPROTOOPT;
-			goto SessCleanUp;
-		}
 		break;
 	default:
 		printk(KERN_ERR "v9fs: Bad mount protocol %d\n", v9ses->proto);
-- 
cgit v1.2.3-70-g09d2


From 322b329ab787de5f45abca9c9eabfd33bc5927e8 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:23 -0700
Subject: [PATCH] v9fs: Support to force umount

Support for force umount

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/mux.c       | 20 ++++++++++++++++++++
 fs/9p/mux.h       |  1 +
 fs/9p/v9fs.c      |  9 +++++++++
 fs/9p/v9fs.h      |  4 +---
 fs/9p/vfs_super.c |  9 +++++++++
 5 files changed, 40 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 8ebc1af2c24..0854bef58c1 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -323,6 +323,26 @@ v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall,
 	return ret;
 }
 
+/**
+ * v9fs_mux_cancel_requests - cancels all pending requests
+ *
+ * @v9ses: session info structure
+ * @err: error code to return to the requests
+ */
+void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err)
+{
+	struct v9fs_rpcreq *rptr;
+	struct v9fs_rpcreq *rreq;
+
+	dprintk(DEBUG_MUX, " %d\n", err);
+	spin_lock(&v9ses->muxlock);
+	list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+		rreq->err = err;
+	}
+	spin_unlock(&v9ses->muxlock);
+	wake_up_all(&v9ses->read_wait);
+}
+
 /**
  * v9fs_recvproc - kproc to handle demultiplexing responses
  * @data: session info structure
diff --git a/fs/9p/mux.h b/fs/9p/mux.h
index d7d8fa1c152..82ce793af1b 100644
--- a/fs/9p/mux.h
+++ b/fs/9p/mux.h
@@ -37,3 +37,4 @@ struct v9fs_rpcreq {
 int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name);
 long v9fs_mux_rpc(struct v9fs_session_info *v9ses,
 		  struct v9fs_fcall *tcall, struct v9fs_fcall **rcall);
+void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index a573b751dd9..13bdbbab438 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -409,6 +409,15 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 	putname(v9ses->remotename);
 }
 
+/**
+ * v9fs_session_cancel - mark transport as disconnected
+ * 	and cancel all pending requests.
+ */
+void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
+	v9ses->transport->status = Disconnected;
+	v9fs_mux_cancel_requests(v9ses, -EIO);
+}
+
 extern int v9fs_error_init(void);
 
 /**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 52203027b15..45dcef42bdd 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -89,9 +89,7 @@ struct v9fs_session_info *v9fs_inode2v9ses(struct inode *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 int v9fs_get_idpool(struct v9fs_idpool *p);
 void v9fs_put_idpool(int id, struct v9fs_idpool *p);
-int v9fs_get_option(char *opts, char *name, char *buf, int buflen);
-long long v9fs_get_int_option(char *opts, char *name, long long dflt);
-int v9fs_parse_tcp_devname(const char *devname, char **addr, char **remotename);
+void v9fs_session_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index ce0778acc90..868f350b2c5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -257,10 +257,19 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
+static void
+v9fs_umount_begin(struct super_block *sb)
+{
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	v9fs_session_cancel(v9ses);
+}
+
 static struct super_operations v9fs_super_ops = {
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
 	.show_options = v9fs_show_options,
+	.umount_begin = v9fs_umount_begin,
 };
 
 struct file_system_type v9fs_fs_type = {
-- 
cgit v1.2.3-70-g09d2


From 3ed8491c8a75cefe95b57f7f428a3e2ddd421e97 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:24 -0700
Subject: [PATCH] v9fs: debug and support routines

This part of the patch contains debug and other misc routines.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/debug.h |  70 +++++++++++++++++
 fs/9p/error.c |  93 ++++++++++++++++++++++
 fs/9p/error.h | 181 +++++++++++++++++++++++++++++++++++++++++++
 fs/9p/fid.c   | 241 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/9p/fid.h   |  57 ++++++++++++++
 5 files changed, 642 insertions(+)
 create mode 100644 fs/9p/debug.h
 create mode 100644 fs/9p/error.c
 create mode 100644 fs/9p/error.h
 create mode 100644 fs/9p/fid.c
 create mode 100644 fs/9p/fid.h

(limited to 'fs')

diff --git a/fs/9p/debug.h b/fs/9p/debug.h
new file mode 100644
index 00000000000..4445f06919d
--- /dev/null
+++ b/fs/9p/debug.h
@@ -0,0 +1,70 @@
+/*
+ *  linux/fs/9p/debug.h - V9FS Debug Definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#define DEBUG_ERROR		(1<<0)
+#define DEBUG_CURRENT		(1<<1)
+#define DEBUG_9P	        (1<<2)
+#define DEBUG_VFS	        (1<<3)
+#define DEBUG_CONV		(1<<4)
+#define DEBUG_MUX		(1<<5)
+#define DEBUG_TRANS		(1<<6)
+#define DEBUG_SLABS	      	(1<<7)
+
+#define DEBUG_DUMP_PKT		0
+
+extern int v9fs_debug_level;
+
+#define dprintk(level, format, arg...) \
+do {  \
+	if((v9fs_debug_level & level)==level) \
+		printk(KERN_NOTICE "-- %s (%d): " \
+		format , __FUNCTION__, current->pid , ## arg); \
+} while(0)
+
+#define eprintk(level, format, arg...) \
+do { \
+	printk(level "v9fs: %s (%d): " \
+		format , __FUNCTION__, current->pid , ## arg); \
+} while(0)
+
+#if DEBUG_DUMP_PKT
+static inline void dump_data(const unsigned char *data, unsigned int datalen)
+{
+	int i, j;
+	int len = datalen;
+
+	printk(KERN_DEBUG "data ");
+	for (i = 0; i < len; i += 4) {
+		for (j = 0; (j < 4) && (i + j < len); j++)
+			printk(KERN_DEBUG "%02x", data[i + j]);
+		printk(KERN_DEBUG " ");
+	}
+	printk(KERN_DEBUG "\n");
+}
+#else				/* DEBUG_DUMP_PKT */
+static inline void dump_data(const unsigned char *data, unsigned int datalen)
+{
+
+}
+#endif				/* DEBUG_DUMP_PKT */
diff --git a/fs/9p/error.c b/fs/9p/error.c
new file mode 100644
index 00000000000..fee5d19179c
--- /dev/null
+++ b/fs/9p/error.c
@@ -0,0 +1,93 @@
+/*
+ * linux/fs/9p/error.c
+ *
+ * Error string handling
+ *
+ * Plan 9 uses error strings, Unix uses error numbers.  These functions
+ * try to help manage that and provide for dynamically adding error
+ * mappings.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/list.h>
+#include <linux/jhash.h>
+
+#include "debug.h"
+#include "error.h"
+
+/**
+ * v9fs_error_init - preload
+ * @errstr: error string
+ *
+ */
+
+int v9fs_error_init(void)
+{
+	struct errormap *c;
+	int bucket;
+
+	/* initialize hash table */
+	for (bucket = 0; bucket < ERRHASHSZ; bucket++)
+		INIT_HLIST_HEAD(&hash_errmap[bucket]);
+
+	/* load initial error map into hash table */
+	for (c = errmap; c->name != NULL; c++) {
+		bucket = jhash(c->name, strlen(c->name), 0) % ERRHASHSZ;
+		INIT_HLIST_NODE(&c->list);
+		hlist_add_head(&c->list, &hash_errmap[bucket]);
+	}
+
+	return 1;
+}
+
+/**
+ * errstr2errno - convert error string to error number
+ * @errstr: error string
+ *
+ */
+
+int v9fs_errstr2errno(char *errstr)
+{
+	int errno = 0;
+	struct hlist_node *p = NULL;
+	struct errormap *c = NULL;
+	int bucket = jhash(errstr, strlen(errstr), 0) % ERRHASHSZ;
+
+	hlist_for_each_entry(c, p, &hash_errmap[bucket], list) {
+		if (!strcmp(c->name, errstr)) {
+			errno = c->val;
+			break;
+		}
+	}
+
+	if (errno == 0) {
+		/* TODO: if error isn't found, add it dynamically */
+		printk(KERN_ERR "%s: errstr :%s: not found\n", __FUNCTION__,
+		       errstr);
+		errno = 1;
+	}
+
+	return -errno;
+}
diff --git a/fs/9p/error.h b/fs/9p/error.h
new file mode 100644
index 00000000000..4bf2cf5aa1b
--- /dev/null
+++ b/fs/9p/error.h
@@ -0,0 +1,181 @@
+/*
+ * linux/fs/9p/error.h
+ *
+ * Huge Nasty Error Table
+ *
+ * Plan 9 uses error strings, Unix uses error numbers.  This table tries to
+ * match UNIX strings and Plan 9 strings to unix error numbers.  It is used
+ * to preload the dynamic error table which can also track user-specific error
+ * strings.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/errno.h>
+
+struct errormap {
+	char *name;
+	int val;
+
+	struct hlist_node list;
+};
+
+#define ERRHASHSZ		32
+static struct hlist_head hash_errmap[ERRHASHSZ];
+
+/* FixMe - reduce to a reasonable size */
+static struct errormap errmap[] = {
+	{"Operation not permitted", 1},
+	{"wstat prohibited", 1},
+	{"No such file or directory", 2},
+	{"file not found", 2},
+	{"Interrupted system call", 4},
+	{"Input/output error", 5},
+	{"No such device or address", 6},
+	{"Argument list too long", 7},
+	{"Bad file descriptor", 9},
+	{"Resource temporarily unavailable", 11},
+	{"Cannot allocate memory", 12},
+	{"Permission denied", 13},
+	{"Bad address", 14},
+	{"Block device required", 15},
+	{"Device or resource busy", 16},
+	{"File exists", 17},
+	{"Invalid cross-device link", 18},
+	{"No such device", 19},
+	{"Not a directory", 20},
+	{"Is a directory", 21},
+	{"Invalid argument", 22},
+	{"Too many open files in system", 23},
+	{"Too many open files", 24},
+	{"Text file busy", 26},
+	{"File too large", 27},
+	{"No space left on device", 28},
+	{"Illegal seek", 29},
+	{"Read-only file system", 30},
+	{"Too many links", 31},
+	{"Broken pipe", 32},
+	{"Numerical argument out of domain", 33},
+	{"Numerical result out of range", 34},
+	{"Resource deadlock avoided", 35},
+	{"File name too long", 36},
+	{"No locks available", 37},
+	{"Function not implemented", 38},
+	{"Directory not empty", 39},
+	{"Too many levels of symbolic links", 40},
+	{"Unknown error 41", 41},
+	{"No message of desired type", 42},
+	{"Identifier removed", 43},
+	{"File locking deadlock error", 58},
+	{"No data available", 61},
+	{"Machine is not on the network", 64},
+	{"Package not installed", 65},
+	{"Object is remote", 66},
+	{"Link has been severed", 67},
+	{"Communication error on send", 70},
+	{"Protocol error", 71},
+	{"Bad message", 74},
+	{"File descriptor in bad state", 77},
+	{"Streams pipe error", 86},
+	{"Too many users", 87},
+	{"Socket operation on non-socket", 88},
+	{"Message too long", 90},
+	{"Protocol not available", 92},
+	{"Protocol not supported", 93},
+	{"Socket type not supported", 94},
+	{"Operation not supported", 95},
+	{"Protocol family not supported", 96},
+	{"Network is down", 100},
+	{"Network is unreachable", 101},
+	{"Network dropped connection on reset", 102},
+	{"Software caused connection abort", 103},
+	{"Connection reset by peer", 104},
+	{"No buffer space available", 105},
+	{"Transport endpoint is already connected", 106},
+	{"Transport endpoint is not connected", 107},
+	{"Cannot send after transport endpoint shutdown", 108},
+	{"Connection timed out", 110},
+	{"Connection refused", 111},
+	{"Host is down", 112},
+	{"No route to host", 113},
+	{"Operation already in progress", 114},
+	{"Operation now in progress", 115},
+	{"Is a named type file", 120},
+	{"Remote I/O error", 121},
+	{"Disk quota exceeded", 122},
+	{"Operation canceled", 125},
+	{"Unknown error 126", 126},
+	{"Unknown error 127", 127},
+/* errors from fossil, vacfs, and u9fs */
+	{"fid unknown or out of range", EBADF},
+	{"permission denied", EACCES},
+	{"file does not exist", ENOENT},
+	{"authentication failed", ECONNREFUSED},
+	{"bad offset in directory read", ESPIPE},
+	{"bad use of fid", EBADF},
+	{"wstat can't convert between files and directories", EPERM},
+	{"directory is not empty", ENOTEMPTY},
+	{"file exists", EEXIST},
+	{"file already exists", EEXIST},
+	{"file or directory already exists", EEXIST},
+	{"fid already in use", EBADF},
+	{"file in use", ETXTBSY},
+	{"i/o error", EIO},
+	{"file already open for I/O", ETXTBSY},
+	{"illegal mode", EINVAL},
+	{"illegal name", ENAMETOOLONG},
+	{"not a directory", ENOTDIR},
+	{"not a member of proposed group", EINVAL},
+	{"not owner", EACCES},
+	{"only owner can change group in wstat", EACCES},
+	{"read only file system", EROFS},
+	{"no access to special file", EPERM},
+	{"i/o count too large", EIO},
+	{"unknown group", EINVAL},
+	{"unknown user", EINVAL},
+	{"bogus wstat buffer", EPROTO},
+	{"exclusive use file already open", EAGAIN},
+	{"corrupted directory entry", EIO},
+	{"corrupted file entry", EIO},
+	{"corrupted block label", EIO},
+	{"corrupted meta data", EIO},
+	{"illegal offset", EINVAL},
+	{"illegal path element", ENOENT},
+	{"root of file system is corrupted", EIO},
+	{"corrupted super block", EIO},
+	{"protocol botch", EPROTO},
+	{"file system is full", ENOSPC},
+	{"file is in use", EAGAIN},
+	{"directory entry is not allocated", ENOENT},
+	{"file is read only", EROFS},
+	{"file has been removed", EIDRM},
+	{"only support truncation to zero length", EPERM},
+	{"cannot remove root", EPERM},
+	{"file too big", EFBIG},
+	{"venti i/o error", EIO},
+	/* these are not errors */
+	{"u9fs rhostsauth: no authentication required", 0},
+	{"u9fs authnone: no authentication required", 0},
+	{NULL, -1}
+};
+
+extern int v9fs_error_init(void);
+extern int v9fs_errstr2errno(char *errstr);
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
new file mode 100644
index 00000000000..821c9c4d76a
--- /dev/null
+++ b/fs/9p/fid.c
@@ -0,0 +1,241 @@
+/*
+ * V9FS FID Management
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "transport.h"
+#include "mux.h"
+#include "conv.h"
+#include "fid.h"
+
+/**
+ * v9fs_fid_insert - add a fid to a dentry
+ * @fid: fid to add
+ * @dentry: dentry that it is being added to
+ *
+ */
+
+static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
+{
+	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
+	dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid,
+		dentry->d_iname, dentry);
+	if (dentry->d_fsdata == NULL) {
+		dentry->d_fsdata =
+		    kmalloc(sizeof(struct list_head), GFP_KERNEL);
+		if (dentry->d_fsdata == NULL) {
+			dprintk(DEBUG_ERROR, "Out of memory\n");
+			return -ENOMEM;
+		}
+		fid_list = (struct list_head *)dentry->d_fsdata;
+		INIT_LIST_HEAD(fid_list);	/* Initialize list head */
+	}
+
+	fid->uid = current->uid;
+	fid->pid = current->pid;
+	list_add(&fid->list, fid_list);
+	return 0;
+}
+
+/**
+ * v9fs_fid_create - allocate a FID structure
+ * @dentry - dentry to link newly created fid to
+ *
+ */
+
+struct v9fs_fid *v9fs_fid_create(struct dentry *dentry)
+{
+	struct v9fs_fid *new;
+
+	new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
+	if (new == NULL) {
+		dprintk(DEBUG_ERROR, "Out of Memory\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	new->fid = -1;
+	new->fidopen = 0;
+	new->fidcreate = 0;
+	new->fidclunked = 0;
+	new->iounit = 0;
+
+	if (v9fs_fid_insert(new, dentry) == 0)
+		return new;
+	else {
+		dprintk(DEBUG_ERROR, "Problems inserting to dentry\n");
+		kfree(new);
+		return NULL;
+	}
+}
+
+/**
+ * v9fs_fid_destroy - deallocate a FID structure
+ * @fid: fid to destroy
+ *
+ */
+
+void v9fs_fid_destroy(struct v9fs_fid *fid)
+{
+	list_del(&fid->list);
+	kfree(fid);
+}
+
+/**
+ * v9fs_fid_lookup - retrieve the right fid from a  particular dentry
+ * @dentry: dentry to look for fid in
+ * @type: intent of lookup (operation or traversal)
+ *
+ * search list of fids associated with a dentry for a fid with a matching
+ * thread id or uid.  If that fails, look up the dentry's parents to see if you
+ * can find a matching fid.
+ *
+ */
+
+struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type)
+{
+	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
+	struct v9fs_fid *current_fid = NULL;
+	struct v9fs_fid *temp = NULL;
+	struct v9fs_fid *return_fid = NULL;
+	int found_parent = 0;
+	int found_user = 0;
+
+	dprintk(DEBUG_9P, " dentry: %s (%p) type %d\n", dentry->d_iname, dentry,
+		type);
+
+	if (fid_list && !list_empty(fid_list)) {
+		list_for_each_entry_safe(current_fid, temp, fid_list, list) {
+			if (current_fid->uid == current->uid) {
+				if (return_fid == NULL) {
+					if ((type == FID_OP)
+					    || (!current_fid->fidopen)) {
+						return_fid = current_fid;
+						found_user = 1;
+					}
+				}
+			}
+			if (current_fid->pid == current->real_parent->pid) {
+				if ((return_fid == NULL) || (found_parent)
+				    || (found_user)) {
+					if ((type == FID_OP)
+					    || (!current_fid->fidopen)) {
+						return_fid = current_fid;
+						found_parent = 1;
+						found_user = 0;
+					}
+				}
+			}
+			if (current_fid->pid == current->pid) {
+				if ((type == FID_OP) ||
+				    (!current_fid->fidopen)) {
+					return_fid = current_fid;
+					found_parent = 0;
+					found_user = 0;
+				}
+			}
+		}
+	}
+
+	/* we are at the root but didn't match */
+	if ((!return_fid) && (dentry->d_parent == dentry)) {
+		/* TODO: clone attach with new uid */
+		return_fid = current_fid;
+	}
+
+	if (!return_fid) {
+		struct dentry *par = current->fs->pwd->d_parent;
+		int count = 1;
+		while (par != NULL) {
+			if (par == dentry)
+				break;
+			count++;
+			if (par == par->d_parent) {
+				dprintk(DEBUG_ERROR,
+					"got to root without finding dentry\n");
+				break;
+			}
+			par = par->d_parent;
+		}
+
+/* XXX - there may be some duplication we can get rid of */
+		if (par == dentry) {
+			/* we need to fid_lookup the starting point */
+			int fidnum = -1;
+			int oldfid = -1;
+			int result = -1;
+			struct v9fs_session_info *v9ses =
+			    v9fs_inode2v9ses(current->fs->pwd->d_inode);
+
+			current_fid =
+			    v9fs_fid_lookup(current->fs->pwd, FID_WALK);
+			if (current_fid == NULL) {
+				dprintk(DEBUG_ERROR,
+					"process cwd doesn't have a fid\n");
+				return return_fid;
+			}
+			oldfid = current_fid->fid;
+			par = current->fs->pwd;
+			/* TODO: take advantage of multiwalk */
+
+			fidnum = v9fs_get_idpool(&v9ses->fidpool);
+			if (fidnum < 0) {
+				dprintk(DEBUG_ERROR,
+					"could not get a new fid num\n");
+				return return_fid;
+			}
+
+			while (par != dentry) {
+				result =
+				    v9fs_t_walk(v9ses, oldfid, fidnum, "..",
+						NULL);
+				if (result < 0) {
+					dprintk(DEBUG_ERROR,
+						"problem walking to parent\n");
+
+					break;
+				}
+				oldfid = fidnum;
+				if (par == par->d_parent) {
+					dprintk(DEBUG_ERROR,
+						"can't find dentry\n");
+					break;
+				}
+				par = par->d_parent;
+			}
+			if (par == dentry) {
+				return_fid = v9fs_fid_create(dentry);
+				return_fid->fid = fidnum;
+			}
+		}
+	}
+
+	return return_fid;
+}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
new file mode 100644
index 00000000000..7db478ccca3
--- /dev/null
+++ b/fs/9p/fid.h
@@ -0,0 +1,57 @@
+/*
+ * V9FS FID Management
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/list.h>
+
+#define FID_OP   0
+#define FID_WALK 1
+
+struct v9fs_fid {
+	struct list_head list;	 /* list of fids associated with a dentry */
+	struct list_head active; /* XXX - debug */
+
+	u32 fid;
+	unsigned char fidopen;	  /* set when fid is opened */
+	unsigned char fidcreate;  /* set when fid was just created */
+	unsigned char fidclunked; /* set when fid has already been clunked */
+
+	struct v9fs_qid qid;
+	u32 iounit;
+
+	/* readdir stuff */
+	int rdir_fpos;
+	loff_t rdir_pos;
+	struct v9fs_fcall *rdir_fcall;
+
+	/* management stuff */
+	pid_t pid;		/* thread associated with this fid */
+	uid_t uid;		/* user associated with this fid */
+
+	/* private data */
+	struct file *filp;	/* backpointer to File struct for open files */
+	struct v9fs_session_info *v9ses;	/* session info for this FID */
+};
+
+struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type);
+void v9fs_fid_destroy(struct v9fs_fid *fid);
+struct v9fs_fid *v9fs_fid_create(struct dentry *);
-- 
cgit v1.2.3-70-g09d2


From 1346f51ede71fc1e5021062898d150e192dc4dc8 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:25 -0700
Subject: [PATCH] v9fs: Change error magic numbers to defined constants

Change magic error numbers to system defined constants in v9fs error.h As
suggested by Jan-Benedict Glaw.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/error.h | 158 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 77 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/error.h b/fs/9p/error.h
index 4bf2cf5aa1b..6dbb66f5b28 100644
--- a/fs/9p/error.h
+++ b/fs/9p/error.h
@@ -30,6 +30,7 @@
  */
 
 #include <linux/errno.h>
+#include <asm/errno.h>
 
 struct errormap {
 	char *name;
@@ -43,87 +44,82 @@ static struct hlist_head hash_errmap[ERRHASHSZ];
 
 /* FixMe - reduce to a reasonable size */
 static struct errormap errmap[] = {
-	{"Operation not permitted", 1},
-	{"wstat prohibited", 1},
-	{"No such file or directory", 2},
-	{"file not found", 2},
-	{"Interrupted system call", 4},
-	{"Input/output error", 5},
-	{"No such device or address", 6},
-	{"Argument list too long", 7},
-	{"Bad file descriptor", 9},
-	{"Resource temporarily unavailable", 11},
-	{"Cannot allocate memory", 12},
-	{"Permission denied", 13},
-	{"Bad address", 14},
-	{"Block device required", 15},
-	{"Device or resource busy", 16},
-	{"File exists", 17},
-	{"Invalid cross-device link", 18},
-	{"No such device", 19},
-	{"Not a directory", 20},
-	{"Is a directory", 21},
-	{"Invalid argument", 22},
-	{"Too many open files in system", 23},
-	{"Too many open files", 24},
-	{"Text file busy", 26},
-	{"File too large", 27},
-	{"No space left on device", 28},
-	{"Illegal seek", 29},
-	{"Read-only file system", 30},
-	{"Too many links", 31},
-	{"Broken pipe", 32},
-	{"Numerical argument out of domain", 33},
-	{"Numerical result out of range", 34},
-	{"Resource deadlock avoided", 35},
-	{"File name too long", 36},
-	{"No locks available", 37},
-	{"Function not implemented", 38},
-	{"Directory not empty", 39},
-	{"Too many levels of symbolic links", 40},
-	{"Unknown error 41", 41},
-	{"No message of desired type", 42},
-	{"Identifier removed", 43},
-	{"File locking deadlock error", 58},
-	{"No data available", 61},
-	{"Machine is not on the network", 64},
-	{"Package not installed", 65},
-	{"Object is remote", 66},
-	{"Link has been severed", 67},
-	{"Communication error on send", 70},
-	{"Protocol error", 71},
-	{"Bad message", 74},
-	{"File descriptor in bad state", 77},
-	{"Streams pipe error", 86},
-	{"Too many users", 87},
-	{"Socket operation on non-socket", 88},
-	{"Message too long", 90},
-	{"Protocol not available", 92},
-	{"Protocol not supported", 93},
-	{"Socket type not supported", 94},
-	{"Operation not supported", 95},
-	{"Protocol family not supported", 96},
-	{"Network is down", 100},
-	{"Network is unreachable", 101},
-	{"Network dropped connection on reset", 102},
-	{"Software caused connection abort", 103},
-	{"Connection reset by peer", 104},
-	{"No buffer space available", 105},
-	{"Transport endpoint is already connected", 106},
-	{"Transport endpoint is not connected", 107},
-	{"Cannot send after transport endpoint shutdown", 108},
-	{"Connection timed out", 110},
-	{"Connection refused", 111},
-	{"Host is down", 112},
-	{"No route to host", 113},
-	{"Operation already in progress", 114},
-	{"Operation now in progress", 115},
-	{"Is a named type file", 120},
-	{"Remote I/O error", 121},
-	{"Disk quota exceeded", 122},
-	{"Operation canceled", 125},
-	{"Unknown error 126", 126},
-	{"Unknown error 127", 127},
+	{"Operation not permitted", EPERM},
+	{"wstat prohibited", EPERM},
+	{"No such file or directory", ENOENT},
+	{"file not found", ENOENT},
+	{"Interrupted system call", EINTR},
+	{"Input/output error", EIO},
+	{"No such device or address", ENXIO},
+	{"Argument list too long", E2BIG},
+	{"Bad file descriptor", EBADF},
+	{"Resource temporarily unavailable", EAGAIN},
+	{"Cannot allocate memory", ENOMEM},
+	{"Permission denied", EACCES},
+	{"Bad address", EFAULT},
+	{"Block device required", ENOTBLK},
+	{"Device or resource busy", EBUSY},
+	{"File exists", EEXIST},
+	{"Invalid cross-device link", EXDEV},
+	{"No such device", ENODEV},
+	{"Not a directory", ENOTDIR},
+	{"Is a directory", EISDIR},
+	{"Invalid argument", EINVAL},
+	{"Too many open files in system", ENFILE},
+	{"Too many open files", EMFILE},
+	{"Text file busy", ETXTBSY},
+	{"File too large", EFBIG},
+	{"No space left on device", ENOSPC},
+	{"Illegal seek", ESPIPE},
+	{"Read-only file system", EROFS},
+	{"Too many links", EMLINK},
+	{"Broken pipe", EPIPE},
+	{"Numerical argument out of domain", EDOM},
+	{"Numerical result out of range", ERANGE},
+	{"Resource deadlock avoided", EDEADLK},
+	{"File name too long", ENAMETOOLONG},
+	{"No locks available", ENOLCK},
+	{"Function not implemented", ENOSYS},
+	{"Directory not empty", ENOTEMPTY},
+	{"Too many levels of symbolic links", ELOOP},
+	{"No message of desired type", ENOMSG},
+	{"Identifier removed", EIDRM},
+	{"No data available", ENODATA},
+	{"Machine is not on the network", ENONET},
+	{"Package not installed", ENOPKG},
+	{"Object is remote", EREMOTE},
+	{"Link has been severed", ENOLINK},
+	{"Communication error on send", ECOMM},
+	{"Protocol error", EPROTO},
+	{"Bad message", EBADMSG},
+	{"File descriptor in bad state", EBADFD},
+	{"Streams pipe error", ESTRPIPE},
+	{"Too many users", EUSERS},
+	{"Socket operation on non-socket", ENOTSOCK},
+	{"Message too long", EMSGSIZE},
+	{"Protocol not available", ENOPROTOOPT},
+	{"Protocol not supported", EPROTONOSUPPORT},
+	{"Socket type not supported", ESOCKTNOSUPPORT},
+	{"Operation not supported", EOPNOTSUPP},
+	{"Protocol family not supported", EPFNOSUPPORT},
+	{"Network is down", ENETDOWN},
+	{"Network is unreachable", ENETUNREACH},
+	{"Network dropped connection on reset", ENETRESET},
+	{"Software caused connection abort", ECONNABORTED},
+	{"Connection reset by peer", ECONNRESET},
+	{"No buffer space available", ENOBUFS},
+	{"Transport endpoint is already connected", EISCONN},
+	{"Transport endpoint is not connected", ENOTCONN},
+	{"Cannot send after transport endpoint shutdown", ESHUTDOWN},
+	{"Connection timed out", ETIMEDOUT},
+	{"Connection refused", ECONNREFUSED},
+	{"Host is down", EHOSTDOWN},
+	{"No route to host", EHOSTUNREACH},
+	{"Operation already in progress", EALREADY},
+	{"Operation now in progress", EINPROGRESS},
+	{"Is a named type file", EISNAM},
+	{"Remote I/O error", EREMOTEIO},
+	{"Disk quota exceeded", EDQUOT},
 /* errors from fossil, vacfs, and u9fs */
 	{"fid unknown or out of range", EBADF},
 	{"permission denied", EACCES},
-- 
cgit v1.2.3-70-g09d2


From 73c592b9b844cc353bbaea690fb4aa652ac168a6 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:26 -0700
Subject: [PATCH] v9fs: Clean-up vfs_inode and setattr functions

Cleanup code in v9fs vfs_inode as suggested by Alexey Dobriyan.  Did some
major revamping of the v9fs setattr code to remove unnecessary allocations and
clean up some dead-code.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/error.h     |   2 +-
 fs/9p/vfs_inode.c | 131 +++++++++++++++---------------------------------------
 2 files changed, 36 insertions(+), 97 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/error.h b/fs/9p/error.h
index 6dbb66f5b28..2eb5927d589 100644
--- a/fs/9p/error.h
+++ b/fs/9p/error.h
@@ -139,7 +139,7 @@ static struct errormap errmap[] = {
 	{"illegal mode", EINVAL},
 	{"illegal name", ENAMETOOLONG},
 	{"not a directory", ENOTDIR},
-	{"not a member of proposed group", EINVAL},
+	{"not a member of proposed group", EPERM},
 	{"not owner", EACCES},
 	{"only owner can change group in wstat", EACCES},
 	{"read only file system", EROFS},
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index ef78af7ef04..6d2357d1dac 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1,7 +1,7 @@
 /*
  *  linux/fs/9p/vfs_inode.c
  *
- * This file contians vfs inode ops for the 9P2000 protocol.
+ * This file contains vfs inode ops for the 9P2000 protocol.
  *
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
@@ -54,7 +54,7 @@ static struct inode_operations v9fs_symlink_inode_operations;
  *
  */
 
-static inline int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
+static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
 {
 	int res;
 	res = mode & 0777;
@@ -92,7 +92,7 @@ static inline int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
  *
  */
 
-static inline int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 {
 	int res;
 
@@ -132,7 +132,7 @@ static inline int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
  *
  */
 
-static inline void
+static void
 v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat)
 {
 	mistat->type = ~0;
@@ -160,7 +160,7 @@ v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat)
 /**
  * v9fs_mistat2unix - convert mistat to unix stat
  * @mistat: Plan 9 metadata (mistat) structure
- * @stat: unix metadata (stat) structure to populate
+ * @buf: unix metadata (stat) structure to populate
  * @sb: superblock
  *
  */
@@ -177,22 +177,11 @@ v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf,
 	buf->st_mtime = mistat->mtime;
 	buf->st_ctime = mistat->mtime;
 
-	if (v9ses && v9ses->extended) {
-		/* TODO: string to uid mapping via user-space daemon */
-		buf->st_uid = mistat->n_uid;
-		buf->st_gid = mistat->n_gid;
-
-		sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid);
-		sscanf(mistat->gid, "%x", (unsigned int *)&buf->st_gid);
-	} else {
-		buf->st_uid = v9ses->uid;
-		buf->st_gid = v9ses->gid;
-	}
-
 	buf->st_uid = (unsigned short)-1;
 	buf->st_gid = (unsigned short)-1;
 
 	if (v9ses && v9ses->extended) {
+		/* TODO: string to uid mapping via user-space daemon */
 		if (mistat->n_uid != -1)
 			sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid);
 
@@ -290,7 +279,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
  * @dir: directory inode file is being created in
  * @file_dentry: dentry file is being created in
  * @perm: permissions file is being created with
- * @open_mode: resulting open mode for file ???
+ * @open_mode: resulting open mode for file
  *
  */
 
@@ -434,9 +423,9 @@ v9fs_create(struct inode *dir,
 
 /**
  * v9fs_remove - helper function to remove files and directories
- * @inode: directory inode that is being deleted
- * @dentry:  dentry that is being deleted
- * @rmdir: where we are a file or a directory
+ * @dir: directory inode that is being deleted
+ * @file:  dentry that is being deleted
+ * @rmdir: removing a directory
  *
  */
 
@@ -502,7 +491,7 @@ v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
 
 /**
  * v9fs_vfs_mkdir - VFS mkdir hook to create a directory
- * @i:  inode that is being unlinked
+ * @inode:  inode that is being unlinked
  * @dentry: dentry that is being unlinked
  * @mode: mode for new directory
  *
@@ -624,7 +613,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 /**
  * v9fs_vfs_unlink - VFS unlink hook to delete an inode
  * @i:  inode that is being unlinked
- * @dentry: dentry that is being unlinked
+ * @d: dentry that is being unlinked
  *
  */
 
@@ -636,7 +625,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 /**
  * v9fs_vfs_rmdir - VFS unlink hook to delete a directory
  * @i:  inode that is being unlinked
- * @dentry: dentry that is being unlinked
+ * @d: dentry that is being unlinked
  *
  */
 
@@ -674,6 +663,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	dprintk(DEBUG_VFS, "\n");
 
+	if (!mistat)
+		return -ENOMEM;
+
 	if ((!oldfid) || (!olddirfid) || (!newdirfid)) {
 		dprintk(DEBUG_ERROR, "problem with arguments\n");
 		return -EBADF;
@@ -771,20 +763,21 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
 	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
-	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
 	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
 	int res = -EPERM;
 
 	dprintk(DEBUG_VFS, "\n");
+
+	if (!mistat)
+		return -ENOMEM;
+
 	if (!fid) {
 		dprintk(DEBUG_ERROR,
 			"Couldn't find fid associated with dentry\n");
 		return -EBADF;
 	}
 
-	if (!mistat)
-		return -ENOMEM;
-
 	v9fs_blank_mistat(v9ses, mistat);
 	if (iattr->ia_valid & ATTR_MODE)
 		mistat->mode = unixmode2p9mode(v9ses, iattr->ia_mode);
@@ -799,72 +792,19 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		mistat->length = iattr->ia_size;
 
 	if (v9ses->extended) {
-		char *uid = kmalloc(strlen(mistat->uid), GFP_KERNEL);
-		char *gid = kmalloc(strlen(mistat->gid), GFP_KERNEL);
-		char *muid = kmalloc(strlen(mistat->muid), GFP_KERNEL);
-		char *name = kmalloc(strlen(mistat->name), GFP_KERNEL);
-		char *extension = kmalloc(strlen(mistat->extension),
-					  GFP_KERNEL);
-
-		if ((!uid) || (!gid) || (!muid) || (!name) || (!extension)) {
-			kfree(uid);
-			kfree(gid);
-			kfree(muid);
-			kfree(name);
-			kfree(extension);
-
-			return -ENOMEM;
-		}
-
-		strcpy(uid, mistat->uid);
-		strcpy(gid, mistat->gid);
-		strcpy(muid, mistat->muid);
-		strcpy(name, mistat->name);
-		strcpy(extension, mistat->extension);
+		char *ptr = mistat->data+1;
 
 		if (iattr->ia_valid & ATTR_UID) {
-			if (strlen(uid) != 8) {
-				dprintk(DEBUG_ERROR, "uid strlen is %u not 8\n",
-					(unsigned int)strlen(uid));
-				sprintf(uid, "%08x", iattr->ia_uid);
-			} else {
-				kfree(uid);
-				uid = kmalloc(9, GFP_KERNEL);
-			}
-
-			sprintf(uid, "%08x", iattr->ia_uid);
+			mistat->uid = ptr;
+			ptr += 1+sprintf(ptr, "%08x", iattr->ia_uid);
 			mistat->n_uid = iattr->ia_uid;
 		}
 
 		if (iattr->ia_valid & ATTR_GID) {
-			if (strlen(gid) != 8)
-				dprintk(DEBUG_ERROR, "gid strlen is %u not 8\n",
-					(unsigned int)strlen(gid));
-			else {
-				kfree(gid);
-				gid = kmalloc(9, GFP_KERNEL);
-			}
-
-			sprintf(gid, "%08x", iattr->ia_gid);
+			mistat->gid = ptr;
+			ptr += 1+sprintf(ptr, "%08x", iattr->ia_gid);
 			mistat->n_gid = iattr->ia_gid;
 		}
-
-		mistat->uid = mistat->data;
-		strcpy(mistat->uid, uid);
-		mistat->gid = mistat->data + strlen(uid) + 1;
-		strcpy(mistat->gid, gid);
-		mistat->muid = mistat->gid + strlen(gid) + 1;
-		strcpy(mistat->muid, muid);
-		mistat->name = mistat->muid + strlen(muid) + 1;
-		strcpy(mistat->name, name);
-		mistat->extension = mistat->name + strlen(name) + 1;
-		strcpy(mistat->extension, extension);
-
-		kfree(uid);
-		kfree(gid);
-		kfree(muid);
-		kfree(name);
-		kfree(extension);
 	}
 
 	res = v9fs_t_wstat(v9ses, fid->fid, mistat, &fcall);
@@ -985,17 +925,14 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	int retval = -EPERM;
 	struct v9fs_fid *newfid;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
-	struct super_block *sb = dir ? dir->i_sb : NULL;
 	struct v9fs_fcall *fcall = NULL;
 	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
 
 	dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
 		symname);
 
-	if ((!dentry) || (!sb) || (!v9ses)) {
-		dprintk(DEBUG_ERROR, "problem with arguments\n");
-		return -EBADF;
-	}
+	if (!mistat)
+		return -ENOMEM;
 
 	if (!v9ses->extended) {
 		dprintk(DEBUG_ERROR, "not extended\n");
@@ -1040,7 +977,7 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 /**
  * v9fs_readlink - read a symlink's location (internal version)
  * @dentry: dentry for symlink
- * @buf: buffer to load symlink location into
+ * @buffer: buffer to load symlink location into
  * @buflen: length of buffer
  *
  */
@@ -1179,7 +1116,7 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void
  * v9fs_vfs_link - create a hardlink
  * @old_dentry: dentry for file to link to
  * @dir: inode destination for new link
- * @new_dentry: dentry for link
+ * @dentry: dentry for link
  *
  */
 
@@ -1274,6 +1211,9 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
 		dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
 
+	if (!mistat)
+		return -ENOMEM;
+
 	if (!new_valid_dev(rdev)) {
 		retval = -EINVAL;
 		goto FreeMem;
@@ -1302,7 +1242,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		sprintf(symname, "b %u %u", MAJOR(rdev), MINOR(rdev));
 	else if (S_ISCHR(mode))
 		sprintf(symname, "c %u %u", MAJOR(rdev), MINOR(rdev));
-	else if (S_ISFIFO(mode)) ;	/* DO NOTHING */
+	else if (S_ISFIFO(mode))
+		;	/* DO NOTHING */
 	else {
 		retval = -EINVAL;
 		goto FreeMem;
@@ -1319,8 +1260,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 				FCALL_ERROR(fcall));
 			goto FreeMem;
 		}
-
-		kfree(fcall);
 	}
 
 	/* need to update dcache so we show up */
-- 
cgit v1.2.3-70-g09d2


From 5d58bec5b7a8b8303df0a4dcb9a18feeefac6091 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:27 -0700
Subject: [PATCH] v9fs: Fix support for special files (devices, named pipes,
 etc.)

Fix v9fs special files (block, char devices) support.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 6d2357d1dac..36fff087167 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -250,6 +250,9 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		case S_IFBLK:
 		case S_IFCHR:
 		case S_IFSOCK:
+			init_special_inode(inode, inode->i_mode,
+					   inode->i_rdev);
+			break;
 		case S_IFREG:
 			inode->i_op = &v9fs_file_inode_operations;
 			inode->i_fop = &v9fs_file_operations;
-- 
cgit v1.2.3-70-g09d2


From b501611a6f78558eafcf09b228abd866d4ea5d9f Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:27 -0700
Subject: [PATCH] v9fs: readlink extended mode check

LANL reported some issues with random crashes during mount of legacy protocol
servers (9P2000 versus 9P2000.u) -- crash was always happening in readlink
(which should never happen in legacy mode).  Added some sanity conditionals to
the get_inode code which should prevent the errors LANL was seeing.  Code
tested benign through regression.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_inode.c | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 36fff087167..0c13fc60004 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -44,6 +44,7 @@
 #include "fid.h"
 
 static struct inode_operations v9fs_dir_inode_operations;
+static struct inode_operations v9fs_dir_inode_operations_ext;
 static struct inode_operations v9fs_file_inode_operations;
 static struct inode_operations v9fs_symlink_inode_operations;
 
@@ -232,6 +233,7 @@ v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf,
 struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
 	struct inode *inode = NULL;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
 
@@ -250,6 +252,10 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		case S_IFBLK:
 		case S_IFCHR:
 		case S_IFSOCK:
+			if(!v9ses->extended) {
+				dprintk(DEBUG_ERROR, "special files without extended mode\n");
+				return ERR_PTR(-EINVAL);
+			}
 			init_special_inode(inode, inode->i_mode,
 					   inode->i_rdev);
 			break;
@@ -257,14 +263,21 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 			inode->i_op = &v9fs_file_inode_operations;
 			inode->i_fop = &v9fs_file_operations;
 			break;
+		case S_IFLNK:
+			if(!v9ses->extended) {
+				dprintk(DEBUG_ERROR, "extended modes used w/o 9P2000.u\n");
+				return ERR_PTR(-EINVAL);
+			}
+			inode->i_op = &v9fs_symlink_inode_operations;
+			break;
 		case S_IFDIR:
 			inode->i_nlink++;
-			inode->i_op = &v9fs_dir_inode_operations;
+			if(v9ses->extended)
+				inode->i_op = &v9fs_dir_inode_operations_ext;
+			else
+				inode->i_op = &v9fs_dir_inode_operations;
 			inode->i_fop = &v9fs_dir_operations;
 			break;
-		case S_IFLNK:
-			inode->i_op = &v9fs_symlink_inode_operations;
-			break;
 		default:
 			dprintk(DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
 				mode, mode & S_IFMT);
@@ -1284,7 +1297,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
-static struct inode_operations v9fs_dir_inode_operations = {
+static struct inode_operations v9fs_dir_inode_operations_ext = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
 	.symlink = v9fs_vfs_symlink,
@@ -1299,6 +1312,18 @@ static struct inode_operations v9fs_dir_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
+static struct inode_operations v9fs_dir_inode_operations = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
 static struct inode_operations v9fs_file_inode_operations = {
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
-- 
cgit v1.2.3-70-g09d2


From cb2e87a65d6cd735eb06fa595bf90497af28c37b Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Fri, 9 Sep 2005 13:04:28 -0700
Subject: [PATCH] v9fs: fix handling of malformed 9P messages

This patch attempts to do a better job of cleaning up after detecting errors
on the transport.  This should also improve error reporting on broken
connections to servers.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/error.h      |  1 +
 fs/9p/mux.c        | 53 ++++++++++++++++++++++++++++++++++-------------------
 fs/9p/mux.h        |  1 +
 fs/9p/trans_sock.c | 12 ++++++++++--
 4 files changed, 46 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/error.h b/fs/9p/error.h
index 2eb5927d589..78f89acf7c9 100644
--- a/fs/9p/error.h
+++ b/fs/9p/error.h
@@ -47,6 +47,7 @@ static struct errormap errmap[] = {
 	{"Operation not permitted", EPERM},
 	{"wstat prohibited", EPERM},
 	{"No such file or directory", ENOENT},
+	{"directory entry not found", ENOENT},
 	{"file not found", ENOENT},
 	{"Interrupted system call", EINTR},
 	{"Input/output error", EIO},
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 0854bef58c1..8835b576f74 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -162,18 +162,21 @@ static int v9fs_recv(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
 	dprintk(DEBUG_MUX, "waiting for response: %d\n", req->tcall->tag);
 	ret = wait_event_interruptible(v9ses->read_wait,
 		       ((v9ses->transport->status != Connected) ||
-			(req->rcall != 0) || dprintcond(v9ses, req)));
+			(req->rcall != 0) || (req->err < 0) ||
+			dprintcond(v9ses, req)));
 
 	dprintk(DEBUG_MUX, "got it: rcall %p\n", req->rcall);
+
+	spin_lock(&v9ses->muxlock);
+	list_del(&req->next);
+	spin_unlock(&v9ses->muxlock);
+
+	if (req->err < 0)
+		return req->err;
+
 	if (v9ses->transport->status == Disconnected)
 		return -ECONNRESET;
 
-	if (ret == 0) {
-		spin_lock(&v9ses->muxlock);
-		list_del(&req->next);
-		spin_unlock(&v9ses->muxlock);
-	}
-
 	return ret;
 }
 
@@ -245,6 +248,9 @@ v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall,
 	if (!v9ses)
 		return -EINVAL;
 
+	if (!v9ses->transport || v9ses->transport->status != Connected)
+		return -EIO;
+
 	if (rcall)
 		*rcall = NULL;
 
@@ -257,6 +263,7 @@ v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall,
 	tcall->tag = tid;
 
 	req.tcall = tcall;
+	req.err = 0;
 	req.rcall = NULL;
 
 	ret = v9fs_send(v9ses, &req);
@@ -371,16 +378,21 @@ static int v9fs_recvproc(void *data)
 		}
 
 		err = read_message(v9ses, rcall, v9ses->maxdata + V9FS_IOHDRSZ);
-		if (err < 0) {
-			kfree(rcall);
-			break;
-		}
 		spin_lock(&v9ses->muxlock);
-		list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
-			if (rreq->tcall->tag == rcall->tag) {
-				req = rreq;
-				req->rcall = rcall;
-				break;
+		if (err < 0) {
+			list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+				rreq->err = err;
+			}
+			if(err != -ERESTARTSYS)
+				eprintk(KERN_ERR,
+					"Transport error while reading message %d\n", err);
+		} else {
+			list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+				if (rreq->tcall->tag == rcall->tag) {
+					req = rreq;
+					req->rcall = rcall;
+					break;
+				}
 			}
 		}
 
@@ -399,9 +411,10 @@ static int v9fs_recvproc(void *data)
 		spin_unlock(&v9ses->muxlock);
 
 		if (!req) {
-			dprintk(DEBUG_ERROR,
-				"unexpected response: id %d tag %d\n",
-				rcall->id, rcall->tag);
+			if (err >= 0)
+				dprintk(DEBUG_ERROR,
+					"unexpected response: id %d tag %d\n",
+					rcall->id, rcall->tag);
 
 			kfree(rcall);
 		}
@@ -410,6 +423,8 @@ static int v9fs_recvproc(void *data)
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
 
+	v9ses->transport->close(v9ses->transport);
+
 	/* Inform all pending processes about the failure */
 	wake_up_all(&v9ses->read_wait);
 
diff --git a/fs/9p/mux.h b/fs/9p/mux.h
index 82ce793af1b..4994cb10bad 100644
--- a/fs/9p/mux.h
+++ b/fs/9p/mux.h
@@ -28,6 +28,7 @@
 struct v9fs_rpcreq {
 	struct v9fs_fcall *tcall;
 	struct v9fs_fcall *rcall;
+	int err;	/* error code if response failed */
 
 	/* XXX - could we put scatter/gather buffers here? */
 
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
index 081d1c84780..01e26f0013a 100644
--- a/fs/9p/trans_sock.c
+++ b/fs/9p/trans_sock.c
@@ -254,7 +254,12 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name,
 
 static void v9fs_sock_close(struct v9fs_transport *trans)
 {
-	struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
+	struct v9fs_trans_sock *ts;
+
+	if (!trans)
+		return;
+
+	ts = trans->priv;
 
 	if ((ts) && (ts->s)) {
 		dprintk(DEBUG_TRANS, "closing the socket %p\n", ts->s);
@@ -264,7 +269,10 @@ static void v9fs_sock_close(struct v9fs_transport *trans)
 		dprintk(DEBUG_TRANS, "socket closed\n");
 	}
 
-	kfree(ts);
+	if (ts)
+		kfree(ts);
+
+	trans->priv = NULL;
 }
 
 struct v9fs_transport v9fs_trans_tcp = {
-- 
cgit v1.2.3-70-g09d2


From 04578f174f43d29b569500f01ba772afa4016330 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:22 -0700
Subject: [PATCH] FUSE - MAINTAINERS, Kconfig and Makefile changes

This patch adds FUSE filesystem to MAINTAINERS, fs/Kconfig and
fs/Makefile.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 MAINTAINERS |  7 +++++++
 fs/Kconfig  | 13 +++++++++++++
 fs/Makefile |  1 +
 3 files changed, 21 insertions(+)

(limited to 'fs')

diff --git a/MAINTAINERS b/MAINTAINERS
index 9c9a86d0685..f038dca34ee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -931,6 +931,13 @@ L:	linux-tape@vger.kernel.org
 W:	http://sourceforge.net/projects/ftape
 S:	Orphan
 
+FUSE: FILESYSTEM IN USERSPACE
+P:	Miklos Szeredi
+M:	miklos@szeredi.hu
+L:	fuse-devel@lists.sourceforge.net
+W:	http://fuse.sourceforge.net/
+S:	Maintained
+
 FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit)
 P:	Rik Faith
 M:	faith@cs.unc.edu
diff --git a/fs/Kconfig b/fs/Kconfig
index 443aed4e206..068ccea2f18 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -462,6 +462,19 @@ config AUTOFS4_FS
 	  local network, you probably do not need an automounter, and can say
 	  N here.
 
+config FUSE_FS
+	tristate "Filesystem in Userspace support"
+	help
+	  With FUSE it is possible to implement a fully functional filesystem
+	  in a userspace program.
+
+	  There's also companion library: libfuse.  This library along with
+	  utilities is available from the FUSE homepage:
+	  <http://fuse.sourceforge.net/>
+
+	  If you want to develop a userspace FS, or if you want to use
+	  a filesystem based on FUSE, answer Y or M.
+
 menu "CD-ROM/DVD Filesystems"
 
 config ISO9660_FS
diff --git a/fs/Makefile b/fs/Makefile
index d646502c1ef..1972da18627 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_QNX4FS_FS)		+= qnx4/
 obj-$(CONFIG_AUTOFS_FS)		+= autofs/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
+obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_UDF_FS)		+= udf/
 obj-$(CONFIG_RELAYFS_FS)	+= relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
-- 
cgit v1.2.3-70-g09d2


From d8a5ba45457e4a22aa39c939121efd7bb6c76672 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:26 -0700
Subject: [PATCH] FUSE - core

This patch adds FUSE core.

This contains the following files:

 o inode.c
    - superblock operations (alloc_inode, destroy_inode, read_inode,
      clear_inode, put_super, show_options)
    - registers FUSE filesystem

 o fuse_i.h
    - private header file

Requirements
============

 The most important difference between orinary filesystems and FUSE is
 the fact, that the filesystem data/metadata is provided by a userspace
 process run with the privileges of the mount "owner" instead of the
 kernel, or some remote entity usually running with elevated
 privileges.

 The security implication of this is that a non-privileged user must
 not be able to use this capability to compromise the system.  Obvious
 requirements arising from this are:

  - mount owner should not be able to get elevated privileges with the
    help of the mounted filesystem

  - mount owner should not be able to induce undesired behavior in
    other users' or the super user's processes

  - mount owner should not get illegitimate access to information from
    other users' and the super user's processes

 These are currently ensured with the following constraints:

  1) mount is only allowed to directory or file which the mount owner
    can modify without limitation (write access + no sticky bit for
    directories)

  2) nosuid,nodev mount options are forced

  3) any process running with fsuid different from the owner is denied
     all access to the filesystem

 1) and 2) are ensured by the "fusermount" mount utility which is a
    setuid root application doing the actual mount operation.

 3) is ensured by a check in the permission() method in kernel

 I started thinking about doing 3) in a different way because Christoph
 H. made a big deal out of it, saying that FUSE is unacceptable into
 mainline in this form.

 The suggested use of private namespaces would be OK, but in their
 current form have many limitations that make their use impractical (as
 discussed in this thread).

 Suggested improvements that would address these limitations:

   - implement shared subtrees

   - allow a process to join an existing namespace (make namespaces
     first-class objects)

   - implement the namespace creation/joining in a PAM module

 With all that in place the check of owner against current->fsuid may
 be removed from the FUSE kernel module, without compromising the
 security requirements.

 Suid programs still interesting questions, since they get access even
 to the private namespace causing some information leak (exact
 order/timing of filesystem operations performed), giving some
 ptrace-like capabilities to unprivileged users.  BTW this problem is
 not strictly limited to the namespace approach, since suid programs
 setting fsuid and accessing users' files will succeed with the current
 approach too.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/Makefile     |   7 +
 fs/fuse/fuse_i.h     |  89 +++++++++++
 fs/fuse/inode.c      | 428 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fuse.h |  38 +++++
 4 files changed, 562 insertions(+)
 create mode 100644 fs/fuse/Makefile
 create mode 100644 fs/fuse/fuse_i.h
 create mode 100644 fs/fuse/inode.c
 create mode 100644 include/linux/fuse.h

(limited to 'fs')

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
new file mode 100644
index 00000000000..9c3e4cc7b1a
--- /dev/null
+++ b/fs/fuse/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the FUSE filesystem.
+#
+
+obj-$(CONFIG_FUSE_FS) += fuse.o
+
+fuse-objs := inode.o
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
new file mode 100644
index 00000000000..eed6e89ce01
--- /dev/null
+++ b/fs/fuse/fuse_i.h
@@ -0,0 +1,89 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include <linux/fuse.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <asm/semaphore.h>
+
+/** FUSE inode */
+struct fuse_inode {
+	/** Inode data */
+	struct inode inode;
+
+	/** Unique ID, which identifies the inode between userspace
+	 * and kernel */
+	u64 nodeid;
+
+	/** Time in jiffies until the file attributes are valid */
+	unsigned long i_time;
+};
+
+/**
+ * A Fuse connection.
+ *
+ * This structure is created, when the filesystem is mounted, and is
+ * destroyed, when the client device is closed and the filesystem is
+ * unmounted.
+ */
+struct fuse_conn {
+	/** The superblock of the mounted filesystem */
+	struct super_block *sb;
+
+	/** The user id for this mount */
+	uid_t user_id;
+
+	/** Backing dev info */
+	struct backing_dev_info bdi;
+};
+
+static inline struct fuse_conn **get_fuse_conn_super_p(struct super_block *sb)
+{
+	return (struct fuse_conn **) &sb->s_fs_info;
+}
+
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+	return *get_fuse_conn_super_p(sb);
+}
+
+static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
+{
+	return get_fuse_conn_super(inode->i_sb);
+}
+
+static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
+{
+	return container_of(inode, struct fuse_inode, inode);
+}
+
+static inline u64 get_node_id(struct inode *inode)
+{
+	return get_fuse_inode(inode)->nodeid;
+}
+
+/**
+ * This is the single global spinlock which protects FUSE's structures
+ *
+ * The following data is protected by this lock:
+ *
+ *  - the s_fs_info field of the super block
+ *  - the sb (super_block) field in fuse_conn
+ */
+extern spinlock_t fuse_lock;
+
+/**
+ * Check if the connection can be released, and if yes, then free the
+ * connection structure
+ */
+void fuse_release_conn(struct fuse_conn *fc);
+
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
new file mode 100644
index 00000000000..ea6339c2b6a
--- /dev/null
+++ b/fs/fuse/inode.c
@@ -0,0 +1,428 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/parser.h>
+#include <linux/statfs.h>
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Filesystem in Userspace");
+MODULE_LICENSE("GPL");
+
+spinlock_t fuse_lock;
+static kmem_cache_t *fuse_inode_cachep;
+static int mount_count;
+
+static int mount_max = 1000;
+module_param(mount_max, int, 0644);
+MODULE_PARM_DESC(mount_max, "Maximum number of FUSE mounts allowed, if -1 then unlimited (default: 1000)");
+
+#define FUSE_SUPER_MAGIC 0x65735546
+
+struct fuse_mount_data {
+	int fd;
+	unsigned rootmode;
+	unsigned user_id;
+};
+
+static struct inode *fuse_alloc_inode(struct super_block *sb)
+{
+	struct inode *inode;
+	struct fuse_inode *fi;
+
+	inode = kmem_cache_alloc(fuse_inode_cachep, SLAB_KERNEL);
+	if (!inode)
+		return NULL;
+
+	fi = get_fuse_inode(inode);
+	fi->i_time = jiffies - 1;
+	fi->nodeid = 0;
+
+	return inode;
+}
+
+static void fuse_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(fuse_inode_cachep, inode);
+}
+
+static void fuse_read_inode(struct inode *inode)
+{
+	/* No op */
+}
+
+static void fuse_clear_inode(struct inode *inode)
+{
+}
+
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
+{
+	if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
+		invalidate_inode_pages(inode->i_mapping);
+
+	inode->i_ino     = attr->ino;
+	inode->i_mode    = (inode->i_mode & S_IFMT) + (attr->mode & 07777);
+	inode->i_nlink   = attr->nlink;
+	inode->i_uid     = attr->uid;
+	inode->i_gid     = attr->gid;
+	i_size_write(inode, attr->size);
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks  = attr->blocks;
+	inode->i_atime.tv_sec   = attr->atime;
+	inode->i_atime.tv_nsec  = attr->atimensec;
+	inode->i_mtime.tv_sec   = attr->mtime;
+	inode->i_mtime.tv_nsec  = attr->mtimensec;
+	inode->i_ctime.tv_sec   = attr->ctime;
+	inode->i_ctime.tv_nsec  = attr->ctimensec;
+}
+
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+{
+	inode->i_mode = attr->mode & S_IFMT;
+	i_size_write(inode, attr->size);
+}
+
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+{
+	unsigned long nodeid = *(unsigned long *) _nodeidp;
+	if (get_node_id(inode) == nodeid)
+		return 1;
+	else
+		return 0;
+}
+
+static int fuse_inode_set(struct inode *inode, void *_nodeidp)
+{
+	unsigned long nodeid = *(unsigned long *) _nodeidp;
+	get_fuse_inode(inode)->nodeid = nodeid;
+	return 0;
+}
+
+struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
+			int generation, struct fuse_attr *attr, int version)
+{
+	struct inode *inode;
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	int retried = 0;
+
+ retry:
+	inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
+	if (!inode)
+		return NULL;
+
+	if ((inode->i_state & I_NEW)) {
+		inode->i_generation = generation;
+		inode->i_data.backing_dev_info = &fc->bdi;
+		fuse_init_inode(inode, attr);
+		unlock_new_inode(inode);
+	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
+		BUG_ON(retried);
+		/* Inode has changed type, any I/O on the old should fail */
+		make_bad_inode(inode);
+		iput(inode);
+		retried = 1;
+		goto retry;
+	}
+
+	fuse_change_attributes(inode, attr);
+	inode->i_version = version;
+	return inode;
+}
+
+static void fuse_put_super(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	spin_lock(&fuse_lock);
+	mount_count --;
+	fc->sb = NULL;
+	fc->user_id = 0;
+	fuse_release_conn(fc);
+	*get_fuse_conn_super_p(sb) = NULL;
+	spin_unlock(&fuse_lock);
+}
+
+enum {
+	OPT_FD,
+	OPT_ROOTMODE,
+	OPT_USER_ID,
+	OPT_DEFAULT_PERMISSIONS,
+	OPT_ALLOW_OTHER,
+	OPT_ALLOW_ROOT,
+	OPT_KERNEL_CACHE,
+	OPT_ERR
+};
+
+static match_table_t tokens = {
+	{OPT_FD,			"fd=%u"},
+	{OPT_ROOTMODE,			"rootmode=%o"},
+	{OPT_USER_ID,			"user_id=%u"},
+	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
+	{OPT_ALLOW_OTHER,		"allow_other"},
+	{OPT_ALLOW_ROOT,		"allow_root"},
+	{OPT_KERNEL_CACHE,		"kernel_cache"},
+	{OPT_ERR,			NULL}
+};
+
+static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
+{
+	char *p;
+	memset(d, 0, sizeof(struct fuse_mount_data));
+	d->fd = -1;
+
+	while ((p = strsep(&opt, ",")) != NULL) {
+		int token;
+		int value;
+		substring_t args[MAX_OPT_ARGS];
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case OPT_FD:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->fd = value;
+			break;
+
+		case OPT_ROOTMODE:
+			if (match_octal(&args[0], &value))
+				return 0;
+			d->rootmode = value;
+			break;
+
+		case OPT_USER_ID:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->user_id = value;
+			break;
+
+		default:
+			return 0;
+		}
+	}
+	if (d->fd == -1)
+		return 0;
+
+	return 1;
+}
+
+static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb);
+
+	seq_printf(m, ",user_id=%u", fc->user_id);
+	return 0;
+}
+
+void fuse_release_conn(struct fuse_conn *fc)
+{
+	kfree(fc);
+}
+
+static struct fuse_conn *new_conn(void)
+{
+	struct fuse_conn *fc;
+
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	if (fc != NULL) {
+		memset(fc, 0, sizeof(*fc));
+		fc->sb = NULL;
+		fc->user_id = 0;
+		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+		fc->bdi.unplug_io_fn = default_unplug_io_fn;
+	}
+	return fc;
+}
+
+static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
+{
+	struct fuse_conn *fc;
+
+	fc = new_conn();
+	if (fc == NULL)
+		return NULL;
+	spin_lock(&fuse_lock);
+	fc->sb = sb;
+	spin_unlock(&fuse_lock);
+	return fc;
+}
+
+static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
+{
+	struct fuse_attr attr;
+	memset(&attr, 0, sizeof(attr));
+
+	attr.mode = mode;
+	attr.ino = FUSE_ROOT_ID;
+	return fuse_iget(sb, 1, 0, &attr, 0);
+}
+
+static struct super_operations fuse_super_operations = {
+	.alloc_inode    = fuse_alloc_inode,
+	.destroy_inode  = fuse_destroy_inode,
+	.read_inode	= fuse_read_inode,
+	.clear_inode	= fuse_clear_inode,
+	.put_super	= fuse_put_super,
+	.show_options	= fuse_show_options,
+};
+
+static int inc_mount_count(void)
+{
+	int success = 0;
+	spin_lock(&fuse_lock);
+	mount_count ++;
+	if (mount_max == -1 || mount_count <= mount_max)
+		success = 1;
+	spin_unlock(&fuse_lock);
+	return success;
+}
+
+static int fuse_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct fuse_conn *fc;
+	struct inode *root;
+	struct fuse_mount_data d;
+	struct file *file;
+	int err;
+
+	if (!parse_fuse_opt((char *) data, &d))
+		return -EINVAL;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = FUSE_SUPER_MAGIC;
+	sb->s_op = &fuse_super_operations;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	file = fget(d.fd);
+	if (!file)
+		return -EINVAL;
+
+	fc = get_conn(file, sb);
+	fput(file);
+	if (fc == NULL)
+		return -EINVAL;
+
+	fc->user_id = d.user_id;
+
+	*get_fuse_conn_super_p(sb) = fc;
+
+	err = -ENFILE;
+	if (!inc_mount_count() && current->uid != 0)
+		goto err;
+
+	err = -ENOMEM;
+	root = get_root_inode(sb, d.rootmode);
+	if (root == NULL)
+		goto err;
+
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		iput(root);
+		goto err;
+	}
+	return 0;
+
+ err:
+	spin_lock(&fuse_lock);
+	mount_count --;
+	fc->sb = NULL;
+	fuse_release_conn(fc);
+	spin_unlock(&fuse_lock);
+	*get_fuse_conn_super_p(sb) = NULL;
+	return err;
+}
+
+static struct super_block *fuse_get_sb(struct file_system_type *fs_type,
+				       int flags, const char *dev_name,
+				       void *raw_data)
+{
+	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super);
+}
+
+static struct file_system_type fuse_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fuse",
+	.get_sb		= fuse_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
+				 unsigned long flags)
+{
+	struct inode * inode = foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(inode);
+}
+
+static int __init fuse_fs_init(void)
+{
+	int err;
+
+	err = register_filesystem(&fuse_fs_type);
+	if (err)
+		printk("fuse: failed to register filesystem\n");
+	else {
+		fuse_inode_cachep = kmem_cache_create("fuse_inode",
+						      sizeof(struct fuse_inode),
+						      0, SLAB_HWCACHE_ALIGN,
+						      fuse_inode_init_once, NULL);
+		if (!fuse_inode_cachep) {
+			unregister_filesystem(&fuse_fs_type);
+			err = -ENOMEM;
+		}
+	}
+
+	return err;
+}
+
+static void fuse_fs_cleanup(void)
+{
+	unregister_filesystem(&fuse_fs_type);
+	kmem_cache_destroy(fuse_inode_cachep);
+}
+
+static int __init fuse_init(void)
+{
+	int res;
+
+	printk("fuse init (API version %i.%i)\n",
+	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
+
+	spin_lock_init(&fuse_lock);
+	res = fuse_fs_init();
+	if (res)
+		goto err;
+
+	return 0;
+
+ err:
+	return res;
+}
+
+static void __exit fuse_exit(void)
+{
+	printk(KERN_DEBUG "fuse exit\n");
+
+	fuse_fs_cleanup();
+}
+
+module_init(fuse_init);
+module_exit(fuse_exit);
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
new file mode 100644
index 00000000000..2b1f4ae01e9
--- /dev/null
+++ b/include/linux/fuse.h
@@ -0,0 +1,38 @@
+/*
+    FUSE: Filesystem in Userspace
+    Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+    This program can be distributed under the terms of the GNU GPL.
+    See the file COPYING.
+*/
+
+/* This file defines the kernel interface of FUSE */
+
+#include <asm/types.h>
+
+/** Version number of this interface */
+#define FUSE_KERNEL_VERSION 5
+
+/** Minor version number of this interface */
+#define FUSE_KERNEL_MINOR_VERSION 1
+
+/** The node ID of the root inode */
+#define FUSE_ROOT_ID 1
+
+struct fuse_attr {
+	__u64	ino;
+	__u64	size;
+	__u64	blocks;
+	__u64	atime;
+	__u64	mtime;
+	__u64	ctime;
+	__u32	atimensec;
+	__u32	mtimensec;
+	__u32	ctimensec;
+	__u32	mode;
+	__u32	nlink;
+	__u32	uid;
+	__u32	gid;
+	__u32	rdev;
+};
+
-- 
cgit v1.2.3-70-g09d2


From 334f485df85ac7736ebe14940bf0a059c5f26d7d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:27 -0700
Subject: [PATCH] FUSE - device functions

This adds the FUSE device handling functions.

This contains the following files:

 o dev.c
    - fuse device operations (read, write, release, poll)
    - registers misc device
    - support for sending requests to userspace

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/fuse.txt | 341 ++++++++++++++
 fs/fuse/Makefile                   |   2 +-
 fs/fuse/dev.c                      | 884 +++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h                   | 223 ++++++++++
 fs/fuse/inode.c                    |  58 ++-
 include/linux/fuse.h               |  36 +-
 6 files changed, 1537 insertions(+), 7 deletions(-)
 create mode 100644 Documentation/filesystems/fuse.txt
 create mode 100644 fs/fuse/dev.c

(limited to 'fs')

diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
new file mode 100644
index 00000000000..83f96cf5696
--- /dev/null
+++ b/Documentation/filesystems/fuse.txt
@@ -0,0 +1,341 @@
+Definitions
+~~~~~~~~~~~
+
+Userspace filesystem:
+
+  A filesystem in which data and metadata are provided by an ordinary
+  userspace process.  The filesystem can be accessed normally through
+  the kernel interface.
+
+Filesystem daemon:
+
+  The process(es) providing the data and metadata of the filesystem.
+
+Non-privileged mount (or user mount):
+
+  A userspace filesystem mounted by a non-privileged (non-root) user.
+  The filesystem daemon is running with the privileges of the mounting
+  user.  NOTE: this is not the same as mounts allowed with the "user"
+  option in /etc/fstab, which is not discussed here.
+
+Mount owner:
+
+  The user who does the mounting.
+
+User:
+
+  The user who is performing filesystem operations.
+
+What is FUSE?
+~~~~~~~~~~~~~
+
+FUSE is a userspace filesystem framework.  It consists of a kernel
+module (fuse.ko), a userspace library (libfuse.*) and a mount utility
+(fusermount).
+
+One of the most important features of FUSE is allowing secure,
+non-privileged mounts.  This opens up new possibilities for the use of
+filesystems.  A good example is sshfs: a secure network filesystem
+using the sftp protocol.
+
+The userspace library and utilities are available from the FUSE
+homepage:
+
+  http://fuse.sourceforge.net/
+
+Mount options
+~~~~~~~~~~~~~
+
+'fd=N'
+
+  The file descriptor to use for communication between the userspace
+  filesystem and the kernel.  The file descriptor must have been
+  obtained by opening the FUSE device ('/dev/fuse').
+
+'rootmode=M'
+
+  The file mode of the filesystem's root in octal representation.
+
+'user_id=N'
+
+  The numeric user id of the mount owner.
+
+'group_id=N'
+
+  The numeric group id of the mount owner.
+
+'default_permissions'
+
+  By default FUSE doesn't check file access permissions, the
+  filesystem is free to implement it's access policy or leave it to
+  the underlying file access mechanism (e.g. in case of network
+  filesystems).  This option enables permission checking, restricting
+  access based on file mode.  This is option is usually useful
+  together with the 'allow_other' mount option.
+
+'allow_other'
+
+  This option overrides the security measure restricting file access
+  to the user mounting the filesystem.  This option is by default only
+  allowed to root, but this restriction can be removed with a
+  (userspace) configuration option.
+
+'kernel_cache'
+
+  This option disables flushing the cache of the file contents on
+  every open().  This should only be enabled on filesystems, where the
+  file data is never changed externally (not through the mounted FUSE
+  filesystem).  Thus it is not suitable for network filesystems and
+  other "intermediate" filesystems.
+
+  NOTE: if this option is not specified (and neither 'direct_io') data
+  is still cached after the open(), so a read() system call will not
+  always initiate a read operation.
+
+'direct_io'
+
+  This option disables the use of page cache (file content cache) in
+  the kernel for this filesystem.  This has several affects:
+
+     - Each read() or write() system call will initiate one or more
+       read or write operations, data will not be cached in the
+       kernel.
+
+     - The return value of the read() and write() system calls will
+       correspond to the return values of the read and write
+       operations.  This is useful for example if the file size is not
+       known in advance (before reading it).
+
+'max_read=N'
+
+  With this option the maximum size of read operations can be set.
+  The default is infinite.  Note that the size of read requests is
+  limited anyway to 32 pages (which is 128kbyte on i386).
+
+How do non-privileged mounts work?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since the mount() system call is a privileged operation, a helper
+program (fusermount) is needed, which is installed setuid root.
+
+The implication of providing non-privileged mounts is that the mount
+owner must not be able to use this capability to compromise the
+system.  Obvious requirements arising from this are:
+
+ A) mount owner should not be able to get elevated privileges with the
+    help of the mounted filesystem
+
+ B) mount owner should not get illegitimate access to information from
+    other users' and the super user's processes
+
+ C) mount owner should not be able to induce undesired behavior in
+    other users' or the super user's processes
+
+How are requirements fulfilled?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ A) The mount owner could gain elevated privileges by either:
+
+     1) creating a filesystem containing a device file, then opening
+	this device
+
+     2) creating a filesystem containing a suid or sgid application,
+	then executing this application
+
+    The solution is not to allow opening device files and ignore
+    setuid and setgid bits when executing programs.  To ensure this
+    fusermount always adds "nosuid" and "nodev" to the mount options
+    for non-privileged mounts.
+
+ B) If another user is accessing files or directories in the
+    filesystem, the filesystem daemon serving requests can record the
+    exact sequence and timing of operations performed.  This
+    information is otherwise inaccessible to the mount owner, so this
+    counts as an information leak.
+
+    The solution to this problem will be presented in point 2) of C).
+
+ C) There are several ways in which the mount owner can induce
+    undesired behavior in other users' processes, such as:
+
+     1) mounting a filesystem over a file or directory which the mount
+        owner could otherwise not be able to modify (or could only
+        make limited modifications).
+
+        This is solved in fusermount, by checking the access
+        permissions on the mountpoint and only allowing the mount if
+        the mount owner can do unlimited modification (has write
+        access to the mountpoint, and mountpoint is not a "sticky"
+        directory)
+
+     2) Even if 1) is solved the mount owner can change the behavior
+        of other users' processes.
+
+         i) It can slow down or indefinitely delay the execution of a
+           filesystem operation creating a DoS against the user or the
+           whole system.  For example a suid application locking a
+           system file, and then accessing a file on the mount owner's
+           filesystem could be stopped, and thus causing the system
+           file to be locked forever.
+
+         ii) It can present files or directories of unlimited length, or
+           directory structures of unlimited depth, possibly causing a
+           system process to eat up diskspace, memory or other
+           resources, again causing DoS.
+
+	The solution to this as well as B) is not to allow processes
+	to access the filesystem, which could otherwise not be
+	monitored or manipulated by the mount owner.  Since if the
+	mount owner can ptrace a process, it can do all of the above
+	without using a FUSE mount, the same criteria as used in
+	ptrace can be used to check if a process is allowed to access
+	the filesystem or not.
+
+	Note that the ptrace check is not strictly necessary to
+	prevent B/2/i, it is enough to check if mount owner has enough
+	privilege to send signal to the process accessing the
+	filesystem, since SIGSTOP can be used to get a similar effect.
+
+I think these limitations are unacceptable?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a sysadmin trusts the users enough, or can ensure through other
+measures, that system processes will never enter non-privileged
+mounts, it can relax the last limitation with a "user_allow_other"
+config option.  If this config option is set, the mounting user can
+add the "allow_other" mount option which disables the check for other
+users' processes.
+
+Kernel - userspace interface
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following diagram shows how a filesystem operation (in this
+example unlink) is performed in FUSE.
+
+NOTE: everything in this description is greatly simplified
+
+ |  "rm /mnt/fuse/file"               |  FUSE filesystem daemon
+ |                                    |
+ |                                    |  >sys_read()
+ |                                    |    >fuse_dev_read()
+ |                                    |      >request_wait()
+ |                                    |        [sleep on fc->waitq]
+ |                                    |
+ |  >sys_unlink()                     |
+ |    >fuse_unlink()                  |
+ |      [get request from             |
+ |       fc->unused_list]             |
+ |      >request_send()               |
+ |        [queue req on fc->pending]  |
+ |        [wake up fc->waitq]         |        [woken up]
+ |        >request_wait_answer()      |
+ |          [sleep on req->waitq]     |
+ |                                    |      <request_wait()
+ |                                    |      [remove req from fc->pending]
+ |                                    |      [copy req to read buffer]
+ |                                    |      [add req to fc->processing]
+ |                                    |    <fuse_dev_read()
+ |                                    |  <sys_read()
+ |                                    |
+ |                                    |  [perform unlink]
+ |                                    |
+ |                                    |  >sys_write()
+ |                                    |    >fuse_dev_write()
+ |                                    |      [look up req in fc->processing]
+ |                                    |      [remove from fc->processing]
+ |                                    |      [copy write buffer to req]
+ |          [woken up]                |      [wake up req->waitq]
+ |                                    |    <fuse_dev_write()
+ |                                    |  <sys_write()
+ |        <request_wait_answer()      |
+ |      <request_send()               |
+ |      [add request to               |
+ |       fc->unused_list]             |
+ |    <fuse_unlink()                  |
+ |  <sys_unlink()                     |
+
+There are a couple of ways in which to deadlock a FUSE filesystem.
+Since we are talking about unprivileged userspace programs,
+something must be done about these.
+
+Scenario 1 -  Simple deadlock
+-----------------------------
+
+ |  "rm /mnt/fuse/file"               |  FUSE filesystem daemon
+ |                                    |
+ |  >sys_unlink("/mnt/fuse/file")     |
+ |    [acquire inode semaphore        |
+ |     for "file"]                    |
+ |    >fuse_unlink()                  |
+ |      [sleep on req->waitq]         |
+ |                                    |  <sys_read()
+ |                                    |  >sys_unlink("/mnt/fuse/file")
+ |                                    |    [acquire inode semaphore
+ |                                    |     for "file"]
+ |                                    |    *DEADLOCK*
+
+The solution for this is to allow requests to be interrupted while
+they are in userspace:
+
+ |      [interrupted by signal]       |
+ |    <fuse_unlink()                  |
+ |    [release semaphore]             |    [semaphore acquired]
+ |  <sys_unlink()                     |
+ |                                    |    >fuse_unlink()
+ |                                    |      [queue req on fc->pending]
+ |                                    |      [wake up fc->waitq]
+ |                                    |      [sleep on req->waitq]
+
+If the filesystem daemon was single threaded, this will stop here,
+since there's no other thread to dequeue and execute the request.
+In this case the solution is to kill the FUSE daemon as well.  If
+there are multiple serving threads, you just have to kill them as
+long as any remain.
+
+Moral: a filesystem which deadlocks, can soon find itself dead.
+
+Scenario 2 - Tricky deadlock
+----------------------------
+
+This one needs a carefully crafted filesystem.  It's a variation on
+the above, only the call back to the filesystem is not explicit,
+but is caused by a pagefault.
+
+ |  Kamikaze filesystem thread 1      |  Kamikaze filesystem thread 2
+ |                                    |
+ |  [fd = open("/mnt/fuse/file")]     |  [request served normally]
+ |  [mmap fd to 'addr']               |
+ |  [close fd]                        |  [FLUSH triggers 'magic' flag]
+ |  [read a byte from addr]           |
+ |    >do_page_fault()                |
+ |      [find or create page]         |
+ |      [lock page]                   |
+ |      >fuse_readpage()              |
+ |         [queue READ request]       |
+ |         [sleep on req->waitq]      |
+ |                                    |  [read request to buffer]
+ |                                    |  [create reply header before addr]
+ |                                    |  >sys_write(addr - headerlength)
+ |                                    |    >fuse_dev_write()
+ |                                    |      [look up req in fc->processing]
+ |                                    |      [remove from fc->processing]
+ |                                    |      [copy write buffer to req]
+ |                                    |        >do_page_fault()
+ |                                    |           [find or create page]
+ |                                    |           [lock page]
+ |                                    |           * DEADLOCK *
+
+Solution is again to let the the request be interrupted (not
+elaborated further).
+
+An additional problem is that while the write buffer is being
+copied to the request, the request must not be interrupted.  This
+is because the destination address of the copy may not be valid
+after the request is interrupted.
+
+This is solved with doing the copy atomically, and allowing
+interruption while the page(s) belonging to the write buffer are
+faulted with get_user_pages().  The 'req->locked' flag indicates
+when the copy is taking place, and interruption is delayed until
+this flag is unset.
+
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 9c3e4cc7b1a..21021c35648 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
 
-fuse-objs := inode.o
+fuse-objs := dev.o inode.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
new file mode 100644
index 00000000000..9aaf10a6588
--- /dev/null
+++ b/fs/fuse/dev.c
@@ -0,0 +1,884 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/uio.h>
+#include <linux/miscdevice.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+
+MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+
+static kmem_cache_t *fuse_req_cachep;
+
+static inline struct fuse_conn *fuse_get_conn(struct file *file)
+{
+	struct fuse_conn *fc;
+	spin_lock(&fuse_lock);
+	fc = file->private_data;
+	if (fc && !fc->sb)
+		fc = NULL;
+	spin_unlock(&fuse_lock);
+	return fc;
+}
+
+static inline void fuse_request_init(struct fuse_req *req)
+{
+	memset(req, 0, sizeof(*req));
+	INIT_LIST_HEAD(&req->list);
+	init_waitqueue_head(&req->waitq);
+	atomic_set(&req->count, 1);
+}
+
+struct fuse_req *fuse_request_alloc(void)
+{
+	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
+	if (req)
+		fuse_request_init(req);
+	return req;
+}
+
+void fuse_request_free(struct fuse_req *req)
+{
+	kmem_cache_free(fuse_req_cachep, req);
+}
+
+static inline void block_sigs(sigset_t *oldset)
+{
+	sigset_t mask;
+
+	siginitsetinv(&mask, sigmask(SIGKILL));
+	sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static inline void restore_sigs(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
+void fuse_reset_request(struct fuse_req *req)
+{
+	int preallocated = req->preallocated;
+	BUG_ON(atomic_read(&req->count) != 1);
+	fuse_request_init(req);
+	req->preallocated = preallocated;
+}
+
+static void __fuse_get_request(struct fuse_req *req)
+{
+	atomic_inc(&req->count);
+}
+
+/* Must be called with > 1 refcount */
+static void __fuse_put_request(struct fuse_req *req)
+{
+	BUG_ON(atomic_read(&req->count) < 2);
+	atomic_dec(&req->count);
+}
+
+static struct fuse_req *do_get_request(struct fuse_conn *fc)
+{
+	struct fuse_req *req;
+
+	spin_lock(&fuse_lock);
+	BUG_ON(list_empty(&fc->unused_list));
+	req = list_entry(fc->unused_list.next, struct fuse_req, list);
+	list_del_init(&req->list);
+	spin_unlock(&fuse_lock);
+	fuse_request_init(req);
+	req->preallocated = 1;
+	req->in.h.uid = current->fsuid;
+	req->in.h.gid = current->fsgid;
+	req->in.h.pid = current->pid;
+	return req;
+}
+
+struct fuse_req *fuse_get_request(struct fuse_conn *fc)
+{
+	if (down_interruptible(&fc->outstanding_sem))
+		return NULL;
+	return do_get_request(fc);
+}
+
+/*
+ * Non-interruptible version of the above function is for operations
+ * which can't legally return -ERESTART{SYS,NOINTR}.  This can still
+ * return NULL, but only in case the signal is SIGKILL.
+ */
+struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc)
+{
+	int intr;
+	sigset_t oldset;
+
+	block_sigs(&oldset);
+	intr = down_interruptible(&fc->outstanding_sem);
+	restore_sigs(&oldset);
+	return intr ? NULL : do_get_request(fc);
+}
+
+static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	spin_lock(&fuse_lock);
+	if (req->preallocated)
+		list_add(&req->list, &fc->unused_list);
+	else
+		fuse_request_free(req);
+
+	/* If we are in debt decrease that first */
+	if (fc->outstanding_debt)
+		fc->outstanding_debt--;
+	else
+		up(&fc->outstanding_sem);
+	spin_unlock(&fuse_lock);
+}
+
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (atomic_dec_and_test(&req->count))
+		fuse_putback_request(fc, req);
+}
+
+/*
+ * This function is called when a request is finished.  Either a reply
+ * has arrived or it was interrupted (and not yet sent) or some error
+ * occured during communication with userspace, or the device file was
+ * closed.  It decreases the referece count for the request.  In case
+ * of a background request the referece to the stored objects are
+ * released.  The requester thread is woken up (if still waiting), and
+ * finally the request is either freed or put on the unused_list
+ *
+ * Called with fuse_lock, unlocks it
+ */
+static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int putback;
+	req->finished = 1;
+	putback = atomic_dec_and_test(&req->count);
+	spin_unlock(&fuse_lock);
+	if (req->background) {
+		if (req->inode)
+			iput(req->inode);
+		if (req->inode2)
+			iput(req->inode2);
+		if (req->file)
+			fput(req->file);
+	}
+	wake_up(&req->waitq);
+	if (req->in.h.opcode == FUSE_INIT) {
+		int i;
+
+		if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
+			fc->conn_error = 1;
+
+		/* After INIT reply is received other requests can go
+		   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
+		   up()s on outstanding_sem.  The last up() is done in
+		   fuse_putback_request() */
+		for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
+			up(&fc->outstanding_sem);
+	}
+	if (putback)
+		fuse_putback_request(fc, req);
+}
+
+static void background_request(struct fuse_req *req)
+{
+	/* Need to get hold of the inode(s) and/or file used in the
+	   request, so FORGET and RELEASE are not sent too early */
+	req->background = 1;
+	if (req->inode)
+		req->inode = igrab(req->inode);
+	if (req->inode2)
+		req->inode2 = igrab(req->inode2);
+	if (req->file)
+		get_file(req->file);
+}
+
+static int request_wait_answer_nonint(struct fuse_req *req)
+{
+	int err;
+	sigset_t oldset;
+	block_sigs(&oldset);
+	err = wait_event_interruptible(req->waitq, req->finished);
+	restore_sigs(&oldset);
+	return err;
+}
+
+/* Called with fuse_lock held.  Releases, and then reacquires it. */
+static void request_wait_answer(struct fuse_req *req, int interruptible)
+{
+	int intr;
+
+	spin_unlock(&fuse_lock);
+	if (interruptible)
+		intr = wait_event_interruptible(req->waitq, req->finished);
+	else
+		intr = request_wait_answer_nonint(req);
+	spin_lock(&fuse_lock);
+	if (intr && interruptible && req->sent) {
+		/* If request is already in userspace, only allow KILL
+		   signal to interrupt */
+		spin_unlock(&fuse_lock);
+		intr = request_wait_answer_nonint(req);
+		spin_lock(&fuse_lock);
+	}
+	if (!intr)
+		return;
+
+	if (!interruptible || req->sent)
+		req->out.h.error = -EINTR;
+	else
+		req->out.h.error = -ERESTARTNOINTR;
+
+	req->interrupted = 1;
+	if (req->locked) {
+		/* This is uninterruptible sleep, because data is
+		   being copied to/from the buffers of req.  During
+		   locked state, there mustn't be any filesystem
+		   operation (e.g. page fault), since that could lead
+		   to deadlock */
+		spin_unlock(&fuse_lock);
+		wait_event(req->waitq, !req->locked);
+		spin_lock(&fuse_lock);
+	}
+	if (!req->sent && !list_empty(&req->list)) {
+		list_del(&req->list);
+		__fuse_put_request(req);
+	} else if (!req->finished && req->sent)
+		background_request(req);
+}
+
+static unsigned len_args(unsigned numargs, struct fuse_arg *args)
+{
+	unsigned nbytes = 0;
+	unsigned i;
+
+	for (i = 0; i < numargs; i++)
+		nbytes += args[i].size;
+
+	return nbytes;
+}
+
+static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	fc->reqctr++;
+	/* zero is special */
+	if (fc->reqctr == 0)
+		fc->reqctr = 1;
+	req->in.h.unique = fc->reqctr;
+	req->in.h.len = sizeof(struct fuse_in_header) +
+		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
+	if (!req->preallocated) {
+		/* If request is not preallocated (either FORGET or
+		   RELEASE), then still decrease outstanding_sem, so
+		   user can't open infinite number of files while not
+		   processing the RELEASE requests.  However for
+		   efficiency do it without blocking, so if down()
+		   would block, just increase the debt instead */
+		if (down_trylock(&fc->outstanding_sem))
+			fc->outstanding_debt++;
+	}
+	list_add_tail(&req->list, &fc->pending);
+	wake_up(&fc->waitq);
+}
+
+static void request_send_wait(struct fuse_conn *fc, struct fuse_req *req,
+			      int interruptible)
+{
+	req->isreply = 1;
+	spin_lock(&fuse_lock);
+	if (!fc->file)
+		req->out.h.error = -ENOTCONN;
+	else if (fc->conn_error)
+		req->out.h.error = -ECONNREFUSED;
+	else {
+		queue_request(fc, req);
+		/* acquire extra reference, since request is still needed
+		   after request_end() */
+		__fuse_get_request(req);
+
+		request_wait_answer(req, interruptible);
+	}
+	spin_unlock(&fuse_lock);
+}
+
+void request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+	request_send_wait(fc, req, 1);
+}
+
+/*
+ * Non-interruptible version of the above function is for operations
+ * which can't legally return -ERESTART{SYS,NOINTR}.  This can still
+ * be interrupted but only with SIGKILL.
+ */
+void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req)
+{
+	request_send_wait(fc, req, 0);
+}
+
+static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+{
+	spin_lock(&fuse_lock);
+	if (fc->file) {
+		queue_request(fc, req);
+		spin_unlock(&fuse_lock);
+	} else {
+		req->out.h.error = -ENOTCONN;
+		request_end(fc, req);
+	}
+}
+
+void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 0;
+	request_send_nowait(fc, req);
+}
+
+void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 1;
+	background_request(req);
+	request_send_nowait(fc, req);
+}
+
+void fuse_send_init(struct fuse_conn *fc)
+{
+	/* This is called from fuse_read_super() so there's guaranteed
+	   to be a request available */
+	struct fuse_req *req = do_get_request(fc);
+	struct fuse_init_in_out *arg = &req->misc.init_in_out;
+	arg->major = FUSE_KERNEL_VERSION;
+	arg->minor = FUSE_KERNEL_MINOR_VERSION;
+	req->in.h.opcode = FUSE_INIT;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*arg);
+	req->in.args[0].value = arg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(*arg);
+	req->out.args[0].value = arg;
+	request_send_background(fc, req);
+}
+
+/*
+ * Lock the request.  Up to the next unlock_request() there mustn't be
+ * anything that could cause a page-fault.  If the request was already
+ * interrupted bail out.
+ */
+static inline int lock_request(struct fuse_req *req)
+{
+	int err = 0;
+	if (req) {
+		spin_lock(&fuse_lock);
+		if (req->interrupted)
+			err = -ENOENT;
+		else
+			req->locked = 1;
+		spin_unlock(&fuse_lock);
+	}
+	return err;
+}
+
+/*
+ * Unlock request.  If it was interrupted during being locked, the
+ * requester thread is currently waiting for it to be unlocked, so
+ * wake it up.
+ */
+static inline void unlock_request(struct fuse_req *req)
+{
+	if (req) {
+		spin_lock(&fuse_lock);
+		req->locked = 0;
+		if (req->interrupted)
+			wake_up(&req->waitq);
+		spin_unlock(&fuse_lock);
+	}
+}
+
+struct fuse_copy_state {
+	int write;
+	struct fuse_req *req;
+	const struct iovec *iov;
+	unsigned long nr_segs;
+	unsigned long seglen;
+	unsigned long addr;
+	struct page *pg;
+	void *mapaddr;
+	void *buf;
+	unsigned len;
+};
+
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
+			   struct fuse_req *req, const struct iovec *iov,
+			   unsigned long nr_segs)
+{
+	memset(cs, 0, sizeof(*cs));
+	cs->write = write;
+	cs->req = req;
+	cs->iov = iov;
+	cs->nr_segs = nr_segs;
+}
+
+/* Unmap and put previous page of userspace buffer */
+static inline void fuse_copy_finish(struct fuse_copy_state *cs)
+{
+	if (cs->mapaddr) {
+		kunmap_atomic(cs->mapaddr, KM_USER0);
+		if (cs->write) {
+			flush_dcache_page(cs->pg);
+			set_page_dirty_lock(cs->pg);
+		}
+		put_page(cs->pg);
+		cs->mapaddr = NULL;
+	}
+}
+
+/*
+ * Get another pagefull of userspace buffer, and map it to kernel
+ * address space, and lock request
+ */
+static int fuse_copy_fill(struct fuse_copy_state *cs)
+{
+	unsigned long offset;
+	int err;
+
+	unlock_request(cs->req);
+	fuse_copy_finish(cs);
+	if (!cs->seglen) {
+		BUG_ON(!cs->nr_segs);
+		cs->seglen = cs->iov[0].iov_len;
+		cs->addr = (unsigned long) cs->iov[0].iov_base;
+		cs->iov ++;
+		cs->nr_segs --;
+	}
+	down_read(&current->mm->mmap_sem);
+	err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
+			     &cs->pg, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (err < 0)
+		return err;
+	BUG_ON(err != 1);
+	offset = cs->addr % PAGE_SIZE;
+	cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
+	cs->buf = cs->mapaddr + offset;
+	cs->len = min(PAGE_SIZE - offset, cs->seglen);
+	cs->seglen -= cs->len;
+	cs->addr += cs->len;
+
+	return lock_request(cs->req);
+}
+
+/* Do as much copy to/from userspace buffer as we can */
+static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
+			       unsigned *size)
+{
+	unsigned ncpy = min(*size, cs->len);
+	if (val) {
+		if (cs->write)
+			memcpy(cs->buf, *val, ncpy);
+		else
+			memcpy(*val, cs->buf, ncpy);
+		*val += ncpy;
+	}
+	*size -= ncpy;
+	cs->len -= ncpy;
+	cs->buf += ncpy;
+	return ncpy;
+}
+
+/*
+ * Copy a page in the request to/from the userspace buffer.  Must be
+ * done atomically
+ */
+static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+				 unsigned offset, unsigned count, int zeroing)
+{
+	if (page && zeroing && count < PAGE_SIZE) {
+		void *mapaddr = kmap_atomic(page, KM_USER1);
+		memset(mapaddr, 0, PAGE_SIZE);
+		kunmap_atomic(mapaddr, KM_USER1);
+	}
+	while (count) {
+		int err;
+		if (!cs->len && (err = fuse_copy_fill(cs)))
+			return err;
+		if (page) {
+			void *mapaddr = kmap_atomic(page, KM_USER1);
+			void *buf = mapaddr + offset;
+			offset += fuse_copy_do(cs, &buf, &count);
+			kunmap_atomic(mapaddr, KM_USER1);
+		} else
+			offset += fuse_copy_do(cs, NULL, &count);
+	}
+	if (page && !cs->write)
+		flush_dcache_page(page);
+	return 0;
+}
+
+/* Copy pages in the request to/from userspace buffer */
+static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
+			   int zeroing)
+{
+	unsigned i;
+	struct fuse_req *req = cs->req;
+	unsigned offset = req->page_offset;
+	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
+
+	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
+		struct page *page = req->pages[i];
+		int err = fuse_copy_page(cs, page, offset, count, zeroing);
+		if (err)
+			return err;
+
+		nbytes -= count;
+		count = min(nbytes, (unsigned) PAGE_SIZE);
+		offset = 0;
+	}
+	return 0;
+}
+
+/* Copy a single argument in the request to/from userspace buffer */
+static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
+{
+	while (size) {
+		int err;
+		if (!cs->len && (err = fuse_copy_fill(cs)))
+			return err;
+		fuse_copy_do(cs, &val, &size);
+	}
+	return 0;
+}
+
+/* Copy request arguments to/from userspace buffer */
+static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+			  unsigned argpages, struct fuse_arg *args,
+			  int zeroing)
+{
+	int err = 0;
+	unsigned i;
+
+	for (i = 0; !err && i < numargs; i++)  {
+		struct fuse_arg *arg = &args[i];
+		if (i == numargs - 1 && argpages)
+			err = fuse_copy_pages(cs, arg->size, zeroing);
+		else
+			err = fuse_copy_one(cs, arg->value, arg->size);
+	}
+	return err;
+}
+
+/* Wait until a request is available on the pending list */
+static void request_wait(struct fuse_conn *fc)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue_exclusive(&fc->waitq, &wait);
+	while (fc->sb && list_empty(&fc->pending)) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (signal_pending(current))
+			break;
+
+		spin_unlock(&fuse_lock);
+		schedule();
+		spin_lock(&fuse_lock);
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&fc->waitq, &wait);
+}
+
+/*
+ * Read a single request into the userspace filesystem's buffer.  This
+ * function waits until a request is available, then removes it from
+ * the pending list and copies request data to userspace buffer.  If
+ * no reply is needed (FORGET) or request has been interrupted or
+ * there was an error during the copying then it's finished by calling
+ * request_end().  Otherwise add it to the processing list, and set
+ * the 'sent' flag.
+ */
+static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t *off)
+{
+	int err;
+	struct fuse_conn *fc;
+	struct fuse_req *req;
+	struct fuse_in *in;
+	struct fuse_copy_state cs;
+	unsigned reqsize;
+
+	spin_lock(&fuse_lock);
+	fc = file->private_data;
+	err = -EPERM;
+	if (!fc)
+		goto err_unlock;
+	request_wait(fc);
+	err = -ENODEV;
+	if (!fc->sb)
+		goto err_unlock;
+	err = -ERESTARTSYS;
+	if (list_empty(&fc->pending))
+		goto err_unlock;
+
+	req = list_entry(fc->pending.next, struct fuse_req, list);
+	list_del_init(&req->list);
+	spin_unlock(&fuse_lock);
+
+	in = &req->in;
+	reqsize = req->in.h.len;
+	fuse_copy_init(&cs, 1, req, iov, nr_segs);
+	err = -EINVAL;
+	if (iov_length(iov, nr_segs) >= reqsize) {
+		err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+		if (!err)
+			err = fuse_copy_args(&cs, in->numargs, in->argpages,
+					     (struct fuse_arg *) in->args, 0);
+	}
+	fuse_copy_finish(&cs);
+
+	spin_lock(&fuse_lock);
+	req->locked = 0;
+	if (!err && req->interrupted)
+		err = -ENOENT;
+	if (err) {
+		if (!req->interrupted)
+			req->out.h.error = -EIO;
+		request_end(fc, req);
+		return err;
+	}
+	if (!req->isreply)
+		request_end(fc, req);
+	else {
+		req->sent = 1;
+		list_add_tail(&req->list, &fc->processing);
+		spin_unlock(&fuse_lock);
+	}
+	return reqsize;
+
+ err_unlock:
+	spin_unlock(&fuse_lock);
+	return err;
+}
+
+static ssize_t fuse_dev_read(struct file *file, char __user *buf,
+			     size_t nbytes, loff_t *off)
+{
+	struct iovec iov;
+	iov.iov_len = nbytes;
+	iov.iov_base = buf;
+	return fuse_dev_readv(file, &iov, 1, off);
+}
+
+/* Look up request on processing list by unique ID */
+static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+{
+	struct list_head *entry;
+
+	list_for_each(entry, &fc->processing) {
+		struct fuse_req *req;
+		req = list_entry(entry, struct fuse_req, list);
+		if (req->in.h.unique == unique)
+			return req;
+	}
+	return NULL;
+}
+
+static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
+			 unsigned nbytes)
+{
+	unsigned reqsize = sizeof(struct fuse_out_header);
+
+	if (out->h.error)
+		return nbytes != reqsize ? -EINVAL : 0;
+
+	reqsize += len_args(out->numargs, out->args);
+
+	if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
+		return -EINVAL;
+	else if (reqsize > nbytes) {
+		struct fuse_arg *lastarg = &out->args[out->numargs-1];
+		unsigned diffsize = reqsize - nbytes;
+		if (diffsize > lastarg->size)
+			return -EINVAL;
+		lastarg->size -= diffsize;
+	}
+	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
+			      out->page_zeroing);
+}
+
+/*
+ * Write a single reply to a request.  First the header is copied from
+ * the write buffer.  The request is then searched on the processing
+ * list by the unique ID found in the header.  If found, then remove
+ * it from the list and copy the rest of the buffer to the request.
+ * The request is finished by calling request_end()
+ */
+static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t *off)
+{
+	int err;
+	unsigned nbytes = iov_length(iov, nr_segs);
+	struct fuse_req *req;
+	struct fuse_out_header oh;
+	struct fuse_copy_state cs;
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -ENODEV;
+
+	fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
+	if (nbytes < sizeof(struct fuse_out_header))
+		return -EINVAL;
+
+	err = fuse_copy_one(&cs, &oh, sizeof(oh));
+	if (err)
+		goto err_finish;
+	err = -EINVAL;
+	if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
+	    oh.len != nbytes)
+		goto err_finish;
+
+	spin_lock(&fuse_lock);
+	req = request_find(fc, oh.unique);
+	err = -EINVAL;
+	if (!req)
+		goto err_unlock;
+
+	list_del_init(&req->list);
+	if (req->interrupted) {
+		request_end(fc, req);
+		fuse_copy_finish(&cs);
+		return -ENOENT;
+	}
+	req->out.h = oh;
+	req->locked = 1;
+	cs.req = req;
+	spin_unlock(&fuse_lock);
+
+	err = copy_out_args(&cs, &req->out, nbytes);
+	fuse_copy_finish(&cs);
+
+	spin_lock(&fuse_lock);
+	req->locked = 0;
+	if (!err) {
+		if (req->interrupted)
+			err = -ENOENT;
+	} else if (!req->interrupted)
+		req->out.h.error = -EIO;
+	request_end(fc, req);
+
+	return err ? err : nbytes;
+
+ err_unlock:
+	spin_unlock(&fuse_lock);
+ err_finish:
+	fuse_copy_finish(&cs);
+	return err;
+}
+
+static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
+			      size_t nbytes, loff_t *off)
+{
+	struct iovec iov;
+	iov.iov_len = nbytes;
+	iov.iov_base = (char __user *) buf;
+	return fuse_dev_writev(file, &iov, 1, off);
+}
+
+static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
+{
+	struct fuse_conn *fc = fuse_get_conn(file);
+	unsigned mask = POLLOUT | POLLWRNORM;
+
+	if (!fc)
+		return -ENODEV;
+
+	poll_wait(file, &fc->waitq, wait);
+
+	spin_lock(&fuse_lock);
+	if (!list_empty(&fc->pending))
+                mask |= POLLIN | POLLRDNORM;
+	spin_unlock(&fuse_lock);
+
+	return mask;
+}
+
+/* Abort all requests on the given list (pending or processing) */
+static void end_requests(struct fuse_conn *fc, struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct fuse_req *req;
+		req = list_entry(head->next, struct fuse_req, list);
+		list_del_init(&req->list);
+		req->out.h.error = -ECONNABORTED;
+		request_end(fc, req);
+		spin_lock(&fuse_lock);
+	}
+}
+
+static int fuse_dev_release(struct inode *inode, struct file *file)
+{
+	struct fuse_conn *fc;
+
+	spin_lock(&fuse_lock);
+	fc = file->private_data;
+	if (fc) {
+		fc->file = NULL;
+		end_requests(fc, &fc->pending);
+		end_requests(fc, &fc->processing);
+		fuse_release_conn(fc);
+	}
+	spin_unlock(&fuse_lock);
+	return 0;
+}
+
+struct file_operations fuse_dev_operations = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.read		= fuse_dev_read,
+	.readv		= fuse_dev_readv,
+	.write		= fuse_dev_write,
+	.writev		= fuse_dev_writev,
+	.poll		= fuse_dev_poll,
+	.release	= fuse_dev_release,
+};
+
+static struct miscdevice fuse_miscdevice = {
+	.minor = FUSE_MINOR,
+	.name  = "fuse",
+	.fops = &fuse_dev_operations,
+};
+
+int __init fuse_dev_init(void)
+{
+	int err = -ENOMEM;
+	fuse_req_cachep = kmem_cache_create("fuse_request",
+					    sizeof(struct fuse_req),
+					    0, 0, NULL, NULL);
+	if (!fuse_req_cachep)
+		goto out;
+
+	err = misc_register(&fuse_miscdevice);
+	if (err)
+		goto out_cache_clean;
+
+	return 0;
+
+ out_cache_clean:
+	kmem_cache_destroy(fuse_req_cachep);
+ out:
+	return err;
+}
+
+void fuse_dev_cleanup(void)
+{
+	misc_deregister(&fuse_miscdevice);
+	kmem_cache_destroy(fuse_req_cachep);
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index eed6e89ce01..50ad6a0c39b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -15,6 +15,12 @@
 #include <linux/backing-dev.h>
 #include <asm/semaphore.h>
 
+/** Max number of pages that can be used in a single read request */
+#define FUSE_MAX_PAGES_PER_REQ 32
+
+/** If more requests are outstanding, then the operation will block */
+#define FUSE_MAX_OUTSTANDING 10
+
 /** FUSE inode */
 struct fuse_inode {
 	/** Inode data */
@@ -28,6 +34,123 @@ struct fuse_inode {
 	unsigned long i_time;
 };
 
+/** One input argument of a request */
+struct fuse_in_arg {
+	unsigned size;
+	const void *value;
+};
+
+/** The request input */
+struct fuse_in {
+	/** The request header */
+	struct fuse_in_header h;
+
+	/** True if the data for the last argument is in req->pages */
+	unsigned argpages:1;
+
+	/** Number of arguments */
+	unsigned numargs;
+
+	/** Array of arguments */
+	struct fuse_in_arg args[3];
+};
+
+/** One output argument of a request */
+struct fuse_arg {
+	unsigned size;
+	void *value;
+};
+
+/** The request output */
+struct fuse_out {
+	/** Header returned from userspace */
+	struct fuse_out_header h;
+
+	/** Last argument is variable length (can be shorter than
+	    arg->size) */
+	unsigned argvar:1;
+
+	/** Last argument is a list of pages to copy data to */
+	unsigned argpages:1;
+
+	/** Zero partially or not copied pages */
+	unsigned page_zeroing:1;
+
+	/** Number or arguments */
+	unsigned numargs;
+
+	/** Array of arguments */
+	struct fuse_arg args[3];
+};
+
+struct fuse_req;
+struct fuse_conn;
+
+/**
+ * A request to the client
+ */
+struct fuse_req {
+	/** This can be on either unused_list, pending or processing
+	    lists in fuse_conn */
+	struct list_head list;
+
+	/** refcount */
+	atomic_t count;
+
+	/** True if the request has reply */
+	unsigned isreply:1;
+
+	/** The request is preallocated */
+	unsigned preallocated:1;
+
+	/** The request was interrupted */
+	unsigned interrupted:1;
+
+	/** Request is sent in the background */
+	unsigned background:1;
+
+	/** Data is being copied to/from the request */
+	unsigned locked:1;
+
+	/** Request has been sent to userspace */
+	unsigned sent:1;
+
+	/** The request is finished */
+	unsigned finished:1;
+
+	/** The request input */
+	struct fuse_in in;
+
+	/** The request output */
+	struct fuse_out out;
+
+	/** Used to wake up the task waiting for completion of request*/
+	wait_queue_head_t waitq;
+
+	/** Data for asynchronous requests */
+	union {
+		struct fuse_init_in_out init_in_out;
+	} misc;
+
+	/** page vector */
+	struct page *pages[FUSE_MAX_PAGES_PER_REQ];
+
+	/** number of pages in vector */
+	unsigned num_pages;
+
+	/** offset of data on first page */
+	unsigned page_offset;
+
+	/** Inode used in the request */
+	struct inode *inode;
+
+	/** Second inode used in the request (or NULL) */
+	struct inode *inode2;
+
+	/** File used in the request (or NULL) */
+	struct file *file;
+};
+
 /**
  * A Fuse connection.
  *
@@ -39,9 +162,37 @@ struct fuse_conn {
 	/** The superblock of the mounted filesystem */
 	struct super_block *sb;
 
+	/** The opened client device */
+	struct file *file;
+
 	/** The user id for this mount */
 	uid_t user_id;
 
+	/** Readers of the connection are waiting on this */
+	wait_queue_head_t waitq;
+
+	/** The list of pending requests */
+	struct list_head pending;
+
+	/** The list of requests being processed */
+	struct list_head processing;
+
+	/** Controls the maximum number of outstanding requests */
+	struct semaphore outstanding_sem;
+
+	/** This counts the number of outstanding requests if
+	    outstanding_sem would go negative */
+	unsigned outstanding_debt;
+
+	/** The list of unused requests */
+	struct list_head unused_list;
+
+	/** The next unique request id */
+	u64 reqctr;
+
+	/** Connection failed (version mismatch) */
+	unsigned conn_error : 1;
+
 	/** Backing dev info */
 	struct backing_dev_info bdi;
 };
@@ -71,13 +222,20 @@ static inline u64 get_node_id(struct inode *inode)
 	return get_fuse_inode(inode)->nodeid;
 }
 
+/** Device operations */
+extern struct file_operations fuse_dev_operations;
+
 /**
  * This is the single global spinlock which protects FUSE's structures
  *
  * The following data is protected by this lock:
  *
+ *  - the private_data field of the device file
  *  - the s_fs_info field of the super block
+ *  - unused_list, pending, processing lists in fuse_conn
+ *  - the unique request ID counter reqctr in fuse_conn
  *  - the sb (super_block) field in fuse_conn
+ *  - the file (device file) field in fuse_conn
  */
 extern spinlock_t fuse_lock;
 
@@ -87,3 +245,68 @@ extern spinlock_t fuse_lock;
  */
 void fuse_release_conn(struct fuse_conn *fc);
 
+/**
+ * Initialize the client device
+ */
+int fuse_dev_init(void);
+
+/**
+ * Cleanup the client device
+ */
+void fuse_dev_cleanup(void);
+
+/**
+ * Allocate a request
+ */
+struct fuse_req *fuse_request_alloc(void);
+
+/**
+ * Free a request
+ */
+void fuse_request_free(struct fuse_req *req);
+
+/**
+ * Reinitialize a request, the preallocated flag is left unmodified
+ */
+void fuse_reset_request(struct fuse_req *req);
+
+/**
+ * Reserve a preallocated request
+ */
+struct fuse_req *fuse_get_request(struct fuse_conn *fc);
+
+/**
+ * Reserve a preallocated request, only interruptible by SIGKILL
+ */
+struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc);
+
+/**
+ * Decrement reference count of a request.  If count goes to zero put
+ * on unused list (preallocated) or free reqest (not preallocated).
+ */
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request (synchronous, interruptible)
+ */
+void request_send(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request (synchronous, non-interruptible except by SIGKILL)
+ */
+void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request with no reply
+ */
+void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request in the background
+ */
+void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send the INIT message
+ */
+void fuse_send_init(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ea6339c2b6a..33fad334ba7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -151,6 +151,8 @@ static void fuse_put_super(struct super_block *sb)
 	mount_count --;
 	fc->sb = NULL;
 	fc->user_id = 0;
+	/* Flush all readers on this fs */
+	wake_up_all(&fc->waitq);
 	fuse_release_conn(fc);
 	*get_fuse_conn_super_p(sb) = NULL;
 	spin_unlock(&fuse_lock);
@@ -229,22 +231,51 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
-void fuse_release_conn(struct fuse_conn *fc)
+static void free_conn(struct fuse_conn *fc)
 {
+	while (!list_empty(&fc->unused_list)) {
+		struct fuse_req *req;
+		req = list_entry(fc->unused_list.next, struct fuse_req, list);
+		list_del(&req->list);
+		fuse_request_free(req);
+	}
 	kfree(fc);
 }
 
+/* Must be called with the fuse lock held */
+void fuse_release_conn(struct fuse_conn *fc)
+{
+	if (!fc->sb && !fc->file)
+		free_conn(fc);
+}
+
 static struct fuse_conn *new_conn(void)
 {
 	struct fuse_conn *fc;
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
 	if (fc != NULL) {
+		int i;
 		memset(fc, 0, sizeof(*fc));
 		fc->sb = NULL;
+		fc->file = NULL;
 		fc->user_id = 0;
+		init_waitqueue_head(&fc->waitq);
+		INIT_LIST_HEAD(&fc->pending);
+		INIT_LIST_HEAD(&fc->processing);
+		INIT_LIST_HEAD(&fc->unused_list);
+		sema_init(&fc->outstanding_sem, 0);
+		for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) {
+			struct fuse_req *req = fuse_request_alloc();
+			if (!req) {
+				free_conn(fc);
+				return NULL;
+			}
+			list_add(&req->list, &fc->unused_list);
+		}
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
+		fc->reqctr = 0;
 	}
 	return fc;
 }
@@ -253,11 +284,20 @@ static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
 {
 	struct fuse_conn *fc;
 
+	if (file->f_op != &fuse_dev_operations)
+		return ERR_PTR(-EINVAL);
 	fc = new_conn();
 	if (fc == NULL)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	spin_lock(&fuse_lock);
-	fc->sb = sb;
+	if (file->private_data) {
+		free_conn(fc);
+		fc = ERR_PTR(-EINVAL);
+	} else {
+		file->private_data = fc;
+		fc->sb = sb;
+		fc->file = file;
+	}
 	spin_unlock(&fuse_lock);
 	return fc;
 }
@@ -315,8 +355,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	fc = get_conn(file, sb);
 	fput(file);
-	if (fc == NULL)
-		return -EINVAL;
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
 
 	fc->user_id = d.user_id;
 
@@ -336,6 +376,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		iput(root);
 		goto err;
 	}
+	fuse_send_init(fc);
 	return 0;
 
  err:
@@ -411,8 +452,14 @@ static int __init fuse_init(void)
 	if (res)
 		goto err;
 
+	res = fuse_dev_init();
+	if (res)
+		goto err_fs_cleanup;
+
 	return 0;
 
+ err_fs_cleanup:
+	fuse_fs_cleanup();
  err:
 	return res;
 }
@@ -422,6 +469,7 @@ static void __exit fuse_exit(void)
 	printk(KERN_DEBUG "fuse exit\n");
 
 	fuse_fs_cleanup();
+	fuse_dev_cleanup();
 }
 
 module_init(fuse_init);
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 2b1f4ae01e9..a1aebd7104c 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -11,7 +11,7 @@
 #include <asm/types.h>
 
 /** Version number of this interface */
-#define FUSE_KERNEL_VERSION 5
+#define FUSE_KERNEL_VERSION 6
 
 /** Minor version number of this interface */
 #define FUSE_KERNEL_MINOR_VERSION 1
@@ -19,6 +19,12 @@
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
 
+/** The major number of the fuse character device */
+#define FUSE_MAJOR 10
+
+/** The minor number of the fuse character device */
+#define FUSE_MINOR 229
+
 struct fuse_attr {
 	__u64	ino;
 	__u64	size;
@@ -36,3 +42,31 @@ struct fuse_attr {
 	__u32	rdev;
 };
 
+enum fuse_opcode {
+	FUSE_INIT          = 26
+};
+
+/* Conservative buffer size for the client */
+#define FUSE_MAX_IN 8192
+
+struct fuse_init_in_out {
+	__u32	major;
+	__u32	minor;
+};
+
+struct fuse_in_header {
+	__u32	len;
+	__u32	opcode;
+	__u64	unique;
+	__u64	nodeid;
+	__u32	uid;
+	__u32	gid;
+	__u32	pid;
+};
+
+struct fuse_out_header {
+	__u32	len;
+	__s32	error;
+	__u64	unique;
+};
+
-- 
cgit v1.2.3-70-g09d2


From e5e5558e923f35839108a12718494ecb73fb782f Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:28 -0700
Subject: [PATCH] FUSE - read-only operations

This patch adds the read-only filesystem operations of FUSE.

This contains the following files:

 o dir.c
    - directory, symlink and file-inode operations

The following operations are added:

 o lookup
 o getattr
 o readlink
 o follow_link
 o directory open
 o readdir
 o directory release
 o permission
 o dentry revalidate
 o statfs

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/Makefile     |   2 +-
 fs/fuse/dev.c        |   9 ++
 fs/fuse/dir.c        | 413 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h     |  51 +++++++
 fs/fuse/inode.c      |  81 ++++++++++
 include/linux/fuse.h |  60 ++++++++
 6 files changed, 615 insertions(+), 1 deletion(-)
 create mode 100644 fs/fuse/dir.c

(limited to 'fs')

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 21021c35648..c34e268a0ed 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
 
-fuse-objs := dev.o inode.o
+fuse-objs := dev.o dir.o inode.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9aaf10a6588..e8f3170946f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -691,6 +691,13 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 	return NULL;
 }
 
+/* fget() needs to be done in this context */
+static void process_getdir(struct fuse_req *req)
+{
+	struct fuse_getdir_out_i *arg = req->out.args[0].value;
+	arg->file = fget(arg->fd);
+}
+
 static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
 			 unsigned nbytes)
 {
@@ -770,6 +777,8 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	if (!err) {
 		if (req->interrupted)
 			err = -ENOENT;
+		else if (req->in.h.opcode == FUSE_GETDIR && !oh.error)
+			process_getdir(req);
 	} else if (!req->interrupted)
 		req->out.h.error = -EIO;
 	request_end(fc, req);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
new file mode 100644
index 00000000000..a89730e70c5
--- /dev/null
+++ b/fs/fuse/dir.c
@@ -0,0 +1,413 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+
+static inline unsigned long time_to_jiffies(unsigned long sec,
+					    unsigned long nsec)
+{
+	struct timespec ts = {sec, nsec};
+	return jiffies + timespec_to_jiffies(&ts);
+}
+
+static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
+			     struct dentry *entry,
+			     struct fuse_entry_out *outarg)
+{
+	req->in.h.opcode = FUSE_LOOKUP;
+	req->in.h.nodeid = get_node_id(dir);
+	req->inode = dir;
+	req->in.numargs = 1;
+	req->in.args[0].size = entry->d_name.len + 1;
+	req->in.args[0].value = entry->d_name.name;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(struct fuse_entry_out);
+	req->out.args[0].value = outarg;
+}
+
+static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
+{
+	if (!entry->d_inode || is_bad_inode(entry->d_inode))
+		return 0;
+	else if (time_after(jiffies, entry->d_time)) {
+		int err;
+		int version;
+		struct fuse_entry_out outarg;
+		struct inode *inode = entry->d_inode;
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		struct fuse_conn *fc = get_fuse_conn(inode);
+		struct fuse_req *req = fuse_get_request_nonint(fc);
+		if (!req)
+			return 0;
+
+		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
+		request_send_nonint(fc, req);
+		version = req->out.h.unique;
+		err = req->out.h.error;
+		fuse_put_request(fc, req);
+		if (err || outarg.nodeid != get_node_id(inode) ||
+		    (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+			return 0;
+
+		fuse_change_attributes(inode, &outarg.attr);
+		inode->i_version = version;
+		entry->d_time = time_to_jiffies(outarg.entry_valid,
+						outarg.entry_valid_nsec);
+		fi->i_time = time_to_jiffies(outarg.attr_valid,
+					     outarg.attr_valid_nsec);
+	}
+	return 1;
+}
+
+static struct dentry_operations fuse_dentry_operations = {
+	.d_revalidate	= fuse_dentry_revalidate,
+};
+
+static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
+			    struct inode **inodep)
+{
+	int err;
+	int version;
+	struct fuse_entry_out outarg;
+	struct inode *inode = NULL;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req;
+
+	if (entry->d_name.len > FUSE_NAME_MAX)
+		return -ENAMETOOLONG;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	fuse_lookup_init(req, dir, entry, &outarg);
+	request_send(fc, req);
+	version = req->out.h.unique;
+	err = req->out.h.error;
+	if (!err) {
+		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+				  &outarg.attr, version);
+		if (!inode) {
+			fuse_send_forget(fc, req, outarg.nodeid, version);
+			return -ENOMEM;
+		}
+	}
+	fuse_put_request(fc, req);
+	if (err && err != -ENOENT)
+		return err;
+
+	if (inode) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		entry->d_time =	time_to_jiffies(outarg.entry_valid,
+						outarg.entry_valid_nsec);
+		fi->i_time = time_to_jiffies(outarg.attr_valid,
+					     outarg.attr_valid_nsec);
+	}
+
+	entry->d_op = &fuse_dentry_operations;
+	*inodep = inode;
+	return 0;
+}
+
+int fuse_do_getattr(struct inode *inode)
+{
+	int err;
+	struct fuse_attr_out arg;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	req->in.h.opcode = FUSE_GETATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(arg);
+	req->out.args[0].value = &arg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		if ((inode->i_mode ^ arg.attr.mode) & S_IFMT) {
+			make_bad_inode(inode);
+			err = -EIO;
+		} else {
+			struct fuse_inode *fi = get_fuse_inode(inode);
+			fuse_change_attributes(inode, &arg.attr);
+			fi->i_time = time_to_jiffies(arg.attr_valid,
+						     arg.attr_valid_nsec);
+		}
+	}
+	return err;
+}
+
+static int fuse_revalidate(struct dentry *entry)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (get_node_id(inode) == FUSE_ROOT_ID) {
+		if (current->fsuid != fc->user_id)
+			return -EACCES;
+	} else if (time_before_eq(jiffies, fi->i_time))
+		return 0;
+
+	return fuse_do_getattr(inode);
+}
+
+static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (current->fsuid != fc->user_id)
+		return -EACCES;
+	else {
+		int mode = inode->i_mode;
+		if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
+                    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+                        return -EROFS;
+		if ((mask & MAY_EXEC) && !S_ISDIR(mode) && !(mode & S_IXUGO))
+			return -EACCES;
+		return 0;
+	}
+}
+
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+			 void *dstbuf, filldir_t filldir)
+{
+	while (nbytes >= FUSE_NAME_OFFSET) {
+		struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+		size_t reclen = FUSE_DIRENT_SIZE(dirent);
+		int over;
+		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+			return -EIO;
+		if (reclen > nbytes)
+			break;
+
+		over = filldir(dstbuf, dirent->name, dirent->namelen,
+			       file->f_pos, dirent->ino, dirent->type);
+		if (over)
+			break;
+
+		buf += reclen;
+		nbytes -= reclen;
+		file->f_pos = dirent->off;
+	}
+
+	return 0;
+}
+
+static int fuse_checkdir(struct file *cfile, struct file *file)
+{
+	struct inode *inode;
+	if (!cfile)
+		return -EIO;
+	inode = cfile->f_dentry->d_inode;
+	if (!S_ISREG(inode->i_mode)) {
+		fput(cfile);
+		return -EIO;
+	}
+
+	file->private_data = cfile;
+	return 0;
+}
+
+static int fuse_getdir(struct file *file)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = fuse_get_request(fc);
+	struct fuse_getdir_out_i outarg;
+	int err;
+
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	req->in.h.opcode = FUSE_GETDIR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(struct fuse_getdir_out);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err)
+		err = fuse_checkdir(outarg.file, file);
+	return err;
+}
+
+static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+{
+	struct file *cfile = file->private_data;
+	char *buf;
+	int ret;
+
+	if (!cfile) {
+		ret = fuse_getdir(file);
+		if (ret)
+			return ret;
+
+		cfile = file->private_data;
+	}
+
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	ret = kernel_read(cfile, file->f_pos, buf, PAGE_SIZE);
+	if (ret > 0)
+		ret = parse_dirfile(buf, ret, file, dstbuf, filldir);
+
+	free_page((unsigned long) buf);
+	return ret;
+}
+
+static char *read_link(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = fuse_get_request(fc);
+	char *link;
+
+	if (!req)
+		return ERR_PTR(-ERESTARTNOINTR);
+
+	link = (char *) __get_free_page(GFP_KERNEL);
+	if (!link) {
+		link = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	req->in.h.opcode = FUSE_READLINK;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->out.argvar = 1;
+	req->out.numargs = 1;
+	req->out.args[0].size = PAGE_SIZE - 1;
+	req->out.args[0].value = link;
+	request_send(fc, req);
+	if (req->out.h.error) {
+		free_page((unsigned long) link);
+		link = ERR_PTR(req->out.h.error);
+	} else
+		link[req->out.args[0].size] = '\0';
+ out:
+	fuse_put_request(fc, req);
+	return link;
+}
+
+static void free_link(char *link)
+{
+	if (!IS_ERR(link))
+		free_page((unsigned long) link);
+}
+
+static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	nd_set_link(nd, read_link(dentry));
+	return NULL;
+}
+
+static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+	free_link(nd_get_link(nd));
+}
+
+static int fuse_dir_open(struct inode *inode, struct file *file)
+{
+	file->private_data = NULL;
+	return 0;
+}
+
+static int fuse_dir_release(struct inode *inode, struct file *file)
+{
+	struct file *cfile = file->private_data;
+
+	if (cfile)
+		fput(cfile);
+
+	return 0;
+}
+
+static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
+			struct kstat *stat)
+{
+	struct inode *inode = entry->d_inode;
+	int err = fuse_revalidate(entry);
+	if (!err)
+		generic_fillattr(inode, stat);
+
+	return err;
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+				  struct nameidata *nd)
+{
+	struct inode *inode;
+	int err = fuse_lookup_iget(dir, entry, &inode);
+	if (err)
+		return ERR_PTR(err);
+	if (inode && S_ISDIR(inode->i_mode)) {
+		/* Don't allow creating an alias to a directory  */
+		struct dentry *alias = d_find_alias(inode);
+		if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+			dput(alias);
+			iput(inode);
+			return ERR_PTR(-EIO);
+		}
+	}
+	return d_splice_alias(inode, entry);
+}
+
+static struct inode_operations fuse_dir_inode_operations = {
+	.lookup		= fuse_lookup,
+	.permission	= fuse_permission,
+	.getattr	= fuse_getattr,
+};
+
+static struct file_operations fuse_dir_operations = {
+	.read		= generic_read_dir,
+	.readdir	= fuse_readdir,
+	.open		= fuse_dir_open,
+	.release	= fuse_dir_release,
+};
+
+static struct inode_operations fuse_common_inode_operations = {
+	.permission	= fuse_permission,
+	.getattr	= fuse_getattr,
+};
+
+static struct inode_operations fuse_symlink_inode_operations = {
+	.follow_link	= fuse_follow_link,
+	.put_link	= fuse_put_link,
+	.readlink	= generic_readlink,
+	.getattr	= fuse_getattr,
+};
+
+void fuse_init_common(struct inode *inode)
+{
+	inode->i_op = &fuse_common_inode_operations;
+}
+
+void fuse_init_dir(struct inode *inode)
+{
+	inode->i_op = &fuse_dir_inode_operations;
+	inode->i_fop = &fuse_dir_operations;
+}
+
+void fuse_init_symlink(struct inode *inode)
+{
+	inode->i_op = &fuse_symlink_inode_operations;
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 50ad6a0c39b..8d91e1492f9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -30,6 +30,9 @@ struct fuse_inode {
 	 * and kernel */
 	u64 nodeid;
 
+	/** The request used for sending the FORGET message */
+	struct fuse_req *forget_req;
+
 	/** Time in jiffies until the file attributes are valid */
 	unsigned long i_time;
 };
@@ -129,6 +132,7 @@ struct fuse_req {
 
 	/** Data for asynchronous requests */
 	union {
+		struct fuse_forget_in forget_in;
 		struct fuse_init_in_out init_in_out;
 	} misc;
 
@@ -197,6 +201,11 @@ struct fuse_conn {
 	struct backing_dev_info bdi;
 };
 
+struct fuse_getdir_out_i {
+	int fd;
+	void *file; /* Used by kernel only */
+};
+
 static inline struct fuse_conn **get_fuse_conn_super_p(struct super_block *sb)
 {
 	return (struct fuse_conn **) &sb->s_fs_info;
@@ -239,6 +248,38 @@ extern struct file_operations fuse_dev_operations;
  */
 extern spinlock_t fuse_lock;
 
+/**
+ * Get a filled in inode
+ */
+struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
+			int generation, struct fuse_attr *attr, int version);
+
+/**
+ * Send FORGET command
+ */
+void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+		      unsigned long nodeid, int version);
+
+/**
+ * Initialise inode operations on regular files and special files
+ */
+void fuse_init_common(struct inode *inode);
+
+/**
+ * Initialise inode and file operations on a directory
+ */
+void fuse_init_dir(struct inode *inode);
+
+/**
+ * Initialise inode operations on a symlink
+ */
+void fuse_init_symlink(struct inode *inode);
+
+/**
+ * Change attributes of an inode
+ */
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr);
+
 /**
  * Check if the connection can be released, and if yes, then free the
  * connection structure
@@ -306,6 +347,16 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
  */
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
+/**
+ * Get the attributes of a file
+ */
+int fuse_do_getattr(struct inode *inode);
+
+/**
+ * Invalidate inode attributes
+ */
+void fuse_invalidate_attr(struct inode *inode);
+
 /**
  * Send the INIT message
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 33fad334ba7..41498a1952a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -51,12 +51,20 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi = get_fuse_inode(inode);
 	fi->i_time = jiffies - 1;
 	fi->nodeid = 0;
+	fi->forget_req = fuse_request_alloc();
+	if (!fi->forget_req) {
+		kmem_cache_free(fuse_inode_cachep, inode);
+		return NULL;
+	}
 
 	return inode;
 }
 
 static void fuse_destroy_inode(struct inode *inode)
 {
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	if (fi->forget_req)
+		fuse_request_free(fi->forget_req);
 	kmem_cache_free(fuse_inode_cachep, inode);
 }
 
@@ -65,8 +73,27 @@ static void fuse_read_inode(struct inode *inode)
 	/* No op */
 }
 
+void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+		      unsigned long nodeid, int version)
+{
+	struct fuse_forget_in *inarg = &req->misc.forget_in;
+	inarg->version = version;
+	req->in.h.opcode = FUSE_FORGET;
+	req->in.h.nodeid = nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct fuse_forget_in);
+	req->in.args[0].value = inarg;
+	request_send_noreply(fc, req);
+}
+
 static void fuse_clear_inode(struct inode *inode)
 {
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	if (fc) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		fuse_send_forget(fc, fi->forget_req, fi->nodeid, inode->i_version);
+		fi->forget_req = NULL;
+	}
 }
 
 void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
@@ -94,6 +121,22 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 {
 	inode->i_mode = attr->mode & S_IFMT;
 	i_size_write(inode, attr->size);
+	if (S_ISREG(inode->i_mode)) {
+		fuse_init_common(inode);
+	} else if (S_ISDIR(inode->i_mode))
+		fuse_init_dir(inode);
+	else if (S_ISLNK(inode->i_mode))
+		fuse_init_symlink(inode);
+	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+		 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+		fuse_init_common(inode);
+		init_special_inode(inode, inode->i_mode,
+				   new_decode_dev(attr->rdev));
+	} else {
+		/* Don't let user create weird files */
+		inode->i_mode = S_IFREG;
+		fuse_init_common(inode);
+	}
 }
 
 static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
@@ -158,6 +201,43 @@ static void fuse_put_super(struct super_block *sb)
 	spin_unlock(&fuse_lock);
 }
 
+static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
+{
+	stbuf->f_type    = FUSE_SUPER_MAGIC;
+	stbuf->f_bsize   = attr->bsize;
+	stbuf->f_blocks  = attr->blocks;
+	stbuf->f_bfree   = attr->bfree;
+	stbuf->f_bavail  = attr->bavail;
+	stbuf->f_files   = attr->files;
+	stbuf->f_ffree   = attr->ffree;
+	stbuf->f_namelen = attr->namelen;
+	/* fsid is left zero */
+}
+
+static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	struct fuse_req *req;
+	struct fuse_statfs_out outarg;
+	int err;
+
+        req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTSYS;
+
+	req->in.numargs = 0;
+	req->in.h.opcode = FUSE_STATFS;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	if (!err)
+		convert_fuse_statfs(buf, &outarg.st);
+	fuse_put_request(fc, req);
+	return err;
+}
+
 enum {
 	OPT_FD,
 	OPT_ROOTMODE,
@@ -318,6 +398,7 @@ static struct super_operations fuse_super_operations = {
 	.read_inode	= fuse_read_inode,
 	.clear_inode	= fuse_clear_inode,
 	.put_super	= fuse_put_super,
+	.statfs		= fuse_statfs,
 	.show_options	= fuse_show_options,
 };
 
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index a1aebd7104c..21b9ba16f8d 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -42,13 +42,61 @@ struct fuse_attr {
 	__u32	rdev;
 };
 
+struct fuse_kstatfs {
+	__u64	blocks;
+	__u64	bfree;
+	__u64	bavail;
+	__u64	files;
+	__u64	ffree;
+	__u32	bsize;
+	__u32	namelen;
+};
+
 enum fuse_opcode {
+	FUSE_LOOKUP	   = 1,
+	FUSE_FORGET	   = 2,  /* no reply */
+	FUSE_GETATTR	   = 3,
+	FUSE_READLINK	   = 5,
+	FUSE_GETDIR	   = 7,
+	FUSE_STATFS	   = 17,
 	FUSE_INIT          = 26
 };
 
 /* Conservative buffer size for the client */
 #define FUSE_MAX_IN 8192
 
+#define FUSE_NAME_MAX 1024
+
+struct fuse_entry_out {
+	__u64	nodeid;		/* Inode ID */
+	__u64	generation;	/* Inode generation: nodeid:gen must
+				   be unique for the fs's lifetime */
+	__u64	entry_valid;	/* Cache timeout for the name */
+	__u64	attr_valid;	/* Cache timeout for the attributes */
+	__u32	entry_valid_nsec;
+	__u32	attr_valid_nsec;
+	struct fuse_attr attr;
+};
+
+struct fuse_forget_in {
+	__u64	version;
+};
+
+struct fuse_attr_out {
+	__u64	attr_valid;	/* Cache timeout for the attributes */
+	__u32	attr_valid_nsec;
+	__u32	dummy;
+	struct fuse_attr attr;
+};
+
+struct fuse_getdir_out {
+	__u32	fd;
+};
+
+struct fuse_statfs_out {
+	struct fuse_kstatfs st;
+};
+
 struct fuse_init_in_out {
 	__u32	major;
 	__u32	minor;
@@ -70,3 +118,15 @@ struct fuse_out_header {
 	__u64	unique;
 };
 
+struct fuse_dirent {
+	__u64	ino;
+	__u64	off;
+	__u32	namelen;
+	__u32	type;
+	char name[0];
+};
+
+#define FUSE_NAME_OFFSET ((unsigned) ((struct fuse_dirent *) 0)->name)
+#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
+#define FUSE_DIRENT_SIZE(d) \
+	FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
-- 
cgit v1.2.3-70-g09d2


From 9e6268db496a2592e89457537ea54a496feabb77 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:29 -0700
Subject: [PATCH] FUSE - read-write operations

This patch adds the write filesystem operations of FUSE.

The following operations are added:

 o setattr
 o symlink
 o mknod
 o mkdir
 o create
 o unlink
 o rmdir
 o rename
 o link

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c        | 373 +++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/fuse/fuse_i.h     |   7 +-
 fs/fuse/inode.c      |  15 ++-
 include/linux/fuse.h |  43 +++++-
 4 files changed, 419 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index a89730e70c5..92c7188ccd1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -42,7 +42,6 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		return 0;
 	else if (time_after(jiffies, entry->d_time)) {
 		int err;
-		int version;
 		struct fuse_entry_out outarg;
 		struct inode *inode = entry->d_inode;
 		struct fuse_inode *fi = get_fuse_inode(inode);
@@ -53,15 +52,19 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 
 		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
 		request_send_nonint(fc, req);
-		version = req->out.h.unique;
 		err = req->out.h.error;
+		if (!err) {
+			if (outarg.nodeid != get_node_id(inode)) {
+				fuse_send_forget(fc, req, outarg.nodeid, 1);
+				return 0;
+			}
+			fi->nlookup ++;
+		}
 		fuse_put_request(fc, req);
-		if (err || outarg.nodeid != get_node_id(inode) ||
-		    (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+		if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
 			return 0;
 
 		fuse_change_attributes(inode, &outarg.attr);
-		inode->i_version = version;
 		entry->d_time = time_to_jiffies(outarg.entry_valid,
 						outarg.entry_valid_nsec);
 		fi->i_time = time_to_jiffies(outarg.attr_valid,
@@ -78,7 +81,6 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
 			    struct inode **inodep)
 {
 	int err;
-	int version;
 	struct fuse_entry_out outarg;
 	struct inode *inode = NULL;
 	struct fuse_conn *fc = get_fuse_conn(dir);
@@ -93,13 +95,12 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
 
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
-	version = req->out.h.unique;
 	err = req->out.h.error;
 	if (!err) {
 		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
-				  &outarg.attr, version);
+				  &outarg.attr);
 		if (!inode) {
-			fuse_send_forget(fc, req, outarg.nodeid, version);
+			fuse_send_forget(fc, req, outarg.nodeid, 1);
 			return -ENOMEM;
 		}
 	}
@@ -120,6 +121,264 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
 	return 0;
 }
 
+void fuse_invalidate_attr(struct inode *inode)
+{
+	get_fuse_inode(inode)->i_time = jiffies - 1;
+}
+
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+	d_invalidate(entry);
+	entry->d_time = jiffies - 1;
+}
+
+static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
+			    struct inode *dir, struct dentry *entry,
+			    int mode)
+{
+	struct fuse_entry_out outarg;
+	struct inode *inode;
+	struct fuse_inode *fi;
+	int err;
+
+	req->in.h.nodeid = get_node_id(dir);
+	req->inode = dir;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	if (err) {
+		fuse_put_request(fc, req);
+		return err;
+	}
+	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+			  &outarg.attr);
+	if (!inode) {
+		fuse_send_forget(fc, req, outarg.nodeid, 1);
+		return -ENOMEM;
+	}
+	fuse_put_request(fc, req);
+
+	/* Don't allow userspace to do really stupid things... */
+	if ((inode->i_mode ^ mode) & S_IFMT) {
+		iput(inode);
+		return -EIO;
+	}
+
+	entry->d_time = time_to_jiffies(outarg.entry_valid,
+					outarg.entry_valid_nsec);
+
+	fi = get_fuse_inode(inode);
+	fi->i_time = time_to_jiffies(outarg.attr_valid,
+				     outarg.attr_valid_nsec);
+
+	d_instantiate(entry, inode);
+	fuse_invalidate_attr(dir);
+	return 0;
+}
+
+static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
+		      dev_t rdev)
+{
+	struct fuse_mknod_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.mode = mode;
+	inarg.rdev = new_encode_dev(rdev);
+	req->in.h.opcode = FUSE_MKNOD;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = entry->d_name.len + 1;
+	req->in.args[1].value = entry->d_name.name;
+	return create_new_entry(fc, req, dir, entry, mode);
+}
+
+static int fuse_create(struct inode *dir, struct dentry *entry, int mode,
+		       struct nameidata *nd)
+{
+	return fuse_mknod(dir, entry, mode, 0);
+}
+
+static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
+{
+	struct fuse_mkdir_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.mode = mode;
+	req->in.h.opcode = FUSE_MKDIR;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = entry->d_name.len + 1;
+	req->in.args[1].value = entry->d_name.name;
+	return create_new_entry(fc, req, dir, entry, S_IFDIR);
+}
+
+static int fuse_symlink(struct inode *dir, struct dentry *entry,
+			const char *link)
+{
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	unsigned len = strlen(link) + 1;
+	struct fuse_req *req;
+
+	if (len > FUSE_SYMLINK_MAX)
+		return -ENAMETOOLONG;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	req->in.h.opcode = FUSE_SYMLINK;
+	req->in.numargs = 2;
+	req->in.args[0].size = entry->d_name.len + 1;
+	req->in.args[0].value = entry->d_name.name;
+	req->in.args[1].size = len;
+	req->in.args[1].value = link;
+	return create_new_entry(fc, req, dir, entry, S_IFLNK);
+}
+
+static int fuse_unlink(struct inode *dir, struct dentry *entry)
+{
+	int err;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	req->in.h.opcode = FUSE_UNLINK;
+	req->in.h.nodeid = get_node_id(dir);
+	req->inode = dir;
+	req->in.numargs = 1;
+	req->in.args[0].size = entry->d_name.len + 1;
+	req->in.args[0].value = entry->d_name.name;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		struct inode *inode = entry->d_inode;
+
+		/* Set nlink to zero so the inode can be cleared, if
+                   the inode does have more links this will be
+                   discovered at the next lookup/getattr */
+		inode->i_nlink = 0;
+		fuse_invalidate_attr(inode);
+		fuse_invalidate_attr(dir);
+	} else if (err == -EINTR)
+		fuse_invalidate_entry(entry);
+	return err;
+}
+
+static int fuse_rmdir(struct inode *dir, struct dentry *entry)
+{
+	int err;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	req->in.h.opcode = FUSE_RMDIR;
+	req->in.h.nodeid = get_node_id(dir);
+	req->inode = dir;
+	req->in.numargs = 1;
+	req->in.args[0].size = entry->d_name.len + 1;
+	req->in.args[0].value = entry->d_name.name;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		entry->d_inode->i_nlink = 0;
+		fuse_invalidate_attr(dir);
+	} else if (err == -EINTR)
+		fuse_invalidate_entry(entry);
+	return err;
+}
+
+static int fuse_rename(struct inode *olddir, struct dentry *oldent,
+		       struct inode *newdir, struct dentry *newent)
+{
+	int err;
+	struct fuse_rename_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(olddir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.newdir = get_node_id(newdir);
+	req->in.h.opcode = FUSE_RENAME;
+	req->in.h.nodeid = get_node_id(olddir);
+	req->inode = olddir;
+	req->inode2 = newdir;
+	req->in.numargs = 3;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = oldent->d_name.len + 1;
+	req->in.args[1].value = oldent->d_name.name;
+	req->in.args[2].size = newent->d_name.len + 1;
+	req->in.args[2].value = newent->d_name.name;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		fuse_invalidate_attr(olddir);
+		if (olddir != newdir)
+			fuse_invalidate_attr(newdir);
+	} else if (err == -EINTR) {
+		/* If request was interrupted, DEITY only knows if the
+		   rename actually took place.  If the invalidation
+		   fails (e.g. some process has CWD under the renamed
+		   directory), then there can be inconsistency between
+		   the dcache and the real filesystem.  Tough luck. */
+		fuse_invalidate_entry(oldent);
+		if (newent->d_inode)
+			fuse_invalidate_entry(newent);
+	}
+
+	return err;
+}
+
+static int fuse_link(struct dentry *entry, struct inode *newdir,
+		     struct dentry *newent)
+{
+	int err;
+	struct fuse_link_in inarg;
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.oldnodeid = get_node_id(inode);
+	req->in.h.opcode = FUSE_LINK;
+	req->inode2 = inode;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = newent->d_name.len + 1;
+	req->in.args[1].value = newent->d_name.name;
+	err = create_new_entry(fc, req, newdir, newent, inode->i_mode);
+	/* Contrary to "normal" filesystems it can happen that link
+	   makes two "logical" inodes point to the same "physical"
+	   inode.  We invalidate the attributes of the old one, so it
+	   will reflect changes in the backing inode (link count,
+	   etc.)
+	*/
+	if (!err || err == -EINTR)
+		fuse_invalidate_attr(inode);
+	return err;
+}
+
 int fuse_do_getattr(struct inode *inode)
 {
 	int err;
@@ -341,6 +600,91 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static unsigned iattr_to_fattr(struct iattr *iattr, struct fuse_attr *fattr)
+{
+	unsigned ivalid = iattr->ia_valid;
+	unsigned fvalid = 0;
+
+	memset(fattr, 0, sizeof(*fattr));
+
+	if (ivalid & ATTR_MODE)
+		fvalid |= FATTR_MODE,   fattr->mode = iattr->ia_mode;
+	if (ivalid & ATTR_UID)
+		fvalid |= FATTR_UID,    fattr->uid = iattr->ia_uid;
+	if (ivalid & ATTR_GID)
+		fvalid |= FATTR_GID,    fattr->gid = iattr->ia_gid;
+	if (ivalid & ATTR_SIZE)
+		fvalid |= FATTR_SIZE,   fattr->size = iattr->ia_size;
+	/* You can only _set_ these together (they may change by themselves) */
+	if ((ivalid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) {
+		fvalid |= FATTR_ATIME | FATTR_MTIME;
+		fattr->atime = iattr->ia_atime.tv_sec;
+		fattr->mtime = iattr->ia_mtime.tv_sec;
+	}
+
+	return fvalid;
+}
+
+static int fuse_setattr(struct dentry *entry, struct iattr *attr)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	struct fuse_setattr_in inarg;
+	struct fuse_attr_out outarg;
+	int err;
+	int is_truncate = 0;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		unsigned long limit;
+		is_truncate = 1;
+		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+	}
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.valid = iattr_to_fattr(attr, &inarg.attr);
+	req->in.h.opcode = FUSE_SETATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+			make_bad_inode(inode);
+			err = -EIO;
+		} else {
+			if (is_truncate) {
+				loff_t origsize = i_size_read(inode);
+				i_size_write(inode, outarg.attr.size);
+				if (origsize > outarg.attr.size)
+					vmtruncate(inode, outarg.attr.size);
+			}
+			fuse_change_attributes(inode, &outarg.attr);
+			fi->i_time = time_to_jiffies(outarg.attr_valid,
+						     outarg.attr_valid_nsec);
+		}
+	} else if (err == -EINTR)
+		fuse_invalidate_attr(inode);
+
+	return err;
+}
+
 static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
 			struct kstat *stat)
 {
@@ -373,6 +717,15 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 
 static struct inode_operations fuse_dir_inode_operations = {
 	.lookup		= fuse_lookup,
+	.mkdir		= fuse_mkdir,
+	.symlink	= fuse_symlink,
+	.unlink		= fuse_unlink,
+	.rmdir		= fuse_rmdir,
+	.rename		= fuse_rename,
+	.link		= fuse_link,
+	.setattr	= fuse_setattr,
+	.create		= fuse_create,
+	.mknod		= fuse_mknod,
 	.permission	= fuse_permission,
 	.getattr	= fuse_getattr,
 };
@@ -385,11 +738,13 @@ static struct file_operations fuse_dir_operations = {
 };
 
 static struct inode_operations fuse_common_inode_operations = {
+	.setattr	= fuse_setattr,
 	.permission	= fuse_permission,
 	.getattr	= fuse_getattr,
 };
 
 static struct inode_operations fuse_symlink_inode_operations = {
+	.setattr	= fuse_setattr,
 	.follow_link	= fuse_follow_link,
 	.put_link	= fuse_put_link,
 	.readlink	= generic_readlink,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8d91e1492f9..87d25b8f2dc 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -30,6 +30,9 @@ struct fuse_inode {
 	 * and kernel */
 	u64 nodeid;
 
+	/** Number of lookups on this inode */
+	u64 nlookup;
+
 	/** The request used for sending the FORGET message */
 	struct fuse_req *forget_req;
 
@@ -252,13 +255,13 @@ extern spinlock_t fuse_lock;
  * Get a filled in inode
  */
 struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
-			int generation, struct fuse_attr *attr, int version);
+			int generation, struct fuse_attr *attr);
 
 /**
  * Send FORGET command
  */
 void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
-		      unsigned long nodeid, int version);
+		      unsigned long nodeid, u64 nlookup);
 
 /**
  * Initialise inode operations on regular files and special files
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 41498a1952a..fa03f80806e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -51,6 +51,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi = get_fuse_inode(inode);
 	fi->i_time = jiffies - 1;
 	fi->nodeid = 0;
+	fi->nlookup = 0;
 	fi->forget_req = fuse_request_alloc();
 	if (!fi->forget_req) {
 		kmem_cache_free(fuse_inode_cachep, inode);
@@ -74,10 +75,10 @@ static void fuse_read_inode(struct inode *inode)
 }
 
 void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
-		      unsigned long nodeid, int version)
+		      unsigned long nodeid, u64 nlookup)
 {
 	struct fuse_forget_in *inarg = &req->misc.forget_in;
-	inarg->version = version;
+	inarg->nlookup = nlookup;
 	req->in.h.opcode = FUSE_FORGET;
 	req->in.h.nodeid = nodeid;
 	req->in.numargs = 1;
@@ -91,7 +92,7 @@ static void fuse_clear_inode(struct inode *inode)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	if (fc) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
-		fuse_send_forget(fc, fi->forget_req, fi->nodeid, inode->i_version);
+		fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup);
 		fi->forget_req = NULL;
 	}
 }
@@ -156,9 +157,10 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp)
 }
 
 struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
-			int generation, struct fuse_attr *attr, int version)
+			int generation, struct fuse_attr *attr)
 {
 	struct inode *inode;
+	struct fuse_inode *fi;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 	int retried = 0;
 
@@ -181,8 +183,9 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 		goto retry;
 	}
 
+	fi = get_fuse_inode(inode);
+	fi->nlookup ++;
 	fuse_change_attributes(inode, attr);
-	inode->i_version = version;
 	return inode;
 }
 
@@ -389,7 +392,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 
 	attr.mode = mode;
 	attr.ino = FUSE_ROOT_ID;
-	return fuse_iget(sb, 1, 0, &attr, 0);
+	return fuse_iget(sb, 1, 0, &attr);
 }
 
 static struct super_operations fuse_super_operations = {
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 21b9ba16f8d..19d69a3e162 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -11,7 +11,7 @@
 #include <asm/types.h>
 
 /** Version number of this interface */
-#define FUSE_KERNEL_VERSION 6
+#define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
 #define FUSE_KERNEL_MINOR_VERSION 1
@@ -52,12 +52,28 @@ struct fuse_kstatfs {
 	__u32	namelen;
 };
 
+#define FATTR_MODE	(1 << 0)
+#define FATTR_UID	(1 << 1)
+#define FATTR_GID	(1 << 2)
+#define FATTR_SIZE	(1 << 3)
+#define FATTR_ATIME	(1 << 4)
+#define FATTR_MTIME	(1 << 5)
+#define FATTR_CTIME	(1 << 6)
+
 enum fuse_opcode {
 	FUSE_LOOKUP	   = 1,
 	FUSE_FORGET	   = 2,  /* no reply */
 	FUSE_GETATTR	   = 3,
+	FUSE_SETATTR	   = 4,
 	FUSE_READLINK	   = 5,
+	FUSE_SYMLINK	   = 6,
 	FUSE_GETDIR	   = 7,
+	FUSE_MKNOD	   = 8,
+	FUSE_MKDIR	   = 9,
+	FUSE_UNLINK	   = 10,
+	FUSE_RMDIR	   = 11,
+	FUSE_RENAME	   = 12,
+	FUSE_LINK	   = 13,
 	FUSE_STATFS	   = 17,
 	FUSE_INIT          = 26
 };
@@ -66,6 +82,7 @@ enum fuse_opcode {
 #define FUSE_MAX_IN 8192
 
 #define FUSE_NAME_MAX 1024
+#define FUSE_SYMLINK_MAX 4096
 
 struct fuse_entry_out {
 	__u64	nodeid;		/* Inode ID */
@@ -79,7 +96,7 @@ struct fuse_entry_out {
 };
 
 struct fuse_forget_in {
-	__u64	version;
+	__u64	nlookup;
 };
 
 struct fuse_attr_out {
@@ -93,6 +110,28 @@ struct fuse_getdir_out {
 	__u32	fd;
 };
 
+struct fuse_mknod_in {
+	__u32	mode;
+	__u32	rdev;
+};
+
+struct fuse_mkdir_in {
+	__u32	mode;
+};
+
+struct fuse_rename_in {
+	__u64	newdir;
+};
+
+struct fuse_link_in {
+	__u64	oldnodeid;
+};
+
+struct fuse_setattr_in {
+	__u32	valid;
+	struct fuse_attr attr;
+};
+
 struct fuse_statfs_out {
 	struct fuse_kstatfs st;
 };
-- 
cgit v1.2.3-70-g09d2


From b6aeadeda22a9aa322fdfcd3f4c69ccf0da5cbdd Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:30 -0700
Subject: [PATCH] FUSE - file operations

This patch adds the file operations of FUSE.

The following operations are added:

 o open
 o flush
 o release
 o fsync
 o readpage
 o commit_write

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/Makefile     |   2 +-
 fs/fuse/dir.c        |   1 +
 fs/fuse/file.c       | 341 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h     |  21 ++++
 fs/fuse/inode.c      |   2 +
 include/linux/fuse.h |  47 +++++++
 6 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 fs/fuse/file.c

(limited to 'fs')

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index c34e268a0ed..c3e1f760cac 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
 
-fuse-objs := dev.o dir.o inode.o
+fuse-objs := dev.o dir.o file.o inode.o
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 92c7188ccd1..8adc1eed164 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -731,6 +731,7 @@ static struct inode_operations fuse_dir_inode_operations = {
 };
 
 static struct file_operations fuse_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= fuse_readdir,
 	.open		= fuse_dir_open,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
new file mode 100644
index 00000000000..de8c9c70246
--- /dev/null
+++ b/fs/fuse/file.c
@@ -0,0 +1,341 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+
+static int fuse_open(struct inode *inode, struct file *file)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_open_in inarg;
+	struct fuse_open_out outarg;
+	struct fuse_file *ff;
+	int err;
+	/* Restarting the syscall is not allowed if O_CREAT and O_EXCL
+	   are both set, because creation will fail on the restart */
+	int excl = (file->f_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL);
+
+	err = generic_file_open(inode, file);
+	if (err)
+		return err;
+
+	/* If opening the root node, no lookup has been performed on
+	   it, so the attributes must be refreshed */
+	if (get_node_id(inode) == FUSE_ROOT_ID) {
+		int err = fuse_do_getattr(inode);
+		if (err)
+		 	return err;
+	}
+
+	if (excl)
+		req = fuse_get_request_nonint(fc);
+	else
+		req = fuse_get_request(fc);
+	if (!req)
+		return excl ? -EINTR : -ERESTARTSYS;
+
+	err = -ENOMEM;
+	ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
+	if (!ff)
+		goto out_put_request;
+
+	ff->release_req = fuse_request_alloc();
+	if (!ff->release_req) {
+		kfree(ff);
+		goto out_put_request;
+	}
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+	req->in.h.opcode = FUSE_OPEN;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	if (excl)
+		request_send_nonint(fc, req);
+	else
+		request_send(fc, req);
+	err = req->out.h.error;
+	if (!err)
+		invalidate_inode_pages(inode->i_mapping);
+	if (err) {
+		fuse_request_free(ff->release_req);
+		kfree(ff);
+	} else {
+		ff->fh = outarg.fh;
+		file->private_data = ff;
+	}
+
+ out_put_request:
+	fuse_put_request(fc, req);
+	return err;
+}
+
+static int fuse_release(struct inode *inode, struct file *file)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req = ff->release_req;
+	struct fuse_release_in *inarg = &req->misc.release_in;
+
+	inarg->fh = ff->fh;
+	inarg->flags = file->f_flags & ~O_EXCL;
+	req->in.h.opcode = FUSE_RELEASE;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct fuse_release_in);
+	req->in.args[0].value = inarg;
+	request_send_background(fc, req);
+	kfree(ff);
+
+	/* Return value is ignored by VFS */
+	return 0;
+}
+
+static int fuse_flush(struct file *file)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req;
+	struct fuse_flush_in inarg;
+	int err;
+
+	if (fc->no_flush)
+		return 0;
+
+	req = fuse_get_request_nonint(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+	req->in.h.opcode = FUSE_FLUSH;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->file = file;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	request_send_nonint(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		fc->no_flush = 1;
+		err = 0;
+	}
+	return err;
+}
+
+static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+{
+	struct inode *inode = de->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req;
+	struct fuse_fsync_in inarg;
+	int err;
+
+	if (fc->no_fsync)
+		return 0;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTSYS;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+	inarg.fsync_flags = datasync ? 1 : 0;
+	req->in.h.opcode = FUSE_FSYNC;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->file = file;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		fc->no_fsync = 1;
+		err = 0;
+	}
+	return err;
+}
+
+static ssize_t fuse_send_read(struct fuse_req *req, struct file *file,
+			      struct inode *inode, loff_t pos,  size_t count)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_read_in inarg;
+
+	memset(&inarg, 0, sizeof(struct fuse_read_in));
+	inarg.fh = ff->fh;
+	inarg.offset = pos;
+	inarg.size = count;
+	req->in.h.opcode = FUSE_READ;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->file = file;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct fuse_read_in);
+	req->in.args[0].value = &inarg;
+	req->out.argpages = 1;
+	req->out.argvar = 1;
+	req->out.numargs = 1;
+	req->out.args[0].size = count;
+	request_send_nonint(fc, req);
+	return req->out.args[0].size;
+}
+
+static int fuse_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT;
+	struct fuse_req *req = fuse_get_request_nonint(fc);
+	int err = -EINTR;
+	if (!req)
+		goto out;
+
+	req->out.page_zeroing = 1;
+	req->num_pages = 1;
+	req->pages[0] = page;
+	fuse_send_read(req, file, inode, pos, PAGE_CACHE_SIZE);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err)
+		SetPageUptodate(page);
+ out:
+	unlock_page(page);
+	return err;
+}
+
+static ssize_t fuse_send_write(struct fuse_req *req, struct file *file,
+			       struct inode *inode, loff_t pos, size_t count)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_write_in inarg;
+	struct fuse_write_out outarg;
+
+	memset(&inarg, 0, sizeof(struct fuse_write_in));
+	inarg.fh = ff->fh;
+	inarg.offset = pos;
+	inarg.size = count;
+	req->in.h.opcode = FUSE_WRITE;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->file = file;
+	req->in.argpages = 1;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(struct fuse_write_in);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = count;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(struct fuse_write_out);
+	req->out.args[0].value = &outarg;
+	request_send_nonint(fc, req);
+	return outarg.size;
+}
+
+static int fuse_prepare_write(struct file *file, struct page *page,
+			      unsigned offset, unsigned to)
+{
+	/* No op */
+	return 0;
+}
+
+static int fuse_commit_write(struct file *file, struct page *page,
+			     unsigned offset, unsigned to)
+{
+	int err;
+	ssize_t nres;
+	unsigned count = to - offset;
+	struct inode *inode = page->mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset;
+	struct fuse_req *req = fuse_get_request_nonint(fc);
+	if (!req)
+		return -EINTR;
+
+	req->num_pages = 1;
+	req->pages[0] = page;
+	req->page_offset = offset;
+	nres = fuse_send_write(req, file, inode, pos, count);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err && nres != count)
+		err = -EIO;
+	if (!err) {
+		pos += count;
+		if (pos > i_size_read(inode))
+			i_size_write(inode, pos);
+
+		if (offset == 0 && to == PAGE_CACHE_SIZE) {
+			clear_page_dirty(page);
+			SetPageUptodate(page);
+		}
+	} else if (err == -EINTR || err == -EIO)
+		fuse_invalidate_attr(inode);
+	return err;
+}
+
+static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if ((vma->vm_flags & VM_SHARED)) {
+		if ((vma->vm_flags & VM_WRITE))
+			return -ENODEV;
+		else
+			vma->vm_flags &= ~VM_MAYWRITE;
+	}
+	return generic_file_mmap(file, vma);
+}
+
+static int fuse_set_page_dirty(struct page *page)
+{
+	printk("fuse_set_page_dirty: should not happen\n");
+	dump_stack();
+	return 0;
+}
+
+static struct file_operations fuse_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+	.mmap		= fuse_file_mmap,
+	.open		= fuse_open,
+	.flush		= fuse_flush,
+	.release	= fuse_release,
+	.fsync		= fuse_fsync,
+	.sendfile	= generic_file_sendfile,
+};
+
+static struct address_space_operations fuse_file_aops  = {
+	.readpage	= fuse_readpage,
+	.prepare_write	= fuse_prepare_write,
+	.commit_write	= fuse_commit_write,
+	.set_page_dirty	= fuse_set_page_dirty,
+};
+
+void fuse_init_file_inode(struct inode *inode)
+{
+	inode->i_fop = &fuse_file_operations;
+	inode->i_data.a_ops = &fuse_file_aops;
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 87d25b8f2dc..b4aa8f7bc2c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -40,6 +40,15 @@ struct fuse_inode {
 	unsigned long i_time;
 };
 
+/** FUSE specific file data */
+struct fuse_file {
+	/** Request reserved for flush and release */
+	struct fuse_req *release_req;
+
+	/** File handle used by userspace */
+	u64 fh;
+};
+
 /** One input argument of a request */
 struct fuse_in_arg {
 	unsigned size;
@@ -136,6 +145,7 @@ struct fuse_req {
 	/** Data for asynchronous requests */
 	union {
 		struct fuse_forget_in forget_in;
+		struct fuse_release_in release_in;
 		struct fuse_init_in_out init_in_out;
 	} misc;
 
@@ -200,6 +210,12 @@ struct fuse_conn {
 	/** Connection failed (version mismatch) */
 	unsigned conn_error : 1;
 
+	/** Is fsync not implemented by fs? */
+	unsigned no_fsync : 1;
+
+	/** Is flush not implemented by fs? */
+	unsigned no_flush : 1;
+
 	/** Backing dev info */
 	struct backing_dev_info bdi;
 };
@@ -263,6 +279,11 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 		      unsigned long nodeid, u64 nlookup);
 
+/**
+ * Initialise file operations on a regular file
+ */
+void fuse_init_file_inode(struct inode *inode);
+
 /**
  * Initialise inode operations on regular files and special files
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fa03f80806e..f229d696264 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -124,6 +124,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 	i_size_write(inode, attr->size);
 	if (S_ISREG(inode->i_mode)) {
 		fuse_init_common(inode);
+		fuse_init_file_inode(inode);
 	} else if (S_ISDIR(inode->i_mode))
 		fuse_init_dir(inode);
 	else if (S_ISLNK(inode->i_mode))
@@ -137,6 +138,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 		/* Don't let user create weird files */
 		inode->i_mode = S_IFREG;
 		fuse_init_common(inode);
+		fuse_init_file_inode(inode);
 	}
 }
 
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 19d69a3e162..61f34636ffb 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -74,7 +74,13 @@ enum fuse_opcode {
 	FUSE_RMDIR	   = 11,
 	FUSE_RENAME	   = 12,
 	FUSE_LINK	   = 13,
+	FUSE_OPEN	   = 14,
+	FUSE_READ	   = 15,
+	FUSE_WRITE	   = 16,
 	FUSE_STATFS	   = 17,
+	FUSE_RELEASE       = 18,
+	FUSE_FSYNC         = 20,
+	FUSE_FLUSH         = 25,
 	FUSE_INIT          = 26
 };
 
@@ -132,10 +138,51 @@ struct fuse_setattr_in {
 	struct fuse_attr attr;
 };
 
+struct fuse_open_in {
+	__u32	flags;
+};
+
+struct fuse_open_out {
+	__u64	fh;
+	__u32	open_flags;
+};
+
+struct fuse_release_in {
+	__u64	fh;
+	__u32	flags;
+};
+
+struct fuse_flush_in {
+	__u64	fh;
+	__u32	flush_flags;
+};
+
+struct fuse_read_in {
+	__u64	fh;
+	__u64	offset;
+	__u32	size;
+};
+
+struct fuse_write_in {
+	__u64	fh;
+	__u64	offset;
+	__u32	size;
+	__u32	write_flags;
+};
+
+struct fuse_write_out {
+	__u32	size;
+};
+
 struct fuse_statfs_out {
 	struct fuse_kstatfs st;
 };
 
+struct fuse_fsync_in {
+	__u64	fh;
+	__u32	fsync_flags;
+};
+
 struct fuse_init_in_out {
 	__u32	major;
 	__u32	minor;
-- 
cgit v1.2.3-70-g09d2


From 1e9a4ed9396e9c31139721b639550ffb1df17065 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:31 -0700
Subject: [PATCH] FUSE - mount options

This patch adds miscellaneous mount options to the FUSE filesystem.

The following mount options are added:

 o default_permissions:  check permissions with generic_permission()
 o allow_other:          allow other users to access files
 o allow_root:           allow root to access files
 o kernel_cache:         don't invalidate page cache on open

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c    | 78 ++++++++++++++++++++++++++++++++++++++++++--------------
 fs/fuse/dir.c    | 35 ++++++++++++++++++++++---
 fs/fuse/file.c   |  2 +-
 fs/fuse/fuse_i.h | 45 ++++++++++++++++++++++++++++----
 fs/fuse/inode.c  | 77 +++++++++++++++++++++++++++----------------------------
 5 files changed, 170 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e8f3170946f..ca6fc0e96d7 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -26,7 +26,7 @@ static inline struct fuse_conn *fuse_get_conn(struct file *file)
 	struct fuse_conn *fc;
 	spin_lock(&fuse_lock);
 	fc = file->private_data;
-	if (fc && !fc->sb)
+	if (fc && !fc->mounted)
 		fc = NULL;
 	spin_unlock(&fuse_lock);
 	return fc;
@@ -148,6 +148,17 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 		fuse_putback_request(fc, req);
 }
 
+void fuse_release_background(struct fuse_req *req)
+{
+	iput(req->inode);
+	iput(req->inode2);
+	if (req->file)
+		fput(req->file);
+	spin_lock(&fuse_lock);
+	list_del(&req->bg_entry);
+	spin_unlock(&fuse_lock);
+}
+
 /*
  * This function is called when a request is finished.  Either a reply
  * has arrived or it was interrupted (and not yet sent) or some error
@@ -166,12 +177,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 	putback = atomic_dec_and_test(&req->count);
 	spin_unlock(&fuse_lock);
 	if (req->background) {
-		if (req->inode)
-			iput(req->inode);
-		if (req->inode2)
-			iput(req->inode2);
-		if (req->file)
-			fput(req->file);
+		down_read(&fc->sbput_sem);
+		if (fc->mounted)
+			fuse_release_background(req);
+		up_read(&fc->sbput_sem);
 	}
 	wake_up(&req->waitq);
 	if (req->in.h.opcode == FUSE_INIT) {
@@ -191,11 +200,39 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		fuse_putback_request(fc, req);
 }
 
-static void background_request(struct fuse_req *req)
+/*
+ * Unfortunately request interruption not just solves the deadlock
+ * problem, it causes problems too.  These stem from the fact, that an
+ * interrupted request is continued to be processed in userspace,
+ * while all the locks and object references (inode and file) held
+ * during the operation are released.
+ *
+ * To release the locks is exactly why there's a need to interrupt the
+ * request, so there's not a lot that can be done about this, except
+ * introduce additional locking in userspace.
+ *
+ * More important is to keep inode and file references until userspace
+ * has replied, otherwise FORGET and RELEASE could be sent while the
+ * inode/file is still used by the filesystem.
+ *
+ * For this reason the concept of "background" request is introduced.
+ * An interrupted request is backgrounded if it has been already sent
+ * to userspace.  Backgrounding involves getting an extra reference to
+ * inode(s) or file used in the request, and adding the request to
+ * fc->background list.  When a reply is received for a background
+ * request, the object references are released, and the request is
+ * removed from the list.  If the filesystem is unmounted while there
+ * are still background requests, the list is walked and references
+ * are released as if a reply was received.
+ *
+ * There's one more use for a background request.  The RELEASE message is
+ * always sent as background, since it doesn't return an error or
+ * data.
+ */
+static void background_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-	/* Need to get hold of the inode(s) and/or file used in the
-	   request, so FORGET and RELEASE are not sent too early */
 	req->background = 1;
+	list_add(&req->bg_entry, &fc->background);
 	if (req->inode)
 		req->inode = igrab(req->inode);
 	if (req->inode2)
@@ -215,7 +252,8 @@ static int request_wait_answer_nonint(struct fuse_req *req)
 }
 
 /* Called with fuse_lock held.  Releases, and then reacquires it. */
-static void request_wait_answer(struct fuse_req *req, int interruptible)
+static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req,
+				int interruptible)
 {
 	int intr;
 
@@ -255,7 +293,7 @@ static void request_wait_answer(struct fuse_req *req, int interruptible)
 		list_del(&req->list);
 		__fuse_put_request(req);
 	} else if (!req->finished && req->sent)
-		background_request(req);
+		background_request(fc, req);
 }
 
 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
@@ -297,7 +335,7 @@ static void request_send_wait(struct fuse_conn *fc, struct fuse_req *req,
 {
 	req->isreply = 1;
 	spin_lock(&fuse_lock);
-	if (!fc->file)
+	if (!fc->connected)
 		req->out.h.error = -ENOTCONN;
 	else if (fc->conn_error)
 		req->out.h.error = -ECONNREFUSED;
@@ -307,7 +345,7 @@ static void request_send_wait(struct fuse_conn *fc, struct fuse_req *req,
 		   after request_end() */
 		__fuse_get_request(req);
 
-		request_wait_answer(req, interruptible);
+		request_wait_answer(fc, req, interruptible);
 	}
 	spin_unlock(&fuse_lock);
 }
@@ -330,7 +368,7 @@ void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req)
 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
 	spin_lock(&fuse_lock);
-	if (fc->file) {
+	if (fc->connected) {
 		queue_request(fc, req);
 		spin_unlock(&fuse_lock);
 	} else {
@@ -348,7 +386,9 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
-	background_request(req);
+	spin_lock(&fuse_lock);
+	background_request(fc, req);
+	spin_unlock(&fuse_lock);
 	request_send_nowait(fc, req);
 }
 
@@ -583,7 +623,7 @@ static void request_wait(struct fuse_conn *fc)
 	DECLARE_WAITQUEUE(wait, current);
 
 	add_wait_queue_exclusive(&fc->waitq, &wait);
-	while (fc->sb && list_empty(&fc->pending)) {
+	while (fc->mounted && list_empty(&fc->pending)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (signal_pending(current))
 			break;
@@ -622,7 +662,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 		goto err_unlock;
 	request_wait(fc);
 	err = -ENODEV;
-	if (!fc->sb)
+	if (!fc->mounted)
 		goto err_unlock;
 	err = -ERESTARTSYS;
 	if (list_empty(&fc->pending))
@@ -839,7 +879,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 	spin_lock(&fuse_lock);
 	fc = file->private_data;
 	if (fc) {
-		fc->file = NULL;
+		fc->connected = 0;
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
 		fuse_release_conn(fc);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8adc1eed164..0950455914d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -418,7 +418,8 @@ static int fuse_revalidate(struct dentry *entry)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	if (get_node_id(inode) == FUSE_ROOT_ID) {
-		if (current->fsuid != fc->user_id)
+		if (!(fc->flags & FUSE_ALLOW_OTHER) &&
+		    current->fsuid != fc->user_id)
 			return -EACCES;
 	} else if (time_before_eq(jiffies, fi->i_time))
 		return 0;
@@ -430,9 +431,31 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (current->fsuid != fc->user_id)
+	if (!(fc->flags & FUSE_ALLOW_OTHER) && current->fsuid != fc->user_id)
 		return -EACCES;
-	else {
+	else if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+		int err = generic_permission(inode, mask, NULL);
+
+		/* If permission is denied, try to refresh file
+		   attributes.  This is also needed, because the root
+		   node will at first have no permissions */
+		if (err == -EACCES) {
+		 	err = fuse_do_getattr(inode);
+			if (!err)
+				err = generic_permission(inode, mask, NULL);
+		}
+
+		/* FIXME: Need some mechanism to revoke permissions:
+		   currently if the filesystem suddenly changes the
+		   file mode, we will not be informed about it, and
+		   continue to allow access to the file/directory.
+
+		   This is actually not so grave, since the user can
+		   simply keep access to the file/directory anyway by
+		   keeping it open... */
+
+		return err;
+	} else {
 		int mode = inode->i_mode;
 		if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
                     (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
@@ -636,6 +659,12 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 	int err;
 	int is_truncate = 0;
 
+	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+		err = inode_change_ok(inode, attr);
+		if (err)
+			return err;
+	}
+
 	if (attr->ia_valid & ATTR_SIZE) {
 		unsigned long limit;
 		is_truncate = 1;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index de8c9c70246..96ea302db18 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -70,7 +70,7 @@ static int fuse_open(struct inode *inode, struct file *file)
 	else
 		request_send(fc, req);
 	err = req->out.h.error;
-	if (!err)
+	if (!err && !(fc->flags & FUSE_KERNEL_CACHE))
 		invalidate_inode_pages(inode->i_mapping);
 	if (err) {
 		fuse_request_free(ff->release_req);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b4aa8f7bc2c..c8e6c87496e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,19 @@
 /** If more requests are outstanding, then the operation will block */
 #define FUSE_MAX_OUTSTANDING 10
 
+/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
+    module will check permissions based on the file mode.  Otherwise no
+    permission checking is done in the kernel */
+#define FUSE_DEFAULT_PERMISSIONS (1 << 0)
+
+/** If the FUSE_ALLOW_OTHER flag is given, then not only the user
+    doing the mount will be allowed to access the filesystem */
+#define FUSE_ALLOW_OTHER         (1 << 1)
+
+/** If the FUSE_KERNEL_CACHE flag is given, then cached data will not
+    be flushed on open */
+#define FUSE_KERNEL_CACHE        (1 << 2)
+
 /** FUSE inode */
 struct fuse_inode {
 	/** Inode data */
@@ -109,6 +122,9 @@ struct fuse_req {
 	    lists in fuse_conn */
 	struct list_head list;
 
+	/** Entry on the background list */
+	struct list_head bg_entry;
+
 	/** refcount */
 	atomic_t count;
 
@@ -176,15 +192,15 @@ struct fuse_req {
  * unmounted.
  */
 struct fuse_conn {
-	/** The superblock of the mounted filesystem */
-	struct super_block *sb;
-
-	/** The opened client device */
-	struct file *file;
+	/** Reference count */
+	int count;
 
 	/** The user id for this mount */
 	uid_t user_id;
 
+	/** The fuse mount flags for this mount */
+	unsigned flags;
+
 	/** Readers of the connection are waiting on this */
 	wait_queue_head_t waitq;
 
@@ -194,6 +210,10 @@ struct fuse_conn {
 	/** The list of requests being processed */
 	struct list_head processing;
 
+	/** Requests put in the background (RELEASE or any other
+	    interrupted request) */
+	struct list_head background;
+
 	/** Controls the maximum number of outstanding requests */
 	struct semaphore outstanding_sem;
 
@@ -201,12 +221,21 @@ struct fuse_conn {
 	    outstanding_sem would go negative */
 	unsigned outstanding_debt;
 
+	/** RW semaphore for exclusion with fuse_put_super() */
+	struct rw_semaphore sbput_sem;
+
 	/** The list of unused requests */
 	struct list_head unused_list;
 
 	/** The next unique request id */
 	u64 reqctr;
 
+	/** Mount is active */
+	unsigned mounted : 1;
+
+	/** Connection established */
+	unsigned connected : 1;
+
 	/** Connection failed (version mismatch) */
 	unsigned conn_error : 1;
 
@@ -261,6 +290,7 @@ extern struct file_operations fuse_dev_operations;
  *  - the private_data field of the device file
  *  - the s_fs_info field of the super block
  *  - unused_list, pending, processing lists in fuse_conn
+ *  - background list in fuse_conn
  *  - the unique request ID counter reqctr in fuse_conn
  *  - the sb (super_block) field in fuse_conn
  *  - the file (device file) field in fuse_conn
@@ -371,6 +401,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
  */
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
+/**
+ * Release inodes and file assiciated with background request
+ */
+void fuse_release_background(struct fuse_req *req);
+
 /**
  * Get the attributes of a file
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f229d696264..458c62ca0fe 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -15,7 +15,6 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/moduleparam.h>
 #include <linux/parser.h>
 #include <linux/statfs.h>
 
@@ -25,11 +24,6 @@ MODULE_LICENSE("GPL");
 
 spinlock_t fuse_lock;
 static kmem_cache_t *fuse_inode_cachep;
-static int mount_count;
-
-static int mount_max = 1000;
-module_param(mount_max, int, 0644);
-MODULE_PARM_DESC(mount_max, "Maximum number of FUSE mounts allowed, if -1 then unlimited (default: 1000)");
 
 #define FUSE_SUPER_MAGIC 0x65735546
 
@@ -37,6 +31,7 @@ struct fuse_mount_data {
 	int fd;
 	unsigned rootmode;
 	unsigned user_id;
+	unsigned flags;
 };
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
@@ -89,8 +84,8 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 
 static void fuse_clear_inode(struct inode *inode)
 {
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	if (fc) {
+	if (inode->i_sb->s_flags & MS_ACTIVE) {
+		struct fuse_conn *fc = get_fuse_conn(inode);
 		struct fuse_inode *fi = get_fuse_inode(inode);
 		fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup);
 		fi->forget_req = NULL;
@@ -195,14 +190,19 @@ static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
+	down_write(&fc->sbput_sem);
+	while (!list_empty(&fc->background))
+		fuse_release_background(list_entry(fc->background.next,
+						   struct fuse_req, bg_entry));
+
 	spin_lock(&fuse_lock);
-	mount_count --;
-	fc->sb = NULL;
+	fc->mounted = 0;
 	fc->user_id = 0;
+	fc->flags = 0;
 	/* Flush all readers on this fs */
 	wake_up_all(&fc->waitq);
+	up_write(&fc->sbput_sem);
 	fuse_release_conn(fc);
-	*get_fuse_conn_super_p(sb) = NULL;
 	spin_unlock(&fuse_lock);
 }
 
@@ -249,7 +249,6 @@ enum {
 	OPT_USER_ID,
 	OPT_DEFAULT_PERMISSIONS,
 	OPT_ALLOW_OTHER,
-	OPT_ALLOW_ROOT,
 	OPT_KERNEL_CACHE,
 	OPT_ERR
 };
@@ -260,7 +259,6 @@ static match_table_t tokens = {
 	{OPT_USER_ID,			"user_id=%u"},
 	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
 	{OPT_ALLOW_OTHER,		"allow_other"},
-	{OPT_ALLOW_ROOT,		"allow_root"},
 	{OPT_KERNEL_CACHE,		"kernel_cache"},
 	{OPT_ERR,			NULL}
 };
@@ -298,6 +296,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 			d->user_id = value;
 			break;
 
+		case OPT_DEFAULT_PERMISSIONS:
+			d->flags |= FUSE_DEFAULT_PERMISSIONS;
+			break;
+
+		case OPT_ALLOW_OTHER:
+			d->flags |= FUSE_ALLOW_OTHER;
+			break;
+
+		case OPT_KERNEL_CACHE:
+			d->flags |= FUSE_KERNEL_CACHE;
+			break;
+
 		default:
 			return 0;
 		}
@@ -313,6 +323,12 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 	struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb);
 
 	seq_printf(m, ",user_id=%u", fc->user_id);
+	if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
+		seq_puts(m, ",default_permissions");
+	if (fc->flags & FUSE_ALLOW_OTHER)
+		seq_puts(m, ",allow_other");
+	if (fc->flags & FUSE_KERNEL_CACHE)
+		seq_puts(m, ",kernel_cache");
 	return 0;
 }
 
@@ -330,7 +346,8 @@ static void free_conn(struct fuse_conn *fc)
 /* Must be called with the fuse lock held */
 void fuse_release_conn(struct fuse_conn *fc)
 {
-	if (!fc->sb && !fc->file)
+	fc->count--;
+	if (!fc->count)
 		free_conn(fc);
 }
 
@@ -342,14 +359,13 @@ static struct fuse_conn *new_conn(void)
 	if (fc != NULL) {
 		int i;
 		memset(fc, 0, sizeof(*fc));
-		fc->sb = NULL;
-		fc->file = NULL;
-		fc->user_id = 0;
 		init_waitqueue_head(&fc->waitq);
 		INIT_LIST_HEAD(&fc->pending);
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->unused_list);
+		INIT_LIST_HEAD(&fc->background);
 		sema_init(&fc->outstanding_sem, 0);
+		init_rwsem(&fc->sbput_sem);
 		for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) {
 			struct fuse_req *req = fuse_request_alloc();
 			if (!req) {
@@ -380,8 +396,10 @@ static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
 		fc = ERR_PTR(-EINVAL);
 	} else {
 		file->private_data = fc;
-		fc->sb = sb;
-		fc->file = file;
+		*get_fuse_conn_super_p(sb) = fc;
+		fc->mounted = 1;
+		fc->connected = 1;
+		fc->count = 2;
 	}
 	spin_unlock(&fuse_lock);
 	return fc;
@@ -407,17 +425,6 @@ static struct super_operations fuse_super_operations = {
 	.show_options	= fuse_show_options,
 };
 
-static int inc_mount_count(void)
-{
-	int success = 0;
-	spin_lock(&fuse_lock);
-	mount_count ++;
-	if (mount_max == -1 || mount_count <= mount_max)
-		success = 1;
-	spin_unlock(&fuse_lock);
-	return success;
-}
-
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct fuse_conn *fc;
@@ -444,14 +451,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (IS_ERR(fc))
 		return PTR_ERR(fc);
 
+	fc->flags = d.flags;
 	fc->user_id = d.user_id;
 
-	*get_fuse_conn_super_p(sb) = fc;
-
-	err = -ENFILE;
-	if (!inc_mount_count() && current->uid != 0)
-		goto err;
-
 	err = -ENOMEM;
 	root = get_root_inode(sb, d.rootmode);
 	if (root == NULL)
@@ -467,11 +469,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
  err:
 	spin_lock(&fuse_lock);
-	mount_count --;
-	fc->sb = NULL;
 	fuse_release_conn(fc);
 	spin_unlock(&fuse_lock);
-	*get_fuse_conn_super_p(sb) = NULL;
 	return err;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 92a8780e1136c5ca0c7ed940000d399943d1576e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:31 -0700
Subject: [PATCH] FUSE - extended attribute operations

This patch adds the extended attribute operations to FUSE.

The following operations are added:

 o getxattr
 o setxattr
 o listxattr
 o removexattr

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c        | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h     |  12 ++++
 include/linux/fuse.h |  18 +++++
 3 files changed, 213 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0950455914d..f127625543b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -744,6 +744,177 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	return d_splice_alias(inode, entry);
 }
 
+static int fuse_setxattr(struct dentry *entry, const char *name,
+			 const void *value, size_t size, int flags)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_setxattr_in inarg;
+	int err;
+
+	if (size > FUSE_XATTR_SIZE_MAX)
+		return -E2BIG;
+
+	if (fc->no_setxattr)
+		return -EOPNOTSUPP;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	inarg.flags = flags;
+	req->in.h.opcode = FUSE_SETXATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 3;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = strlen(name) + 1;
+	req->in.args[1].value = name;
+	req->in.args[2].size = size;
+	req->in.args[2].value = value;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		fc->no_setxattr = 1;
+		err = -EOPNOTSUPP;
+	}
+	return err;
+}
+
+static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
+			     void *value, size_t size)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_getxattr_in inarg;
+	struct fuse_getxattr_out outarg;
+	ssize_t ret;
+
+	if (fc->no_getxattr)
+		return -EOPNOTSUPP;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	req->in.h.opcode = FUSE_GETXATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = strlen(name) + 1;
+	req->in.args[1].value = name;
+	/* This is really two different operations rolled into one */
+	req->out.numargs = 1;
+	if (size) {
+		req->out.argvar = 1;
+		req->out.args[0].size = size;
+		req->out.args[0].value = value;
+	} else {
+		req->out.args[0].size = sizeof(outarg);
+		req->out.args[0].value = &outarg;
+	}
+	request_send(fc, req);
+	ret = req->out.h.error;
+	if (!ret)
+		ret = size ? req->out.args[0].size : outarg.size;
+	else {
+		if (ret == -ENOSYS) {
+			fc->no_getxattr = 1;
+			ret = -EOPNOTSUPP;
+		}
+	}
+	fuse_put_request(fc, req);
+	return ret;
+}
+
+static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_getxattr_in inarg;
+	struct fuse_getxattr_out outarg;
+	ssize_t ret;
+
+	if (fc->no_listxattr)
+		return -EOPNOTSUPP;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	req->in.h.opcode = FUSE_LISTXATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	/* This is really two different operations rolled into one */
+	req->out.numargs = 1;
+	if (size) {
+		req->out.argvar = 1;
+		req->out.args[0].size = size;
+		req->out.args[0].value = list;
+	} else {
+		req->out.args[0].size = sizeof(outarg);
+		req->out.args[0].value = &outarg;
+	}
+	request_send(fc, req);
+	ret = req->out.h.error;
+	if (!ret)
+		ret = size ? req->out.args[0].size : outarg.size;
+	else {
+		if (ret == -ENOSYS) {
+			fc->no_listxattr = 1;
+			ret = -EOPNOTSUPP;
+		}
+	}
+	fuse_put_request(fc, req);
+	return ret;
+}
+
+static int fuse_removexattr(struct dentry *entry, const char *name)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	int err;
+
+	if (fc->no_removexattr)
+		return -EOPNOTSUPP;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTNOINTR;
+
+	req->in.h.opcode = FUSE_REMOVEXATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = strlen(name) + 1;
+	req->in.args[0].value = name;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		fc->no_removexattr = 1;
+		err = -EOPNOTSUPP;
+	}
+	return err;
+}
+
 static struct inode_operations fuse_dir_inode_operations = {
 	.lookup		= fuse_lookup,
 	.mkdir		= fuse_mkdir,
@@ -757,6 +928,10 @@ static struct inode_operations fuse_dir_inode_operations = {
 	.mknod		= fuse_mknod,
 	.permission	= fuse_permission,
 	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
 };
 
 static struct file_operations fuse_dir_operations = {
@@ -771,6 +946,10 @@ static struct inode_operations fuse_common_inode_operations = {
 	.setattr	= fuse_setattr,
 	.permission	= fuse_permission,
 	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
 };
 
 static struct inode_operations fuse_symlink_inode_operations = {
@@ -779,6 +958,10 @@ static struct inode_operations fuse_symlink_inode_operations = {
 	.put_link	= fuse_put_link,
 	.readlink	= generic_readlink,
 	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
 };
 
 void fuse_init_common(struct inode *inode)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c8e6c87496e..86183c56210 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -245,6 +245,18 @@ struct fuse_conn {
 	/** Is flush not implemented by fs? */
 	unsigned no_flush : 1;
 
+	/** Is setxattr not implemented by fs? */
+	unsigned no_setxattr : 1;
+
+	/** Is getxattr not implemented by fs? */
+	unsigned no_getxattr : 1;
+
+	/** Is listxattr not implemented by fs? */
+	unsigned no_listxattr : 1;
+
+	/** Is removexattr not implemented by fs? */
+	unsigned no_removexattr : 1;
+
 	/** Backing dev info */
 	struct backing_dev_info bdi;
 };
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 61f34636ffb..bf564edf990 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -80,6 +80,10 @@ enum fuse_opcode {
 	FUSE_STATFS	   = 17,
 	FUSE_RELEASE       = 18,
 	FUSE_FSYNC         = 20,
+	FUSE_SETXATTR      = 21,
+	FUSE_GETXATTR      = 22,
+	FUSE_LISTXATTR     = 23,
+	FUSE_REMOVEXATTR   = 24,
 	FUSE_FLUSH         = 25,
 	FUSE_INIT          = 26
 };
@@ -89,6 +93,7 @@ enum fuse_opcode {
 
 #define FUSE_NAME_MAX 1024
 #define FUSE_SYMLINK_MAX 4096
+#define FUSE_XATTR_SIZE_MAX 4096
 
 struct fuse_entry_out {
 	__u64	nodeid;		/* Inode ID */
@@ -183,6 +188,19 @@ struct fuse_fsync_in {
 	__u32	fsync_flags;
 };
 
+struct fuse_setxattr_in {
+	__u32	size;
+	__u32	flags;
+};
+
+struct fuse_getxattr_in {
+	__u32	size;
+};
+
+struct fuse_getxattr_out {
+	__u32	size;
+};
+
 struct fuse_init_in_out {
 	__u32	major;
 	__u32	minor;
-- 
cgit v1.2.3-70-g09d2


From db50b96c0f28a21c5a4a19ecaba12d0972aab06a Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:33 -0700
Subject: [PATCH] FUSE - readpages operation

This patch adds readpages support to FUSE.

With the help of the readpages() operation multiple reads are bundled
together and sent as a single request to userspace.  This can improve
reading performace.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/file.c   | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h |  3 +++
 fs/fuse/inode.c  | 15 +++++++++++++
 3 files changed, 85 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 96ea302db18..86ffb6db5fe 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -227,6 +227,72 @@ static int fuse_readpage(struct file *file, struct page *page)
 	return err;
 }
 
+static int fuse_send_readpages(struct fuse_req *req, struct file *file,
+			       struct inode *inode)
+{
+	loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT;
+	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+	unsigned i;
+	req->out.page_zeroing = 1;
+	fuse_send_read(req, file, inode, pos, count);
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		if (!req->out.h.error)
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return req->out.h.error;
+}
+
+struct fuse_readpages_data {
+	struct fuse_req *req;
+	struct file *file;
+	struct inode *inode;
+};
+
+static int fuse_readpages_fill(void *_data, struct page *page)
+{
+	struct fuse_readpages_data *data = _data;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (req->num_pages &&
+	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
+	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+		int err = fuse_send_readpages(req, data->file, inode);
+		if (err) {
+			unlock_page(page);
+			return err;
+		}
+		fuse_reset_request(req);
+	}
+	req->pages[req->num_pages] = page;
+	req->num_pages ++;
+	return 0;
+}
+
+static int fuse_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_readpages_data data;
+	int err;
+	data.file = file;
+	data.inode = inode;
+	data.req = fuse_get_request_nonint(fc);
+	if (!data.req)
+		return -EINTR;
+
+	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
+	if (!err && data.req->num_pages)
+		err = fuse_send_readpages(data.req, file, inode);
+	fuse_put_request(fc, data.req);
+	return err;
+}
+
 static ssize_t fuse_send_write(struct fuse_req *req, struct file *file,
 			       struct inode *inode, loff_t pos, size_t count)
 {
@@ -331,6 +397,7 @@ static struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.prepare_write	= fuse_prepare_write,
 	.commit_write	= fuse_commit_write,
+	.readpages	= fuse_readpages,
 	.set_page_dirty	= fuse_set_page_dirty,
 };
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 86183c56210..aff3a01ea02 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -201,6 +201,9 @@ struct fuse_conn {
 	/** The fuse mount flags for this mount */
 	unsigned flags;
 
+	/** Maximum read size */
+	unsigned max_read;
+
 	/** Readers of the connection are waiting on this */
 	wait_queue_head_t waitq;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 458c62ca0fe..0b75c73386e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -32,6 +32,7 @@ struct fuse_mount_data {
 	unsigned rootmode;
 	unsigned user_id;
 	unsigned flags;
+	unsigned max_read;
 };
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
@@ -250,6 +251,7 @@ enum {
 	OPT_DEFAULT_PERMISSIONS,
 	OPT_ALLOW_OTHER,
 	OPT_KERNEL_CACHE,
+	OPT_MAX_READ,
 	OPT_ERR
 };
 
@@ -260,6 +262,7 @@ static match_table_t tokens = {
 	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
 	{OPT_ALLOW_OTHER,		"allow_other"},
 	{OPT_KERNEL_CACHE,		"kernel_cache"},
+	{OPT_MAX_READ,			"max_read=%u"},
 	{OPT_ERR,			NULL}
 };
 
@@ -268,6 +271,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 	char *p;
 	memset(d, 0, sizeof(struct fuse_mount_data));
 	d->fd = -1;
+	d->max_read = ~0;
 
 	while ((p = strsep(&opt, ",")) != NULL) {
 		int token;
@@ -308,6 +312,12 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 			d->flags |= FUSE_KERNEL_CACHE;
 			break;
 
+		case OPT_MAX_READ:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->max_read = value;
+			break;
+
 		default:
 			return 0;
 		}
@@ -329,6 +339,8 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_puts(m, ",allow_other");
 	if (fc->flags & FUSE_KERNEL_CACHE)
 		seq_puts(m, ",kernel_cache");
+	if (fc->max_read != ~0)
+		seq_printf(m, ",max_read=%u", fc->max_read);
 	return 0;
 }
 
@@ -453,6 +465,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
+	fc->max_read = d.max_read;
+	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
+		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 
 	err = -ENOMEM;
 	root = get_root_inode(sb, d.rootmode);
-- 
cgit v1.2.3-70-g09d2


From 87729a5514e855ce2c71e3e33833a106b8caf2ae Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:34 -0700
Subject: [PATCH] FUSE: tighten check for processes allowed access

This patch tightens the check for allowing processes to access non-privileged
mounts.  The rational is that the filesystem implementation can control the
behavior or get otherwise unavailable information of the filesystem user.  If
the filesystem user process has the same uid, gid, and is not suid or sgid
application, then access is safe.  Otherwise access is not allowed unless the
"allow_other" mount option is given (for which policy is controlled by the
userspace mount utility).

Thanks to everyone linux-fsdevel, especially Martin Mares who helped uncover
problems with the previous approach.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c    | 40 ++++++++++++++++++++++++++++++++++------
 fs/fuse/fuse_i.h |  3 +++
 fs/fuse/inode.c  | 12 ++++++++++++
 3 files changed, 49 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f127625543b..65da6e1b6de 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -411,17 +411,45 @@ int fuse_do_getattr(struct inode *inode)
 	return err;
 }
 
+/*
+ * Calling into a user-controlled filesystem gives the filesystem
+ * daemon ptrace-like capabilities over the requester process.  This
+ * means, that the filesystem daemon is able to record the exact
+ * filesystem operations performed, and can also control the behavior
+ * of the requester process in otherwise impossible ways.  For example
+ * it can delay the operation for arbitrary length of time allowing
+ * DoS against the requester.
+ *
+ * For this reason only those processes can call into the filesystem,
+ * for which the owner of the mount has ptrace privilege.  This
+ * excludes processes started by other users, suid or sgid processes.
+ */
+static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
+{
+	if (fc->flags & FUSE_ALLOW_OTHER)
+		return 1;
+
+	if (task->euid == fc->user_id &&
+	    task->suid == fc->user_id &&
+	    task->uid == fc->user_id &&
+	    task->egid == fc->group_id &&
+	    task->sgid == fc->group_id &&
+	    task->gid == fc->group_id)
+		return 1;
+
+	return 0;
+}
+
 static int fuse_revalidate(struct dentry *entry)
 {
 	struct inode *inode = entry->d_inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (get_node_id(inode) == FUSE_ROOT_ID) {
-		if (!(fc->flags & FUSE_ALLOW_OTHER) &&
-		    current->fsuid != fc->user_id)
-			return -EACCES;
-	} else if (time_before_eq(jiffies, fi->i_time))
+	if (!fuse_allow_task(fc, current))
+		return -EACCES;
+	if (get_node_id(inode) != FUSE_ROOT_ID &&
+	    time_before_eq(jiffies, fi->i_time))
 		return 0;
 
 	return fuse_do_getattr(inode);
@@ -431,7 +459,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (!(fc->flags & FUSE_ALLOW_OTHER) && current->fsuid != fc->user_id)
+	if (!fuse_allow_task(fc, current))
 		return -EACCES;
 	else if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
 		int err = generic_permission(inode, mask, NULL);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index aff3a01ea02..3ec2aff3fdb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -198,6 +198,9 @@ struct fuse_conn {
 	/** The user id for this mount */
 	uid_t user_id;
 
+	/** The group id for this mount */
+	gid_t group_id;
+
 	/** The fuse mount flags for this mount */
 	unsigned flags;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0b75c73386e..c8e54c0658f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -31,6 +31,7 @@ struct fuse_mount_data {
 	int fd;
 	unsigned rootmode;
 	unsigned user_id;
+	unsigned group_id;
 	unsigned flags;
 	unsigned max_read;
 };
@@ -199,6 +200,7 @@ static void fuse_put_super(struct super_block *sb)
 	spin_lock(&fuse_lock);
 	fc->mounted = 0;
 	fc->user_id = 0;
+	fc->group_id = 0;
 	fc->flags = 0;
 	/* Flush all readers on this fs */
 	wake_up_all(&fc->waitq);
@@ -248,6 +250,7 @@ enum {
 	OPT_FD,
 	OPT_ROOTMODE,
 	OPT_USER_ID,
+	OPT_GROUP_ID,
 	OPT_DEFAULT_PERMISSIONS,
 	OPT_ALLOW_OTHER,
 	OPT_KERNEL_CACHE,
@@ -259,6 +262,7 @@ static match_table_t tokens = {
 	{OPT_FD,			"fd=%u"},
 	{OPT_ROOTMODE,			"rootmode=%o"},
 	{OPT_USER_ID,			"user_id=%u"},
+	{OPT_GROUP_ID,			"group_id=%u"},
 	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
 	{OPT_ALLOW_OTHER,		"allow_other"},
 	{OPT_KERNEL_CACHE,		"kernel_cache"},
@@ -300,6 +304,12 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 			d->user_id = value;
 			break;
 
+		case OPT_GROUP_ID:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->group_id = value;
+			break;
+
 		case OPT_DEFAULT_PERMISSIONS:
 			d->flags |= FUSE_DEFAULT_PERMISSIONS;
 			break;
@@ -333,6 +343,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 	struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb);
 
 	seq_printf(m, ",user_id=%u", fc->user_id);
+	seq_printf(m, ",group_id=%u", fc->group_id);
 	if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
 		seq_puts(m, ",default_permissions");
 	if (fc->flags & FUSE_ALLOW_OTHER)
@@ -465,6 +476,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
+	fc->group_id = d.group_id;
 	fc->max_read = d.max_read;
 	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
 		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
-- 
cgit v1.2.3-70-g09d2


From 5a53368277efa2d80dd2206dddc1f4b19ef0c32a Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:34 -0700
Subject: [PATCH] fuse: stricter mount option checking

Check for the presence of all mandatory mount options.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/inode.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c8e54c0658f..298c1d4c153 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -32,6 +32,10 @@ struct fuse_mount_data {
 	unsigned rootmode;
 	unsigned user_id;
 	unsigned group_id;
+	unsigned fd_present : 1;
+	unsigned rootmode_present : 1;
+	unsigned user_id_present : 1;
+	unsigned group_id_present : 1;
 	unsigned flags;
 	unsigned max_read;
 };
@@ -274,7 +278,6 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 {
 	char *p;
 	memset(d, 0, sizeof(struct fuse_mount_data));
-	d->fd = -1;
 	d->max_read = ~0;
 
 	while ((p = strsep(&opt, ",")) != NULL) {
@@ -290,24 +293,28 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 			if (match_int(&args[0], &value))
 				return 0;
 			d->fd = value;
+			d->fd_present = 1;
 			break;
 
 		case OPT_ROOTMODE:
 			if (match_octal(&args[0], &value))
 				return 0;
 			d->rootmode = value;
+			d->rootmode_present = 1;
 			break;
 
 		case OPT_USER_ID:
 			if (match_int(&args[0], &value))
 				return 0;
 			d->user_id = value;
+			d->user_id_present = 1;
 			break;
 
 		case OPT_GROUP_ID:
 			if (match_int(&args[0], &value))
 				return 0;
 			d->group_id = value;
+			d->group_id_present = 1;
 			break;
 
 		case OPT_DEFAULT_PERMISSIONS:
@@ -332,7 +339,9 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 			return 0;
 		}
 	}
-	if (d->fd == -1)
+
+	if (!d->fd_present || !d->rootmode_present ||
+	    !d->user_id_present || !d->group_id_present)
 		return 0;
 
 	return 1;
-- 
cgit v1.2.3-70-g09d2


From 413ef8cb302511d8e995e2b0e5517ee1a65b9c77 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:35 -0700
Subject: [PATCH] FUSE - direct I/O

This patch adds support for the "direct_io" mount option of FUSE.

When this mount option is specified, the page cache is bypassed for
read and write operations.  This is useful for example, if the
filesystem doesn't know the size of files before reading them, or when
any kind of caching is harmful.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/file.c   | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/fuse_i.h |   6 +++
 fs/fuse/inode.c  |   9 ++++
 3 files changed, 146 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 86ffb6db5fe..6bc3fb26de3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -363,6 +363,118 @@ static int fuse_commit_write(struct file *file, struct page *page,
 	return err;
 }
 
+static void fuse_release_user_pages(struct fuse_req *req, int write)
+{
+	unsigned i;
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		if (write)
+			set_page_dirty_lock(page);
+		put_page(page);
+	}
+}
+
+static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
+			       unsigned nbytes, int write)
+{
+	unsigned long user_addr = (unsigned long) buf;
+	unsigned offset = user_addr & ~PAGE_MASK;
+	int npages;
+
+	/* This doesn't work with nfsd */
+	if (!current->mm)
+		return -EPERM;
+
+	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	npages = min(npages, FUSE_MAX_PAGES_PER_REQ);
+	down_read(&current->mm->mmap_sem);
+	npages = get_user_pages(current, current->mm, user_addr, npages, write,
+				0, req->pages, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (npages < 0)
+		return npages;
+
+	req->num_pages = npages;
+	req->page_offset = offset;
+	return 0;
+}
+
+static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos, int write)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	size_t nmax = write ? fc->max_write : fc->max_read;
+	loff_t pos = *ppos;
+	ssize_t res = 0;
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -ERESTARTSYS;
+
+	while (count) {
+		size_t tmp;
+		size_t nres;
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, buf, nbytes, !write);
+		if (err) {
+			res = err;
+			break;
+		}
+		tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+		nbytes = min(nbytes, tmp);
+		if (write)
+			nres = fuse_send_write(req, file, inode, pos, nbytes);
+		else
+			nres = fuse_send_read(req, file, inode, pos, nbytes);
+		fuse_release_user_pages(req, !write);
+		if (req->out.h.error) {
+			if (!res)
+				res = req->out.h.error;
+			break;
+		} else if (nres > nbytes) {
+			res = -EIO;
+			break;
+		}
+		count -= nres;
+		res += nres;
+		pos += nres;
+		buf += nres;
+		if (nres != nbytes)
+			break;
+		if (count)
+			fuse_reset_request(req);
+	}
+	fuse_put_request(fc, req);
+	if (res > 0) {
+		if (write && pos > i_size_read(inode))
+			i_size_write(inode, pos);
+		*ppos = pos;
+	} else if (write && (res == -EINTR || res == -EIO))
+		fuse_invalidate_attr(inode);
+
+	return res;
+}
+
+static ssize_t fuse_direct_read(struct file *file, char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	return fuse_direct_io(file, buf, count, ppos, 0);
+}
+
+static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	ssize_t res;
+	/* Don't allow parallel writes to the same file */
+	down(&inode->i_sem);
+	res = fuse_direct_io(file, buf, count, ppos, 1);
+	up(&inode->i_sem);
+	return res;
+}
+
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	if ((vma->vm_flags & VM_SHARED)) {
@@ -393,6 +505,17 @@ static struct file_operations fuse_file_operations = {
 	.sendfile	= generic_file_sendfile,
 };
 
+static struct file_operations fuse_direct_io_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= fuse_direct_read,
+	.write		= fuse_direct_write,
+	.open		= fuse_open,
+	.flush		= fuse_flush,
+	.release	= fuse_release,
+	.fsync		= fuse_fsync,
+	/* no mmap and sendfile */
+};
+
 static struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.prepare_write	= fuse_prepare_write,
@@ -403,6 +526,12 @@ static struct address_space_operations fuse_file_aops  = {
 
 void fuse_init_file_inode(struct inode *inode)
 {
-	inode->i_fop = &fuse_file_operations;
-	inode->i_data.a_ops = &fuse_file_aops;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (fc->flags & FUSE_DIRECT_IO)
+		inode->i_fop = &fuse_direct_io_file_operations;
+	else {
+		inode->i_fop = &fuse_file_operations;
+		inode->i_data.a_ops = &fuse_file_aops;
+	}
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 3ec2aff3fdb..0af1ac64692 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -34,6 +34,9 @@
     be flushed on open */
 #define FUSE_KERNEL_CACHE        (1 << 2)
 
+/** Bypass the page cache for read and write operations  */
+#define FUSE_DIRECT_IO           (1 << 3)
+
 /** FUSE inode */
 struct fuse_inode {
 	/** Inode data */
@@ -207,6 +210,9 @@ struct fuse_conn {
 	/** Maximum read size */
 	unsigned max_read;
 
+	/** Maximum write size */
+	unsigned max_write;
+
 	/** Readers of the connection are waiting on this */
 	wait_queue_head_t waitq;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 298c1d4c153..652c9d5df97 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -258,6 +258,7 @@ enum {
 	OPT_DEFAULT_PERMISSIONS,
 	OPT_ALLOW_OTHER,
 	OPT_KERNEL_CACHE,
+	OPT_DIRECT_IO,
 	OPT_MAX_READ,
 	OPT_ERR
 };
@@ -270,6 +271,7 @@ static match_table_t tokens = {
 	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
 	{OPT_ALLOW_OTHER,		"allow_other"},
 	{OPT_KERNEL_CACHE,		"kernel_cache"},
+	{OPT_DIRECT_IO,			"direct_io"},
 	{OPT_MAX_READ,			"max_read=%u"},
 	{OPT_ERR,			NULL}
 };
@@ -329,6 +331,10 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 			d->flags |= FUSE_KERNEL_CACHE;
 			break;
 
+		case OPT_DIRECT_IO:
+			d->flags |= FUSE_DIRECT_IO;
+			break;
+
 		case OPT_MAX_READ:
 			if (match_int(&args[0], &value))
 				return 0;
@@ -359,6 +365,8 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_puts(m, ",allow_other");
 	if (fc->flags & FUSE_KERNEL_CACHE)
 		seq_puts(m, ",kernel_cache");
+	if (fc->flags & FUSE_DIRECT_IO)
+		seq_puts(m, ",direct_io");
 	if (fc->max_read != ~0)
 		seq_printf(m, ",max_read=%u", fc->max_read);
 	return 0;
@@ -489,6 +497,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fc->max_read = d.max_read;
 	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
 		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+	fc->max_write = FUSE_MAX_IN / 2;
 
 	err = -ENOMEM;
 	root = get_root_inode(sb, d.rootmode);
-- 
cgit v1.2.3-70-g09d2


From 04730fef1f9c7277e5c730b193e681ac095b0507 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:36 -0700
Subject: [PATCH] fuse: transfer readdir data through device

This patch removes a long lasting "hack" in FUSE, which used a separate
channel (a file descriptor refering to a disk-file) to transfer directory
contents from userspace to the kernel.

The patch adds three new operations (OPENDIR, READDIR, RELEASEDIR), which
have semantics and implementation exactly maching the respective file
operations (OPEN, READ, RELEASE).

This simplifies the directory reading code.  Also disk space is not
necessary, which can be important in embedded systems.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c        |  9 ------
 fs/fuse/dir.c        | 84 +++++++++++++++-------------------------------------
 fs/fuse/file.c       | 38 +++++++++++++++++-------
 fs/fuse/fuse_i.h     | 22 ++++++++++----
 include/linux/fuse.h | 10 +++----
 5 files changed, 73 insertions(+), 90 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ca6fc0e96d7..e4ada021d08 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -731,13 +731,6 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 	return NULL;
 }
 
-/* fget() needs to be done in this context */
-static void process_getdir(struct fuse_req *req)
-{
-	struct fuse_getdir_out_i *arg = req->out.args[0].value;
-	arg->file = fget(arg->fd);
-}
-
 static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
 			 unsigned nbytes)
 {
@@ -817,8 +810,6 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	if (!err) {
 		if (req->interrupted)
 			err = -ENOENT;
-		else if (req->in.h.opcode == FUSE_GETDIR && !oh.error)
-			process_getdir(req);
 	} else if (!req->interrupted)
 		req->out.h.error = -EIO;
 	request_end(fc, req);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 65da6e1b6de..cf5d1faed7a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -519,70 +519,40 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
 	return 0;
 }
 
-static int fuse_checkdir(struct file *cfile, struct file *file)
+static inline size_t fuse_send_readdir(struct fuse_req *req, struct file *file,
+				       struct inode *inode, loff_t pos,
+				       size_t count)
 {
-	struct inode *inode;
-	if (!cfile)
-		return -EIO;
-	inode = cfile->f_dentry->d_inode;
-	if (!S_ISREG(inode->i_mode)) {
-		fput(cfile);
-		return -EIO;
-	}
-
-	file->private_data = cfile;
-	return 0;
+	return fuse_send_read_common(req, file, inode, pos, count, 1);
 }
 
-static int fuse_getdir(struct file *file)
+static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 {
+	int err;
+	size_t nbytes;
+	struct page *page;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
-	struct fuse_getdir_out_i outarg;
-	int err;
-
+	struct fuse_req *req = fuse_get_request_nonint(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
-	req->in.h.opcode = FUSE_GETDIR;
-	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(struct fuse_getdir_out);
-	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		fuse_put_request(fc, req);
+		return -ENOMEM;
+	}
+	req->num_pages = 1;
+	req->pages[0] = page;
+	nbytes = fuse_send_readdir(req, file, inode, file->f_pos, PAGE_SIZE);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err)
-		err = fuse_checkdir(outarg.file, file);
-	return err;
-}
-
-static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
-{
-	struct file *cfile = file->private_data;
-	char *buf;
-	int ret;
-
-	if (!cfile) {
-		ret = fuse_getdir(file);
-		if (ret)
-			return ret;
-
-		cfile = file->private_data;
-	}
+		err = parse_dirfile(page_address(page), nbytes, file, dstbuf,
+				    filldir);
 
-	buf = (char *) __get_free_page(GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	ret = kernel_read(cfile, file->f_pos, buf, PAGE_SIZE);
-	if (ret > 0)
-		ret = parse_dirfile(buf, ret, file, dstbuf, filldir);
-
-	free_page((unsigned long) buf);
-	return ret;
+	__free_page(page);
+	return err;
 }
 
 static char *read_link(struct dentry *dentry)
@@ -637,18 +607,12 @@ static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
 
 static int fuse_dir_open(struct inode *inode, struct file *file)
 {
-	file->private_data = NULL;
-	return 0;
+	return fuse_open_common(inode, file, 1);
 }
 
 static int fuse_dir_release(struct inode *inode, struct file *file)
 {
-	struct file *cfile = file->private_data;
-
-	if (cfile)
-		fput(cfile);
-
-	return 0;
+	return fuse_release_common(inode, file, 1);
 }
 
 static unsigned iattr_to_fattr(struct iattr *iattr, struct fuse_attr *fattr)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6bc3fb26de3..224453557cf 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,7 +12,7 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 
-static int fuse_open(struct inode *inode, struct file *file)
+int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
@@ -56,7 +56,7 @@ static int fuse_open(struct inode *inode, struct file *file)
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
-	req->in.h.opcode = FUSE_OPEN;
+	req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
 	req->in.h.nodeid = get_node_id(inode);
 	req->inode = inode;
 	req->in.numargs = 1;
@@ -85,7 +85,7 @@ static int fuse_open(struct inode *inode, struct file *file)
 	return err;
 }
 
-static int fuse_release(struct inode *inode, struct file *file)
+int fuse_release_common(struct inode *inode, struct file *file, int isdir)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
@@ -94,7 +94,7 @@ static int fuse_release(struct inode *inode, struct file *file)
 
 	inarg->fh = ff->fh;
 	inarg->flags = file->f_flags & ~O_EXCL;
-	req->in.h.opcode = FUSE_RELEASE;
+	req->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 	req->in.h.nodeid = get_node_id(inode);
 	req->inode = inode;
 	req->in.numargs = 1;
@@ -107,6 +107,16 @@ static int fuse_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int fuse_open(struct inode *inode, struct file *file)
+{
+	return fuse_open_common(inode, file, 0);
+}
+
+static int fuse_release(struct inode *inode, struct file *file)
+{
+	return fuse_release_common(inode, file, 0);
+}
+
 static int fuse_flush(struct file *file)
 {
 	struct inode *inode = file->f_dentry->d_inode;
@@ -178,8 +188,9 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
 	return err;
 }
 
-static ssize_t fuse_send_read(struct fuse_req *req, struct file *file,
-			      struct inode *inode, loff_t pos,  size_t count)
+size_t fuse_send_read_common(struct fuse_req *req, struct file *file,
+			     struct inode *inode, loff_t pos, size_t count,
+			     int isdir)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
@@ -189,7 +200,7 @@ static ssize_t fuse_send_read(struct fuse_req *req, struct file *file,
 	inarg.fh = ff->fh;
 	inarg.offset = pos;
 	inarg.size = count;
-	req->in.h.opcode = FUSE_READ;
+	req->in.h.opcode = isdir ? FUSE_READDIR : FUSE_READ;
 	req->in.h.nodeid = get_node_id(inode);
 	req->inode = inode;
 	req->file = file;
@@ -204,6 +215,13 @@ static ssize_t fuse_send_read(struct fuse_req *req, struct file *file,
 	return req->out.args[0].size;
 }
 
+static inline size_t fuse_send_read(struct fuse_req *req, struct file *file,
+				    struct inode *inode, loff_t pos,
+				    size_t count)
+{
+	return fuse_send_read_common(req, file, inode, pos, count, 0);
+}
+
 static int fuse_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
@@ -293,8 +311,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	return err;
 }
 
-static ssize_t fuse_send_write(struct fuse_req *req, struct file *file,
-			       struct inode *inode, loff_t pos, size_t count)
+static size_t fuse_send_write(struct fuse_req *req, struct file *file,
+			      struct inode *inode, loff_t pos, size_t count)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
@@ -332,7 +350,7 @@ static int fuse_commit_write(struct file *file, struct page *page,
 			     unsigned offset, unsigned to)
 {
 	int err;
-	ssize_t nres;
+	size_t nres;
 	unsigned count = to - offset;
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0af1ac64692..8593d5bae7a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -273,11 +273,6 @@ struct fuse_conn {
 	struct backing_dev_info bdi;
 };
 
-struct fuse_getdir_out_i {
-	int fd;
-	void *file; /* Used by kernel only */
-};
-
 static inline struct fuse_conn **get_fuse_conn_super_p(struct super_block *sb)
 {
 	return (struct fuse_conn **) &sb->s_fs_info;
@@ -333,6 +328,23 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 		      unsigned long nodeid, u64 nlookup);
 
+/**
+ * Send READ or READDIR request
+ */
+size_t fuse_send_read_common(struct fuse_req *req, struct file *file,
+			     struct inode *inode, loff_t pos, size_t count,
+			     int isdir);
+
+/**
+ * Send OPEN or OPENDIR request
+ */
+int fuse_open_common(struct inode *inode, struct file *file, int isdir);
+
+/**
+ * Send RELEASE or RELEASEDIR request
+ */
+int fuse_release_common(struct inode *inode, struct file *file, int isdir);
+
 /**
  * Initialise file operations on a regular file
  */
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index e9b814e16c5..cdfaa51b901 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -70,7 +70,6 @@ enum fuse_opcode {
 	FUSE_SETATTR	   = 4,
 	FUSE_READLINK	   = 5,
 	FUSE_SYMLINK	   = 6,
-	FUSE_GETDIR	   = 7,
 	FUSE_MKNOD	   = 8,
 	FUSE_MKDIR	   = 9,
 	FUSE_UNLINK	   = 10,
@@ -88,7 +87,10 @@ enum fuse_opcode {
 	FUSE_LISTXATTR     = 23,
 	FUSE_REMOVEXATTR   = 24,
 	FUSE_FLUSH         = 25,
-	FUSE_INIT          = 26
+	FUSE_INIT          = 26,
+	FUSE_OPENDIR       = 27,
+	FUSE_READDIR       = 28,
+	FUSE_RELEASEDIR    = 29
 };
 
 /* Conservative buffer size for the client */
@@ -120,10 +122,6 @@ struct fuse_attr_out {
 	struct fuse_attr attr;
 };
 
-struct fuse_getdir_out {
-	__u32	fd;
-};
-
 struct fuse_mknod_in {
 	__u32	mode;
 	__u32	rdev;
-- 
cgit v1.2.3-70-g09d2


From 45323fb76465a9576220c7427dbac7b1e7ad3caf Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:37 -0700
Subject: [PATCH] fuse: more flexible caching

Make data caching behavior selectable on a per-open basis instead of
per-mount.  Compatibility for the old mount options 'kernel_cache' and
'direct_io' is retained in the userspace library (version 2.4.0-pre1 or
later).

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/fuse.txt | 26 --------------------------
 fs/fuse/file.c                     | 18 ++++++++----------
 fs/fuse/fuse_i.h                   |  6 ------
 fs/fuse/inode.c                    | 16 ----------------
 include/linux/fuse.h               | 11 ++++++++++-
 5 files changed, 18 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
index 83f96cf5696..6b5741e651a 100644
--- a/Documentation/filesystems/fuse.txt
+++ b/Documentation/filesystems/fuse.txt
@@ -80,32 +80,6 @@ Mount options
   allowed to root, but this restriction can be removed with a
   (userspace) configuration option.
 
-'kernel_cache'
-
-  This option disables flushing the cache of the file contents on
-  every open().  This should only be enabled on filesystems, where the
-  file data is never changed externally (not through the mounted FUSE
-  filesystem).  Thus it is not suitable for network filesystems and
-  other "intermediate" filesystems.
-
-  NOTE: if this option is not specified (and neither 'direct_io') data
-  is still cached after the open(), so a read() system call will not
-  always initiate a read operation.
-
-'direct_io'
-
-  This option disables the use of page cache (file content cache) in
-  the kernel for this filesystem.  This has several affects:
-
-     - Each read() or write() system call will initiate one or more
-       read or write operations, data will not be cached in the
-       kernel.
-
-     - The return value of the read() and write() system calls will
-       correspond to the return values of the read and write
-       operations.  This is useful for example if the file size is not
-       known in advance (before reading it).
-
 'max_read=N'
 
   With this option the maximum size of read operations can be set.
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 224453557cf..a8dc88527fb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,6 +12,8 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 
+static struct file_operations fuse_direct_io_file_operations;
+
 int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -70,12 +72,14 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 	else
 		request_send(fc, req);
 	err = req->out.h.error;
-	if (!err && !(fc->flags & FUSE_KERNEL_CACHE))
-		invalidate_inode_pages(inode->i_mapping);
 	if (err) {
 		fuse_request_free(ff->release_req);
 		kfree(ff);
 	} else {
+		if (!isdir && (outarg.open_flags & FOPEN_DIRECT_IO))
+			file->f_op = &fuse_direct_io_file_operations;
+		if (!(outarg.open_flags & FOPEN_KEEP_CACHE))
+			invalidate_inode_pages(inode->i_mapping);
 		ff->fh = outarg.fh;
 		file->private_data = ff;
 	}
@@ -544,12 +548,6 @@ static struct address_space_operations fuse_file_aops  = {
 
 void fuse_init_file_inode(struct inode *inode)
 {
-	struct fuse_conn *fc = get_fuse_conn(inode);
-
-	if (fc->flags & FUSE_DIRECT_IO)
-		inode->i_fop = &fuse_direct_io_file_operations;
-	else {
-		inode->i_fop = &fuse_file_operations;
-		inode->i_data.a_ops = &fuse_file_aops;
-	}
+	inode->i_fop = &fuse_file_operations;
+	inode->i_data.a_ops = &fuse_file_aops;
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8593d5bae7a..84849601363 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -30,12 +30,6 @@
     doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
 
-/** If the FUSE_KERNEL_CACHE flag is given, then cached data will not
-    be flushed on open */
-#define FUSE_KERNEL_CACHE        (1 << 2)
-
-/** Bypass the page cache for read and write operations  */
-#define FUSE_DIRECT_IO           (1 << 3)
 
 /** FUSE inode */
 struct fuse_inode {
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 652c9d5df97..8dc66760b41 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -257,8 +257,6 @@ enum {
 	OPT_GROUP_ID,
 	OPT_DEFAULT_PERMISSIONS,
 	OPT_ALLOW_OTHER,
-	OPT_KERNEL_CACHE,
-	OPT_DIRECT_IO,
 	OPT_MAX_READ,
 	OPT_ERR
 };
@@ -270,8 +268,6 @@ static match_table_t tokens = {
 	{OPT_GROUP_ID,			"group_id=%u"},
 	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
 	{OPT_ALLOW_OTHER,		"allow_other"},
-	{OPT_KERNEL_CACHE,		"kernel_cache"},
-	{OPT_DIRECT_IO,			"direct_io"},
 	{OPT_MAX_READ,			"max_read=%u"},
 	{OPT_ERR,			NULL}
 };
@@ -327,14 +323,6 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
 			d->flags |= FUSE_ALLOW_OTHER;
 			break;
 
-		case OPT_KERNEL_CACHE:
-			d->flags |= FUSE_KERNEL_CACHE;
-			break;
-
-		case OPT_DIRECT_IO:
-			d->flags |= FUSE_DIRECT_IO;
-			break;
-
 		case OPT_MAX_READ:
 			if (match_int(&args[0], &value))
 				return 0;
@@ -363,10 +351,6 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_puts(m, ",default_permissions");
 	if (fc->flags & FUSE_ALLOW_OTHER)
 		seq_puts(m, ",allow_other");
-	if (fc->flags & FUSE_KERNEL_CACHE)
-		seq_puts(m, ",kernel_cache");
-	if (fc->flags & FUSE_DIRECT_IO)
-		seq_puts(m, ",direct_io");
 	if (fc->max_read != ~0)
 		seq_printf(m, ",max_read=%u", fc->max_read);
 	return 0;
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index cdfaa51b901..c65124a213a 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -14,7 +14,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 1
+#define FUSE_KERNEL_MINOR_VERSION 2
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -63,6 +63,15 @@ struct fuse_kstatfs {
 #define FATTR_MTIME	(1 << 5)
 #define FATTR_CTIME	(1 << 6)
 
+/**
+ * Flags returned by the OPEN request
+ *
+ * FOPEN_DIRECT_IO: bypass page cache for this open file
+ * FOPEN_KEEP_CACHE: don't invalidate the data cache on open
+ */
+#define FOPEN_DIRECT_IO		(1 << 0)
+#define FOPEN_KEEP_CACHE	(1 << 1)
+
 enum fuse_opcode {
 	FUSE_LOOKUP	   = 1,
 	FUSE_FORGET	   = 2,  /* no reply */
-- 
cgit v1.2.3-70-g09d2


From b36c31ba95f0fe0a03c727300d9c4c54438a5636 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:38 -0700
Subject: [PATCH] fuse: don't update file times

Don't change mtime/ctime/atime to local time on read/write.  Rather invalidate
file attributes, so next stat() will force a GETATTR call.  Bug reported by
Ben Grimm.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c   |  2 ++
 fs/fuse/file.c  | 10 ++++++----
 fs/fuse/inode.c |  1 +
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index cf5d1faed7a..9b43fd46aaa 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -552,6 +552,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 				    filldir);
 
 	__free_page(page);
+	fuse_invalidate_attr(inode); /* atime changed */
 	return err;
 }
 
@@ -585,6 +586,7 @@ static char *read_link(struct dentry *dentry)
 		link[req->out.args[0].size] = '\0';
  out:
 	fuse_put_request(fc, req);
+	fuse_invalidate_attr(inode); /* atime changed */
 	return link;
 }
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a8dc88527fb..6dcae74ce7f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -244,6 +244,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	fuse_put_request(fc, req);
 	if (!err)
 		SetPageUptodate(page);
+	fuse_invalidate_attr(inode); /* atime changed */
  out:
 	unlock_page(page);
 	return err;
@@ -312,6 +313,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	if (!err && data.req->num_pages)
 		err = fuse_send_readpages(data.req, file, inode);
 	fuse_put_request(fc, data.req);
+	fuse_invalidate_attr(inode); /* atime changed */
 	return err;
 }
 
@@ -380,8 +382,8 @@ static int fuse_commit_write(struct file *file, struct page *page,
 			clear_page_dirty(page);
 			SetPageUptodate(page);
 		}
-	} else if (err == -EINTR || err == -EIO)
-		fuse_invalidate_attr(inode);
+	}
+	fuse_invalidate_attr(inode);
 	return err;
 }
 
@@ -473,8 +475,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 		if (write && pos > i_size_read(inode))
 			i_size_write(inode, pos);
 		*ppos = pos;
-	} else if (write && (res == -EINTR || res == -EIO))
-		fuse_invalidate_attr(inode);
+	}
+	fuse_invalidate_attr(inode);
 
 	return res;
 }
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 8dc66760b41..52e954f4bb9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -173,6 +173,7 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 		return NULL;
 
 	if ((inode->i_state & I_NEW)) {
+		inode->i_flags |= S_NOATIME|S_NOCMTIME;
 		inode->i_generation = generation;
 		inode->i_data.backing_dev_info = &fc->bdi;
 		fuse_init_inode(inode, attr);
-- 
cgit v1.2.3-70-g09d2


From 8254798199332966e2ab647380c990193af7e854 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:38 -0700
Subject: [PATCH] FUSE: add fsync operation for directories

This patch adds a new FSYNCDIR request, which is sent when fsync is called
on directories.  This operation is available in libfuse 2.3-pre1 or
greater.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c        |  7 +++++++
 fs/fuse/file.c       | 17 +++++++++++++----
 fs/fuse/fuse_i.h     |  9 +++++++++
 include/linux/fuse.h |  3 ++-
 4 files changed, 31 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9b43fd46aaa..73792d65b6c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -617,6 +617,12 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
 	return fuse_release_common(inode, file, 1);
 }
 
+static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
+{
+	/* nfsd can call this with no file */
+	return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
+}
+
 static unsigned iattr_to_fattr(struct iattr *iattr, struct fuse_attr *fattr)
 {
 	unsigned ivalid = iattr->ia_valid;
@@ -934,6 +940,7 @@ static struct file_operations fuse_dir_operations = {
 	.readdir	= fuse_readdir,
 	.open		= fuse_dir_open,
 	.release	= fuse_dir_release,
+	.fsync		= fuse_dir_fsync,
 };
 
 static struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6dcae74ce7f..e225f8c0b26 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -156,7 +156,8 @@ static int fuse_flush(struct file *file)
 	return err;
 }
 
-static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
+		      int isdir)
 {
 	struct inode *inode = de->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -165,7 +166,7 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
 	struct fuse_fsync_in inarg;
 	int err;
 
-	if (fc->no_fsync)
+	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
 		return 0;
 
 	req = fuse_get_request(fc);
@@ -175,7 +176,7 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.fsync_flags = datasync ? 1 : 0;
-	req->in.h.opcode = FUSE_FSYNC;
+	req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
 	req->in.h.nodeid = get_node_id(inode);
 	req->inode = inode;
 	req->file = file;
@@ -186,12 +187,20 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
-		fc->no_fsync = 1;
+		if (isdir)
+			fc->no_fsyncdir = 1;
+		else
+			fc->no_fsync = 1;
 		err = 0;
 	}
 	return err;
 }
 
+static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+{
+	return fuse_fsync_common(file, de, datasync, 0);
+}
+
 size_t fuse_send_read_common(struct fuse_req *req, struct file *file,
 			     struct inode *inode, loff_t pos, size_t count,
 			     int isdir)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 84849601363..d7647289d8a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -248,6 +248,9 @@ struct fuse_conn {
 	/** Is fsync not implemented by fs? */
 	unsigned no_fsync : 1;
 
+	/** Is fsyncdir not implemented by fs? */
+	unsigned no_fsyncdir : 1;
+
 	/** Is flush not implemented by fs? */
 	unsigned no_flush : 1;
 
@@ -339,6 +342,12 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir);
  */
 int fuse_release_common(struct inode *inode, struct file *file, int isdir);
 
+/**
+ * Send FSYNC or FSYNCDIR request
+ */
+int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
+		      int isdir);
+
 /**
  * Initialise file operations on a regular file
  */
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index c65124a213a..acbeb96a335 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -99,7 +99,8 @@ enum fuse_opcode {
 	FUSE_INIT          = 26,
 	FUSE_OPENDIR       = 27,
 	FUSE_READDIR       = 28,
-	FUSE_RELEASEDIR    = 29
+	FUSE_RELEASEDIR    = 29,
+	FUSE_FSYNCDIR      = 30
 };
 
 /* Conservative buffer size for the client */
-- 
cgit v1.2.3-70-g09d2


From 7c352bdf048811b8128019ffc1e886161e09c11c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 9 Sep 2005 13:10:39 -0700
Subject: [PATCH] FUSE: don't allow restarting of system calls

This patch removes ability to interrupt and restart operations while there
hasn't been any side-effect.

The reason: applications.  There are some apps it seems that generate
signals at a fast rate.  This means, that if the operation cannot make
enough progress between two signals, it will be restarted for ever.  This
bug actually manifested itself with 'krusader' trying to open a file for
writing under sshfs.  Thanks to Eduard Czimbalmos for the report.

The problem can be solved just by making open() uninterruptible, because in
this case it was the truncate operation that slowed down the progress.  But
it's better to solve this by simply not allowing interrupts at all (except
SIGKILL), because applications don't expect file operations to be
interruptible anyway.  As an added bonus the code is simplified somewhat.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c    | 73 ++++++++++----------------------------------------------
 fs/fuse/dir.c    | 36 ++++++++++++++--------------
 fs/fuse/file.c   | 33 ++++++++++---------------
 fs/fuse/fuse_i.h | 12 +---------
 fs/fuse/inode.c  |  2 +-
 5 files changed, 45 insertions(+), 111 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e4ada021d08..d4c869c6d01 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -103,19 +103,8 @@ static struct fuse_req *do_get_request(struct fuse_conn *fc)
 	return req;
 }
 
+/* This can return NULL, but only in case it's interrupted by a SIGKILL */
 struct fuse_req *fuse_get_request(struct fuse_conn *fc)
-{
-	if (down_interruptible(&fc->outstanding_sem))
-		return NULL;
-	return do_get_request(fc);
-}
-
-/*
- * Non-interruptible version of the above function is for operations
- * which can't legally return -ERESTART{SYS,NOINTR}.  This can still
- * return NULL, but only in case the signal is SIGKILL.
- */
-struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc)
 {
 	int intr;
 	sigset_t oldset;
@@ -241,43 +230,20 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req)
 		get_file(req->file);
 }
 
-static int request_wait_answer_nonint(struct fuse_req *req)
-{
-	int err;
-	sigset_t oldset;
-	block_sigs(&oldset);
-	err = wait_event_interruptible(req->waitq, req->finished);
-	restore_sigs(&oldset);
-	return err;
-}
-
 /* Called with fuse_lock held.  Releases, and then reacquires it. */
-static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req,
-				int interruptible)
+static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 {
-	int intr;
+	sigset_t oldset;
 
 	spin_unlock(&fuse_lock);
-	if (interruptible)
-		intr = wait_event_interruptible(req->waitq, req->finished);
-	else
-		intr = request_wait_answer_nonint(req);
+	block_sigs(&oldset);
+	wait_event_interruptible(req->waitq, req->finished);
+	restore_sigs(&oldset);
 	spin_lock(&fuse_lock);
-	if (intr && interruptible && req->sent) {
-		/* If request is already in userspace, only allow KILL
-		   signal to interrupt */
-		spin_unlock(&fuse_lock);
-		intr = request_wait_answer_nonint(req);
-		spin_lock(&fuse_lock);
-	}
-	if (!intr)
+	if (req->finished)
 		return;
 
-	if (!interruptible || req->sent)
-		req->out.h.error = -EINTR;
-	else
-		req->out.h.error = -ERESTARTNOINTR;
-
+	req->out.h.error = -EINTR;
 	req->interrupted = 1;
 	if (req->locked) {
 		/* This is uninterruptible sleep, because data is
@@ -330,8 +296,10 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 	wake_up(&fc->waitq);
 }
 
-static void request_send_wait(struct fuse_conn *fc, struct fuse_req *req,
-			      int interruptible)
+/*
+ * This can only be interrupted by a SIGKILL
+ */
+void request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
 	spin_lock(&fuse_lock);
@@ -345,26 +313,11 @@ static void request_send_wait(struct fuse_conn *fc, struct fuse_req *req,
 		   after request_end() */
 		__fuse_get_request(req);
 
-		request_wait_answer(fc, req, interruptible);
+		request_wait_answer(fc, req);
 	}
 	spin_unlock(&fuse_lock);
 }
 
-void request_send(struct fuse_conn *fc, struct fuse_req *req)
-{
-	request_send_wait(fc, req, 1);
-}
-
-/*
- * Non-interruptible version of the above function is for operations
- * which can't legally return -ERESTART{SYS,NOINTR}.  This can still
- * be interrupted but only with SIGKILL.
- */
-void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req)
-{
-	request_send_wait(fc, req, 0);
-}
-
 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
 	spin_lock(&fuse_lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 73792d65b6c..e79e49b3eec 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -46,12 +46,12 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		struct inode *inode = entry->d_inode;
 		struct fuse_inode *fi = get_fuse_inode(inode);
 		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_req *req = fuse_get_request_nonint(fc);
+		struct fuse_req *req = fuse_get_request(fc);
 		if (!req)
 			return 0;
 
 		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
-		request_send_nonint(fc, req);
+		request_send(fc, req);
 		err = req->out.h.error;
 		if (!err) {
 			if (outarg.nodeid != get_node_id(inode)) {
@@ -91,7 +91,7 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
@@ -185,7 +185,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
@@ -211,7 +211,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
@@ -236,7 +236,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	req->in.h.opcode = FUSE_SYMLINK;
 	req->in.numargs = 2;
@@ -253,7 +253,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	req->in.h.opcode = FUSE_UNLINK;
 	req->in.h.nodeid = get_node_id(dir);
@@ -284,7 +284,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	req->in.h.opcode = FUSE_RMDIR;
 	req->in.h.nodeid = get_node_id(dir);
@@ -311,7 +311,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	struct fuse_conn *fc = get_fuse_conn(olddir);
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.newdir = get_node_id(newdir);
@@ -356,7 +356,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.oldnodeid = get_node_id(inode);
@@ -386,7 +386,7 @@ int fuse_do_getattr(struct inode *inode)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	req->in.h.opcode = FUSE_GETATTR;
 	req->in.h.nodeid = get_node_id(inode);
@@ -533,7 +533,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	struct page *page;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request_nonint(fc);
+	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -564,7 +564,7 @@ static char *read_link(struct dentry *dentry)
 	char *link;
 
 	if (!req)
-		return ERR_PTR(-ERESTARTNOINTR);
+		return ERR_PTR(-EINTR);
 
 	link = (char *) __get_free_page(GFP_KERNEL);
 	if (!link) {
@@ -677,7 +677,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.valid = iattr_to_fattr(attr, &inarg.attr);
@@ -761,7 +761,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -801,7 +801,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -851,7 +851,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -897,7 +897,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTNOINTR;
+		return -EINTR;
 
 	req->in.h.opcode = FUSE_REMOVEXATTR;
 	req->in.h.nodeid = get_node_id(inode);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e225f8c0b26..6454022b053 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -22,9 +22,6 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 	struct fuse_open_out outarg;
 	struct fuse_file *ff;
 	int err;
-	/* Restarting the syscall is not allowed if O_CREAT and O_EXCL
-	   are both set, because creation will fail on the restart */
-	int excl = (file->f_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL);
 
 	err = generic_file_open(inode, file);
 	if (err)
@@ -38,12 +35,9 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 		 	return err;
 	}
 
-	if (excl)
-		req = fuse_get_request_nonint(fc);
-	else
-		req = fuse_get_request(fc);
+	req = fuse_get_request(fc);
 	if (!req)
-		return excl ? -EINTR : -ERESTARTSYS;
+		return -EINTR;
 
 	err = -ENOMEM;
 	ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -67,10 +61,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	if (excl)
-		request_send_nonint(fc, req);
-	else
-		request_send(fc, req);
+	request_send(fc, req);
 	err = req->out.h.error;
 	if (err) {
 		fuse_request_free(ff->release_req);
@@ -133,7 +124,7 @@ static int fuse_flush(struct file *file)
 	if (fc->no_flush)
 		return 0;
 
-	req = fuse_get_request_nonint(fc);
+	req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -146,7 +137,7 @@ static int fuse_flush(struct file *file)
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	request_send_nonint(fc, req);
+	request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -171,7 +162,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTSYS;
+		return -EINTR;
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
@@ -224,7 +215,7 @@ size_t fuse_send_read_common(struct fuse_req *req, struct file *file,
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
-	request_send_nonint(fc, req);
+	request_send(fc, req);
 	return req->out.args[0].size;
 }
 
@@ -240,7 +231,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT;
-	struct fuse_req *req = fuse_get_request_nonint(fc);
+	struct fuse_req *req = fuse_get_request(fc);
 	int err = -EINTR;
 	if (!req)
 		goto out;
@@ -314,7 +305,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	int err;
 	data.file = file;
 	data.inode = inode;
-	data.req = fuse_get_request_nonint(fc);
+	data.req = fuse_get_request(fc);
 	if (!data.req)
 		return -EINTR;
 
@@ -350,7 +341,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(struct fuse_write_out);
 	req->out.args[0].value = &outarg;
-	request_send_nonint(fc, req);
+	request_send(fc, req);
 	return outarg.size;
 }
 
@@ -370,7 +361,7 @@ static int fuse_commit_write(struct file *file, struct page *page,
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset;
-	struct fuse_req *req = fuse_get_request_nonint(fc);
+	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -444,7 +435,7 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 	ssize_t res = 0;
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTSYS;
+		return -EINTR;
 
 	while (count) {
 		size_t tmp;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d7647289d8a..24d761518d8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -409,11 +409,6 @@ void fuse_reset_request(struct fuse_req *req);
  */
 struct fuse_req *fuse_get_request(struct fuse_conn *fc);
 
-/**
- * Reserve a preallocated request, only interruptible by SIGKILL
- */
-struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc);
-
 /**
  * Decrement reference count of a request.  If count goes to zero put
  * on unused list (preallocated) or free reqest (not preallocated).
@@ -421,15 +416,10 @@ struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc);
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
- * Send a request (synchronous, interruptible)
+ * Send a request (synchronous)
  */
 void request_send(struct fuse_conn *fc, struct fuse_req *req);
 
-/**
- * Send a request (synchronous, non-interruptible except by SIGKILL)
- */
-void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req);
-
 /**
  * Send a request with no reply
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 52e954f4bb9..e69a546844d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -236,7 +236,7 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
 
         req = fuse_get_request(fc);
 	if (!req)
-		return -ERESTARTSYS;
+		return -EINTR;
 
 	req->in.numargs = 0;
 	req->in.h.opcode = FUSE_STATFS;
-- 
cgit v1.2.3-70-g09d2


From a9f6a0dd54efea2a5d57a27e6c232f9197c25154 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 9 Sep 2005 13:10:41 -0700
Subject: [PATCH] more SPIN_LOCK_UNLOCKED -> DEFINE_SPINLOCK conversions

This converts the final 20 DEFINE_SPINLOCK holdouts.  (another 580 places
are already using DEFINE_SPINLOCK).  Build tested on x86.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mach-pxa/corgi_ssp.c     | 2 +-
 arch/ia64/kernel/perfmon.c        | 2 +-
 arch/ia64/sn/kernel/xpnet.c       | 2 +-
 arch/mips/kernel/genrtc.c         | 2 +-
 arch/mips/kernel/i8259.c          | 2 +-
 arch/ppc/platforms/hdpu.c         | 2 +-
 arch/ppc/syslib/mv64x60.c         | 2 +-
 arch/ppc/syslib/qspan_pci.c       | 2 +-
 arch/ppc64/kernel/pmc.c           | 2 +-
 arch/sparc/lib/atomic32.c         | 2 +-
 drivers/net/mv643xx_eth.c         | 2 +-
 drivers/parisc/iosapic.c          | 2 +-
 drivers/scsi/ch.c                 | 2 +-
 drivers/video/geode/display_gx1.c | 2 +-
 fs/xfs/support/ktrace.c           | 2 +-
 net/core/netpoll.c                | 4 ++--
 net/core/pktgen.c                 | 2 +-
 17 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/arch/arm/mach-pxa/corgi_ssp.c b/arch/arm/mach-pxa/corgi_ssp.c
index 8ccffba0018..366a9bde3d8 100644
--- a/arch/arm/mach-pxa/corgi_ssp.c
+++ b/arch/arm/mach-pxa/corgi_ssp.c
@@ -22,7 +22,7 @@
 #include <asm/arch/corgi.h>
 #include <asm/arch/pxa-regs.h>
 
-static spinlock_t corgi_ssp_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(corgi_ssp_lock);
 static struct ssp_dev corgi_ssp_dev;
 static struct ssp_state corgi_ssp_state;
 
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 4ad97b3b39d..1650353e3f7 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -497,7 +497,7 @@ typedef struct {
 static pfm_stats_t		pfm_stats[NR_CPUS];
 static pfm_session_t		pfm_sessions;	/* global sessions information */
 
-static spinlock_t pfm_alt_install_check = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pfm_alt_install_check);
 static pfm_intr_handler_desc_t  *pfm_alt_intr_handler;
 
 static struct proc_dir_entry 	*perfmon_dir;
diff --git a/arch/ia64/sn/kernel/xpnet.c b/arch/ia64/sn/kernel/xpnet.c
index d0c2c114a45..e5c6d3c0a8e 100644
--- a/arch/ia64/sn/kernel/xpnet.c
+++ b/arch/ia64/sn/kernel/xpnet.c
@@ -130,7 +130,7 @@ struct net_device *xpnet_device;
  */
 static u64 xpnet_broadcast_partitions;
 /* protect above */
-static spinlock_t xpnet_broadcast_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(xpnet_broadcast_lock);
 
 /*
  * Since the Block Transfer Engine (BTE) is being used for the transfer
diff --git a/arch/mips/kernel/genrtc.c b/arch/mips/kernel/genrtc.c
index 288bf51ad4e..71416e7bbba 100644
--- a/arch/mips/kernel/genrtc.c
+++ b/arch/mips/kernel/genrtc.c
@@ -14,7 +14,7 @@
 #include <asm/rtc.h>
 #include <asm/time.h>
 
-static spinlock_t mips_rtc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(mips_rtc_lock);
 
 unsigned int get_rtc_time(struct rtc_time *time)
 {
diff --git a/arch/mips/kernel/i8259.c b/arch/mips/kernel/i8259.c
index 7eec7568bfe..447759201d1 100644
--- a/arch/mips/kernel/i8259.c
+++ b/arch/mips/kernel/i8259.c
@@ -31,7 +31,7 @@ void disable_8259A_irq(unsigned int irq);
  * moves to arch independent land
  */
 
-spinlock_t i8259A_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t DEFINE_SPINLOCK(i8259A_lock);
 
 static void end_8259A_irq (unsigned int irq)
 {
diff --git a/arch/ppc/platforms/hdpu.c b/arch/ppc/platforms/hdpu.c
index b659d7b3d74..ff379686012 100644
--- a/arch/ppc/platforms/hdpu.c
+++ b/arch/ppc/platforms/hdpu.c
@@ -58,7 +58,7 @@ static void parse_bootinfo(unsigned long r3,
 static void hdpu_set_l1pe(void);
 static void hdpu_cpustate_set(unsigned char new_state);
 #ifdef CONFIG_SMP
-static spinlock_t timebase_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(timebase_lock);
 static unsigned int timebase_upper = 0, timebase_lower = 0;
 extern int smp_tb_synchronized;
 
diff --git a/arch/ppc/syslib/mv64x60.c b/arch/ppc/syslib/mv64x60.c
index 6262b11f366..839f8872826 100644
--- a/arch/ppc/syslib/mv64x60.c
+++ b/arch/ppc/syslib/mv64x60.c
@@ -31,7 +31,7 @@
 
 
 u8 mv64x60_pci_exclude_bridge = 1;
-spinlock_t	mv64x60_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(mv64x60_lock);
 
 static phys_addr_t 	mv64x60_bridge_pbase;
 static void 		*mv64x60_bridge_vbase;
diff --git a/arch/ppc/syslib/qspan_pci.c b/arch/ppc/syslib/qspan_pci.c
index 57f4ed5e5ae..0970b5d3039 100644
--- a/arch/ppc/syslib/qspan_pci.c
+++ b/arch/ppc/syslib/qspan_pci.c
@@ -94,7 +94,7 @@
 #define mk_config_type1(bus, dev, offset) \
 	mk_config_addr(bus, dev, offset) | 1;
 
-static spinlock_t pcibios_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pcibios_lock);
 
 int qspan_pcibios_read_config_byte(unsigned char bus, unsigned char dev_fn,
 				  unsigned char offset, unsigned char *val)
diff --git a/arch/ppc64/kernel/pmc.c b/arch/ppc64/kernel/pmc.c
index cdfec7438d0..63d9481c3ec 100644
--- a/arch/ppc64/kernel/pmc.c
+++ b/arch/ppc64/kernel/pmc.c
@@ -26,7 +26,7 @@ static void dummy_perf(struct pt_regs *regs)
 	mtspr(SPRN_MMCR0, mmcr0);
 }
 
-static spinlock_t pmc_owner_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pmc_owner_lock);
 static void *pmc_owner_caller; /* mostly for debugging */
 perf_irq_t perf_irq = dummy_perf;
 
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index 19724c5800a..2e64e8c3e8e 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -20,7 +20,7 @@ spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = {
 
 #else /* SMP */
 
-static spinlock_t dummy = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dummy);
 #define ATOMIC_HASH_SIZE	1
 #define ATOMIC_HASH(a)		(&dummy)
 
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 7c9dbc8c942..25c9a99c377 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -94,7 +94,7 @@ static char mv643xx_driver_version[] = "1.0";
 static void __iomem *mv643xx_eth_shared_base;
 
 /* used to protect MV643XX_ETH_SMI_REG, which is shared across ports */
-static spinlock_t mv643xx_eth_phy_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(mv643xx_eth_phy_lock);
 
 static inline u32 mv_read(int offset)
 {
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 91df0bf181d..7a57c1b8373 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -215,7 +215,7 @@ static inline void iosapic_write(void __iomem *iosapic, unsigned int reg, u32 va
 #define IOSAPIC_IRDT_ID_EID_SHIFT              0x10
 
 
-static spinlock_t iosapic_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(iosapic_lock);
 
 static inline void iosapic_eoi(void __iomem *addr, unsigned int data)
 {
diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c
index bd0e1b6be1e..13ecd0c4740 100644
--- a/drivers/scsi/ch.c
+++ b/drivers/scsi/ch.c
@@ -116,7 +116,7 @@ typedef struct {
 } scsi_changer;
 
 static LIST_HEAD(ch_devlist);
-static spinlock_t ch_devlist_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ch_devlist_lock);
 static int ch_devcount;
 
 static struct scsi_driver ch_template =
diff --git a/drivers/video/geode/display_gx1.c b/drivers/video/geode/display_gx1.c
index f4983879fcc..926d53eeb54 100644
--- a/drivers/video/geode/display_gx1.c
+++ b/drivers/video/geode/display_gx1.c
@@ -22,7 +22,7 @@
 #include "geodefb.h"
 #include "display_gx1.h"
 
-static spinlock_t gx1_conf_reg_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(gx1_conf_reg_lock);
 
 static u8 gx1_read_conf_reg(u8 reg)
 {
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 3dae14c8c55..fa8394f9437 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -170,7 +170,7 @@ ktrace_enter(
 	void            *val14,
 	void            *val15)
 {
-	static lock_t   wrap_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(wrap_lock);
 	unsigned long	flags;
 	int             index;
 	ktrace_entry_t  *ktep;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a1a9a7abff5..5265dfd6992 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -645,10 +645,10 @@ int netpoll_setup(struct netpoll *np)
 
 		npinfo->rx_flags = 0;
 		npinfo->rx_np = NULL;
-		npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&npinfo->poll_lock);
 		npinfo->poll_owner = -1;
 		npinfo->tries = MAX_RETRIES;
-		npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&npinfo->rx_lock);
 	} else
 		npinfo = ndev->npinfo;
 
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8eb083b6041..b3ad49fa7d7 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -503,7 +503,7 @@ static int pg_delay_d = 0;
 static int pg_clone_skb_d = 0;
 static int debug = 0;
 
-static spinlock_t _thread_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(_thread_lock);
 static struct pktgen_thread *pktgen_threads = NULL;
 
 static char module_fname[128];
-- 
cgit v1.2.3-70-g09d2


From d99901d6fdfb4098b9996de89ffbbae890e08288 Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Fri, 9 Sep 2005 13:59:48 +0400
Subject: [PATCH] Lost sockfd_put() in routing_ioctl()

This patch adds lost sockfd_put() in 32bit compat rounting_ioctl() on
64bit platforms

Signed-Off-By: Kirill Korotaev <dev@sw.ru>
Signed-Off-By: Maxim Giryaev <gem@sw.ru>
Signed-off-By: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat_ioctl.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 155e612635f..e28a74203f3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -798,13 +798,16 @@ static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 		r = (void *) &r4;
 	}
 
-	if (ret)
-		return -EFAULT;
+	if (ret) {
+		ret = -EFAULT;
+		goto out;
+	}
 
 	set_fs (KERNEL_DS);
 	ret = sys_ioctl (fd, cmd, (unsigned long) r);
 	set_fs (old_fs);
 
+out:
 	if (mysock)
 		sockfd_put(mysock);
 
-- 
cgit v1.2.3-70-g09d2


From a4531edd75522804dd2b268d8ccc5eaa70748011 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 9 Sep 2005 15:10:52 -0700
Subject: Fix up lost patch in compat_sys_select() for new RCU files world
 order

Andrew lost this in patch reject resolution, and never noticed, since
the compat code isn't in use on x86.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 8c665705c6a..c2e0813164b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1619,6 +1619,7 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
 	char *bits;
 	long timeout;
 	int size, max_fdset, ret = -EINVAL;
+	struct fdtable *fdt;
 
 	timeout = MAX_SCHEDULE_TIMEOUT;
 	if (tvp) {
@@ -1644,7 +1645,8 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
 		goto out_nofds;
 
 	/* max_fdset can increase, so grab it once to avoid race */
-	max_fdset = current->files->max_fdset;
+	fdt = files_fdtable(current->files);
+	max_fdset = fdt->max_fdset;
 	if (n > max_fdset)
 		n = max_fdset;
 
-- 
cgit v1.2.3-70-g09d2


From ac5b8b6f22118620cd1133d9943b1f31dc40a913 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 9 Sep 2005 15:42:34 -0700
Subject: Preempt-safe RCU file usage

Fix up fs/compat.c fixes.
---
 fs/compat.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index c2e0813164b..ac3fb9ed8ee 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1645,8 +1645,10 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
 		goto out_nofds;
 
 	/* max_fdset can increase, so grab it once to avoid race */
+	rcu_read_lock();
 	fdt = files_fdtable(current->files);
 	max_fdset = fdt->max_fdset;
+	rcu_read_unlock();
 	if (n > max_fdset)
 		n = max_fdset;
 
-- 
cgit v1.2.3-70-g09d2


From b4012a9895b3e28e3bff3aa534d58c7827af6d4f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 10 Sep 2005 00:25:47 -0700
Subject: [PATCH] ntfs build fix

*** Warning: "bit_spin_lock" [fs/ntfs/ntfs.ko] undefined!
*** Warning: "bit_spin_unlock" [fs/ntfs/ntfs.ko] undefined!

Cc: Anton Altaparmakov <aia21@cantab.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ntfs/aops.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 545236414d5..b6cc8cf2462 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
 
 #include "aops.h"
 #include "attrib.h"
-- 
cgit v1.2.3-70-g09d2


From fb1c8f93d869b34cacb8b8932e2b83d96a19d720 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 10 Sep 2005 00:25:56 -0700
Subject: [PATCH] spinlock consolidation

This patch (written by me and also containing many suggestions of Arjan van
de Ven) does a major cleanup of the spinlock code.  It does the following
things:

 - consolidates and enhances the spinlock/rwlock debugging code

 - simplifies the asm/spinlock.h files

 - encapsulates the raw spinlock type and moves generic spinlock
   features (such as ->break_lock) into the generic code.

 - cleans up the spinlock code hierarchy to get rid of the spaghetti.

Most notably there's now only a single variant of the debugging code,
located in lib/spinlock_debug.c.  (previously we had one SMP debugging
variant per architecture, plus a separate generic one for UP builds)

Also, i've enhanced the rwlock debugging facility, it will now track
write-owners.  There is new spinlock-owner/CPU-tracking on SMP builds too.
All locks have lockup detection now, which will work for both soft and hard
spin/rwlock lockups.

The arch-level include files now only contain the minimally necessary
subset of the spinlock code - all the rest that can be generalized now
lives in the generic headers:

 include/asm-i386/spinlock_types.h       |   16
 include/asm-x86_64/spinlock_types.h     |   16

I have also split up the various spinlock variants into separate files,
making it easier to see which does what. The new layout is:

   SMP                         |  UP
   ----------------------------|-----------------------------------
   asm/spinlock_types_smp.h    |  linux/spinlock_types_up.h
   linux/spinlock_types.h      |  linux/spinlock_types.h
   asm/spinlock_smp.h          |  linux/spinlock_up.h
   linux/spinlock_api_smp.h    |  linux/spinlock_api_up.h
   linux/spinlock.h            |  linux/spinlock.h

/*
 * here's the role of the various spinlock/rwlock related include files:
 *
 * on SMP builds:
 *
 *  asm/spinlock_types.h: contains the raw_spinlock_t/raw_rwlock_t and the
 *                        initializers
 *
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  asm/spinlock.h:       contains the __raw_spin_*()/etc. lowlevel
 *                        implementations, mostly inline assembly code
 *
 *   (also included on UP-debug builds:)
 *
 *  linux/spinlock_api_smp.h:
 *                        contains the prototypes for the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 *
 * on UP builds:
 *
 *  linux/spinlock_type_up.h:
 *                        contains the generic, simplified UP spinlock type.
 *                        (which is an empty structure on non-debug builds)
 *
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  linux/spinlock_up.h:
 *                        contains the __raw_spin_*()/etc. version of UP
 *                        builds. (which are NOPs on non-debug, non-preempt
 *                        builds)
 *
 *   (included on UP-non-debug builds:)
 *
 *  linux/spinlock_api_up.h:
 *                        builds the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 */

All SMP and UP architectures are converted by this patch.

arm, i386, ia64, ppc, ppc64, s390/s390x, x64 was build-tested via
crosscompilers.  m32r, mips, sh, sparc, have not been tested yet, but should
be mostly fine.

From: Grant Grundler <grundler@parisc-linux.org>

  Booted and lightly tested on a500-44 (64-bit, SMP kernel, dual CPU).
  Builds 32-bit SMP kernel (not booted or tested).  I did not try to build
  non-SMP kernels.  That should be trivial to fix up later if necessary.

  I converted bit ops atomic_hash lock to raw_spinlock_t.  Doing so avoids
  some ugly nesting of linux/*.h and asm/*.h files.  Those particular locks
  are well tested and contained entirely inside arch specific code.  I do NOT
  expect any new issues to arise with them.

 If someone does ever need to use debug/metrics with them, then they will
  need to unravel this hairball between spinlocks, atomic ops, and bit ops
  that exist only because parisc has exactly one atomic instruction: LDCW
  (load and clear word).

From: "Luck, Tony" <tony.luck@intel.com>

   ia64 fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjanv@infradead.org>
Signed-off-by: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <willy@debian.org>
Signed-off-by: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Mikael Pettersson <mikpe@csd.uu.se>
Signed-off-by: Benoit Boissinot <benoit.boissinot@ens-lyon.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/alpha_ksyms.c      |   9 -
 arch/alpha/kernel/smp.c              | 172 ----------
 arch/ia64/kernel/mca.c               |  11 +-
 arch/m32r/kernel/smp.c               |  48 +--
 arch/mips/lib/dec_and_lock.c         |   8 -
 arch/parisc/lib/Makefile             |   2 -
 arch/parisc/lib/bitops.c             |   4 +-
 arch/parisc/lib/debuglocks.c         | 277 ----------------
 arch/ppc/lib/Makefile                |   1 -
 arch/ppc/lib/dec_and_lock.c          |   8 -
 arch/ppc64/lib/dec_and_lock.c        |   8 -
 arch/ppc64/lib/locks.c               |  14 +-
 arch/s390/lib/spinlock.c             |  12 +-
 arch/sparc/kernel/sparc_ksyms.c      |  10 -
 arch/sparc/lib/Makefile              |   2 -
 arch/sparc/lib/debuglocks.c          | 202 -----------
 arch/sparc64/kernel/process.c        |   5 -
 arch/sparc64/kernel/sparc64_ksyms.c  |   5 -
 arch/sparc64/lib/Makefile            |   1 -
 arch/sparc64/lib/debuglocks.c        | 366 --------------------
 fs/buffer.c                          |   1 +
 include/asm-alpha/spinlock.h         |  96 ++----
 include/asm-alpha/spinlock_types.h   |  20 ++
 include/asm-arm/spinlock.h           |  50 +--
 include/asm-arm/spinlock_types.h     |  20 ++
 include/asm-i386/spinlock.h          | 200 ++++-------
 include/asm-i386/spinlock_types.h    |  20 ++
 include/asm-ia64/spinlock.h          |  69 ++--
 include/asm-ia64/spinlock_types.h    |  21 ++
 include/asm-m32r/spinlock.h          | 127 ++-----
 include/asm-m32r/spinlock_types.h    |  23 ++
 include/asm-mips/spinlock.h          |  75 ++---
 include/asm-mips/spinlock_types.h    |  20 ++
 include/asm-parisc/atomic.h          |  12 +-
 include/asm-parisc/bitops.h          |   2 +-
 include/asm-parisc/cacheflush.h      |   1 +
 include/asm-parisc/processor.h       |   1 +
 include/asm-parisc/spinlock.h        | 163 ++-------
 include/asm-parisc/spinlock_types.h  |  21 ++
 include/asm-parisc/system.h          |  24 +-
 include/asm-ppc/spinlock.h           |  91 ++---
 include/asm-ppc/spinlock_types.h     |  20 ++
 include/asm-ppc64/spinlock.h         | 191 +++++------
 include/asm-ppc64/spinlock_types.h   |  20 ++
 include/asm-s390/spinlock.h          |  63 ++--
 include/asm-s390/spinlock_types.h    |  21 ++
 include/asm-sh/spinlock.h            |  61 ++--
 include/asm-sh/spinlock_types.h      |  22 ++
 include/asm-sparc/spinlock.h         | 140 ++------
 include/asm-sparc/spinlock_types.h   |  20 ++
 include/asm-sparc64/spinlock.h       | 160 ++-------
 include/asm-sparc64/spinlock_types.h |  20 ++
 include/asm-x86_64/spinlock.h        | 164 +++------
 include/asm-x86_64/spinlock_types.h  |  20 ++
 include/linux/bit_spinlock.h         |  77 +++++
 include/linux/jbd.h                  |   1 +
 include/linux/spinlock.h             | 627 +++++++----------------------------
 include/linux/spinlock_api_smp.h     |  57 ++++
 include/linux/spinlock_api_up.h      |  80 +++++
 include/linux/spinlock_types.h       |  67 ++++
 include/linux/spinlock_types_up.h    |  51 +++
 include/linux/spinlock_up.h          |  74 +++++
 kernel/Makefile                      |   1 +
 kernel/sched.c                       |   4 +
 kernel/spinlock.c                    |  15 +-
 lib/Makefile                         |   1 +
 lib/dec_and_lock.c                   |   3 -
 lib/kernel_lock.c                    |   3 +-
 lib/spinlock_debug.c                 | 257 ++++++++++++++
 69 files changed, 1591 insertions(+), 2871 deletions(-)
 delete mode 100644 arch/parisc/lib/debuglocks.c
 delete mode 100644 arch/sparc/lib/debuglocks.c
 delete mode 100644 arch/sparc64/lib/debuglocks.c
 create mode 100644 include/asm-alpha/spinlock_types.h
 create mode 100644 include/asm-arm/spinlock_types.h
 create mode 100644 include/asm-i386/spinlock_types.h
 create mode 100644 include/asm-ia64/spinlock_types.h
 create mode 100644 include/asm-m32r/spinlock_types.h
 create mode 100644 include/asm-mips/spinlock_types.h
 create mode 100644 include/asm-parisc/spinlock_types.h
 create mode 100644 include/asm-ppc/spinlock_types.h
 create mode 100644 include/asm-ppc64/spinlock_types.h
 create mode 100644 include/asm-s390/spinlock_types.h
 create mode 100644 include/asm-sh/spinlock_types.h
 create mode 100644 include/asm-sparc/spinlock_types.h
 create mode 100644 include/asm-sparc64/spinlock_types.h
 create mode 100644 include/asm-x86_64/spinlock_types.h
 create mode 100644 include/linux/bit_spinlock.h
 create mode 100644 include/linux/spinlock_api_smp.h
 create mode 100644 include/linux/spinlock_api_up.h
 create mode 100644 include/linux/spinlock_types.h
 create mode 100644 include/linux/spinlock_types_up.h
 create mode 100644 include/linux/spinlock_up.h
 create mode 100644 lib/spinlock_debug.c

(limited to 'fs')

diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c
index fc5ef90c4fc..24ae9a36607 100644
--- a/arch/alpha/kernel/alpha_ksyms.c
+++ b/arch/alpha/kernel/alpha_ksyms.c
@@ -185,15 +185,6 @@ EXPORT_SYMBOL(smp_num_cpus);
 EXPORT_SYMBOL(smp_call_function);
 EXPORT_SYMBOL(smp_call_function_on_cpu);
 EXPORT_SYMBOL(_atomic_dec_and_lock);
-#ifdef CONFIG_DEBUG_SPINLOCK
-EXPORT_SYMBOL(_raw_spin_unlock);
-EXPORT_SYMBOL(debug_spin_lock);
-EXPORT_SYMBOL(debug_spin_trylock);
-#endif
-#ifdef CONFIG_DEBUG_RWLOCK
-EXPORT_SYMBOL(_raw_write_lock);
-EXPORT_SYMBOL(_raw_read_lock);
-#endif
 EXPORT_SYMBOL(cpu_present_mask);
 #endif /* CONFIG_SMP */
 
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index e211aa7404e..da0be346579 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -989,175 +989,3 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
 
 	preempt_enable();
 }
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-void
-_raw_spin_unlock(spinlock_t * lock)
-{
-	mb();
-	lock->lock = 0;
-
-	lock->on_cpu = -1;
-	lock->previous = NULL;
-	lock->task = NULL;
-	lock->base_file = "none";
-	lock->line_no = 0;
-}
-
-void
-debug_spin_lock(spinlock_t * lock, const char *base_file, int line_no)
-{
-	long tmp;
-	long stuck;
-	void *inline_pc = __builtin_return_address(0);
-	unsigned long started = jiffies;
-	int printed = 0;
-	int cpu = smp_processor_id();
-
-	stuck = 1L << 30;
- try_again:
-
-	/* Use sub-sections to put the actual loop at the end
-	   of this object file's text section so as to perfect
-	   branch prediction.  */
-	__asm__ __volatile__(
-	"1:	ldl_l	%0,%1\n"
-	"	subq	%2,1,%2\n"
-	"	blbs	%0,2f\n"
-	"	or	%0,1,%0\n"
-	"	stl_c	%0,%1\n"
-	"	beq	%0,3f\n"
-	"4:	mb\n"
-	".subsection 2\n"
-	"2:	ldl	%0,%1\n"
-	"	subq	%2,1,%2\n"
-	"3:	blt	%2,4b\n"
-	"	blbs	%0,2b\n"
-	"	br	1b\n"
-	".previous"
-	: "=r" (tmp), "=m" (lock->lock), "=r" (stuck)
-	: "m" (lock->lock), "2" (stuck) : "memory");
-
-	if (stuck < 0) {
-		printk(KERN_WARNING
-		       "%s:%d spinlock stuck in %s at %p(%d)"
-		       " owner %s at %p(%d) %s:%d\n",
-		       base_file, line_no,
-		       current->comm, inline_pc, cpu,
-		       lock->task->comm, lock->previous,
-		       lock->on_cpu, lock->base_file, lock->line_no);
-		stuck = 1L << 36;
-		printed = 1;
-		goto try_again;
-	}
-
-	/* Exiting.  Got the lock.  */
-	lock->on_cpu = cpu;
-	lock->previous = inline_pc;
-	lock->task = current;
-	lock->base_file = base_file;
-	lock->line_no = line_no;
-
-	if (printed) {
-		printk(KERN_WARNING
-		       "%s:%d spinlock grabbed in %s at %p(%d) %ld ticks\n",
-		       base_file, line_no, current->comm, inline_pc,
-		       cpu, jiffies - started);
-	}
-}
-
-int
-debug_spin_trylock(spinlock_t * lock, const char *base_file, int line_no)
-{
-	int ret;
-	if ((ret = !test_and_set_bit(0, lock))) {
-		lock->on_cpu = smp_processor_id();
-		lock->previous = __builtin_return_address(0);
-		lock->task = current;
-	} else {
-		lock->base_file = base_file;
-		lock->line_no = line_no;
-	}
-	return ret;
-}
-#endif /* CONFIG_DEBUG_SPINLOCK */
-
-#ifdef CONFIG_DEBUG_RWLOCK
-void _raw_write_lock(rwlock_t * lock)
-{
-	long regx, regy;
-	int stuck_lock, stuck_reader;
-	void *inline_pc = __builtin_return_address(0);
-
- try_again:
-
-	stuck_lock = 1<<30;
-	stuck_reader = 1<<30;
-
-	__asm__ __volatile__(
-	"1:	ldl_l	%1,%0\n"
-	"	blbs	%1,6f\n"
-	"	blt	%1,8f\n"
-	"	mov	1,%1\n"
-	"	stl_c	%1,%0\n"
-	"	beq	%1,6f\n"
-	"4:	mb\n"
-	".subsection 2\n"
-	"6:	blt	%3,4b	# debug\n"
-	"	subl	%3,1,%3	# debug\n"
-	"	ldl	%1,%0\n"
-	"	blbs	%1,6b\n"
-	"8:	blt	%4,4b	# debug\n"
-	"	subl	%4,1,%4	# debug\n"
-	"	ldl	%1,%0\n"
-	"	blt	%1,8b\n"
-	"	br	1b\n"
-	".previous"
-	: "=m" (*(volatile int *)lock), "=&r" (regx), "=&r" (regy),
-	  "=&r" (stuck_lock), "=&r" (stuck_reader)
-	: "m" (*(volatile int *)lock), "3" (stuck_lock), "4" (stuck_reader) : "memory");
-
-	if (stuck_lock < 0) {
-		printk(KERN_WARNING "write_lock stuck at %p\n", inline_pc);
-		goto try_again;
-	}
-	if (stuck_reader < 0) {
-		printk(KERN_WARNING "write_lock stuck on readers at %p\n",
-		       inline_pc);
-		goto try_again;
-	}
-}
-
-void _raw_read_lock(rwlock_t * lock)
-{
-	long regx;
-	int stuck_lock;
-	void *inline_pc = __builtin_return_address(0);
-
- try_again:
-
-	stuck_lock = 1<<30;
-
-	__asm__ __volatile__(
-	"1:	ldl_l	%1,%0;"
-	"	blbs	%1,6f;"
-	"	subl	%1,2,%1;"
-	"	stl_c	%1,%0;"
-	"	beq	%1,6f;"
-	"4:	mb\n"
-	".subsection 2\n"
-	"6:	ldl	%1,%0;"
-	"	blt	%2,4b	# debug\n"
-	"	subl	%2,1,%2	# debug\n"
-	"	blbs	%1,6b;"
-	"	br	1b\n"
-	".previous"
-	: "=m" (*(volatile int *)lock), "=&r" (regx), "=&r" (stuck_lock)
-	: "m" (*(volatile int *)lock), "2" (stuck_lock) : "memory");
-
-	if (stuck_lock < 0) {
-		printk(KERN_WARNING "read_lock stuck at %p\n", inline_pc);
-		goto try_again;
-	}
-}
-#endif /* CONFIG_DEBUG_RWLOCK */
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 4ebbf397438..8d484204a3f 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -491,12 +491,7 @@ init_handler_platform (pal_min_state_area_t *ms,
 	unw_init_from_interruption(&info, current, pt, sw);
 	ia64_do_show_stack(&info, NULL);
 
-#ifdef CONFIG_SMP
-	/* read_trylock() would be handy... */
-	if (!tasklist_lock.write_lock)
-		read_lock(&tasklist_lock);
-#endif
-	{
+	if (read_trylock(&tasklist_lock)) {
 		struct task_struct *g, *t;
 		do_each_thread (g, t) {
 			if (t == current)
@@ -506,10 +501,6 @@ init_handler_platform (pal_min_state_area_t *ms,
 			show_stack(t, NULL);
 		} while_each_thread (g, t);
 	}
-#ifdef CONFIG_SMP
-	if (!tasklist_lock.write_lock)
-		read_unlock(&tasklist_lock);
-#endif
 
 	printk("\nINIT dump complete.  Please reboot now.\n");
 	while (1);			/* hang city if no debugger */
diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c
index 48b187f2d2b..a4576ac7e87 100644
--- a/arch/m32r/kernel/smp.c
+++ b/arch/m32r/kernel/smp.c
@@ -892,7 +892,6 @@ unsigned long send_IPI_mask_phys(cpumask_t physid_mask, int ipi_num,
 	int try)
 {
 	spinlock_t *ipilock;
-	unsigned long flags = 0;
 	volatile unsigned long *ipicr_addr;
 	unsigned long ipicr_val;
 	unsigned long my_physid_mask;
@@ -916,50 +915,27 @@ unsigned long send_IPI_mask_phys(cpumask_t physid_mask, int ipi_num,
 	 * write IPICRi (send IPIi)
 	 * unlock ipi_lock[i]
 	 */
+	spin_lock(ipilock);
 	__asm__ __volatile__ (
-		";; LOCK ipi_lock[i]		\n\t"
+		";; CHECK IPICRi == 0		\n\t"
 		".fillinsn			\n"
 		"1:				\n\t"
-		"mvfc	%1, psw 		\n\t"
-		"clrpsw	#0x40 -> nop		\n\t"
-		DCACHE_CLEAR("r4", "r5", "%2")
-		"lock	r4, @%2			\n\t"
-		"addi	r4, #-1			\n\t"
-		"unlock	r4, @%2			\n\t"
-		"mvtc	%1, psw			\n\t"
-		"bnez	r4, 2f			\n\t"
-		LOCK_SECTION_START(".balign 4 \n\t")
-		".fillinsn			\n"
-		"2:				\n\t"
-		"ld	r4, @%2			\n\t"
-		"blez	r4, 2b			\n\t"
+		"ld	%0, @%1			\n\t"
+		"and	%0, %4			\n\t"
+		"beqz	%0, 2f			\n\t"
+		"bnez	%3, 3f			\n\t"
 		"bra	1b			\n\t"
-		LOCK_SECTION_END
-		";; CHECK IPICRi == 0		\n\t"
-		".fillinsn			\n"
-		"3:				\n\t"
-		"ld	%0, @%3			\n\t"
-		"and	%0, %6			\n\t"
-		"beqz	%0, 4f			\n\t"
-		"bnez	%5, 5f			\n\t"
-		"bra	3b			\n\t"
 		";; WRITE IPICRi (send IPIi)	\n\t"
 		".fillinsn			\n"
-		"4:				\n\t"
-		"st	%4, @%3			\n\t"
-		";; UNLOCK ipi_lock[i]		\n\t"
+		"2:				\n\t"
+		"st	%2, @%1			\n\t"
 		".fillinsn			\n"
-		"5:				\n\t"
-		"ldi	r4, #1			\n\t"
-		"st	r4, @%2			\n\t"
+		"3:				\n\t"
 		: "=&r"(ipicr_val)
-		: "r"(flags), "r"(&ipilock->slock), "r"(ipicr_addr),
-		  "r"(mask), "r"(try), "r"(my_physid_mask)
-		: "memory", "r4"
-#ifdef CONFIG_CHIP_M32700_TS1
-		, "r5"
-#endif	/* CONFIG_CHIP_M32700_TS1 */
+		: "r"(ipicr_addr), "r"(mask), "r"(try), "r"(my_physid_mask)
+		: "memory"
 	);
+	spin_unlock(ipilock);
 
 	return ipicr_val;
 }
diff --git a/arch/mips/lib/dec_and_lock.c b/arch/mips/lib/dec_and_lock.c
index e44e9579bd3..fd82c84a93b 100644
--- a/arch/mips/lib/dec_and_lock.c
+++ b/arch/mips/lib/dec_and_lock.c
@@ -20,14 +20,7 @@
  * has a cmpxchg, and where atomic->value is an int holding
  * the value of the atomic (i.e. the high bits aren't used
  * for a lock or anything like that).
- *
- * N.B. ATOMIC_DEC_AND_LOCK gets defined in include/linux/spinlock.h
- * if spinlocks are empty and thus atomic_dec_and_lock is defined
- * to be atomic_dec_and_test - in that case we don't need it
- * defined here as well.
  */
-
-#ifndef ATOMIC_DEC_AND_LOCK
 int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 {
 	int counter;
@@ -52,4 +45,3 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 }
 
 EXPORT_SYMBOL(_atomic_dec_and_lock);
-#endif /* ATOMIC_DEC_AND_LOCK */
diff --git a/arch/parisc/lib/Makefile b/arch/parisc/lib/Makefile
index 7bf70567629..5f2e6904d14 100644
--- a/arch/parisc/lib/Makefile
+++ b/arch/parisc/lib/Makefile
@@ -5,5 +5,3 @@
 lib-y	:= lusercopy.o bitops.o checksum.o io.o memset.o fixup.o memcpy.o
 
 obj-y	:= iomap.o
-
-lib-$(CONFIG_SMP) += debuglocks.o
diff --git a/arch/parisc/lib/bitops.c b/arch/parisc/lib/bitops.c
index 2de182f6fe8..90f400b1028 100644
--- a/arch/parisc/lib/bitops.c
+++ b/arch/parisc/lib/bitops.c
@@ -13,8 +13,8 @@
 #include <asm/atomic.h>
 
 #ifdef CONFIG_SMP
-spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned = {
-	[0 ... (ATOMIC_HASH_SIZE-1)]  = SPIN_LOCK_UNLOCKED
+raw_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned = {
+	[0 ... (ATOMIC_HASH_SIZE-1)]  = __RAW_SPIN_LOCK_UNLOCKED
 };
 #endif
 
diff --git a/arch/parisc/lib/debuglocks.c b/arch/parisc/lib/debuglocks.c
deleted file mode 100644
index 1b33fe6e5b7..00000000000
--- a/arch/parisc/lib/debuglocks.c
+++ /dev/null
@@ -1,277 +0,0 @@
-/* 
- *    Debugging versions of SMP locking primitives.
- *
- *    Copyright (C) 2004 Thibaut VARENE <varenet@parisc-linux.org>
- *
- *    Some code stollen from alpha & sparc64 ;)
- *
- *    This program is free software; you can redistribute it and/or modify
- *    it under the terms of the GNU General Public License as published by
- *    the Free Software Foundation; either version 2 of the License, or
- *    (at your option) any later version.
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *    GNU General Public License for more details.
- *
- *    You should have received a copy of the GNU General Public License
- *    along with this program; if not, write to the Free Software
- *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- *    We use pdc_printf() throughout the file for all output messages, to avoid
- *    losing messages because of disabled interrupts. Since we're using these
- *    messages for debugging purposes, it makes sense not to send them to the
- *    linux console.
- */
-
-
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/hardirq.h>	/* in_interrupt() */
-#include <asm/system.h>
-#include <asm/hardirq.h>	/* in_interrupt() */
-#include <asm/pdc.h>
-
-#undef INIT_STUCK
-#define INIT_STUCK 1L << 30
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-
-
-void _dbg_spin_lock(spinlock_t * lock, const char *base_file, int line_no)
-{
-	volatile unsigned int *a;
-	long stuck = INIT_STUCK;
-	void *inline_pc = __builtin_return_address(0);
-	unsigned long started = jiffies;
-	int printed = 0;
-	int cpu = smp_processor_id();
-
-try_again:
-
-	/* Do the actual locking */
-	/* <T-Bone> ggg: we can't get stuck on the outter loop?
-	 * <ggg> T-Bone: We can hit the outer loop
-	 *	alot if multiple CPUs are constantly racing for a lock
-	 *	and the backplane is NOT fair about which CPU sees
-	 *	the update first. But it won't hang since every failed
-	 *	attempt will drop us back into the inner loop and
-	 *	decrement `stuck'.
-	 * <ggg> K-class and some of the others are NOT fair in the HW
-	 * 	implementation so we could see false positives.
-	 * 	But fixing the lock contention is easier than
-	 * 	fixing the HW to be fair.
-	 * <tausq> __ldcw() returns 1 if we get the lock; otherwise we
-	 * 	spin until the value of the lock changes, or we time out.
-	 */
-	mb();
-	a = __ldcw_align(lock);
-	while (stuck && (__ldcw(a) == 0))
-		while ((*a == 0) && --stuck);
-	mb();
-
-	if (unlikely(stuck <= 0)) {
-		pdc_printf(
-			"%s:%d: spin_lock(%s/%p) stuck in %s at %p(%d)"
-			" owned by %s:%d in %s at %p(%d)\n",
-			base_file, line_no, lock->module, lock,
-			current->comm, inline_pc, cpu,
-			lock->bfile, lock->bline, lock->task->comm,
-			lock->previous, lock->oncpu);
-		stuck = INIT_STUCK;
-		printed = 1;
-		goto try_again;
-	}
-
-	/* Exiting.  Got the lock.  */
-	lock->oncpu = cpu;
-	lock->previous = inline_pc;
-	lock->task = current;
-	lock->bfile = (char *)base_file;
-	lock->bline = line_no;
-
-	if (unlikely(printed)) {
-		pdc_printf(
-			"%s:%d: spin_lock grabbed in %s at %p(%d) %ld ticks\n",
-			base_file, line_no, current->comm, inline_pc,
-			cpu, jiffies - started);
-	}
-}
-
-void _dbg_spin_unlock(spinlock_t * lock, const char *base_file, int line_no)
-{
-	CHECK_LOCK(lock);
-	volatile unsigned int *a;
-	mb();
-	a = __ldcw_align(lock);
-	if (unlikely((*a != 0) && lock->babble)) {
-		lock->babble--;
-		pdc_printf(
-			"%s:%d: spin_unlock(%s:%p) not locked\n",
-			base_file, line_no, lock->module, lock);
-	}
-	*a = 1;	
-	mb();
-}
-
-int _dbg_spin_trylock(spinlock_t * lock, const char *base_file, int line_no)
-{
-	int ret;
-	volatile unsigned int *a;
-	mb();
-	a = __ldcw_align(lock);
-	ret = (__ldcw(a) != 0);
-	mb();
-	if (ret) {
-		lock->oncpu = smp_processor_id();
-		lock->previous = __builtin_return_address(0);
-		lock->task = current;
-	} else {
-		lock->bfile = (char *)base_file;
-		lock->bline = line_no;
-	}
-	return ret;
-}
-
-#endif /* CONFIG_DEBUG_SPINLOCK */
-
-#ifdef CONFIG_DEBUG_RWLOCK
-
-/* Interrupts trouble detailed explanation, thx Grant:
- *
- * o writer (wants to modify data) attempts to acquire the rwlock
- * o He gets the write lock.
- * o Interupts are still enabled, we take an interrupt with the
- *   write still holding the lock.
- * o interrupt handler tries to acquire the rwlock for read.
- * o deadlock since the writer can't release it at this point.
- * 
- * In general, any use of spinlocks that competes between "base"
- * level and interrupt level code will risk deadlock. Interrupts
- * need to be disabled in the base level routines to avoid it.
- * Or more precisely, only the IRQ the base level routine
- * is competing with for the lock.  But it's more efficient/faster
- * to just disable all interrupts on that CPU to guarantee
- * once it gets the lock it can release it quickly too.
- */
- 
-void _dbg_write_lock(rwlock_t *rw, const char *bfile, int bline)
-{
-	void *inline_pc = __builtin_return_address(0);
-	unsigned long started = jiffies;
-	long stuck = INIT_STUCK;
-	int printed = 0;
-	int cpu = smp_processor_id();
-	
-	if(unlikely(in_interrupt())) {	/* acquiring write lock in interrupt context, bad idea */
-		pdc_printf("write_lock caller: %s:%d, IRQs enabled,\n", bfile, bline);
-		BUG();
-	}
-
-	/* Note: if interrupts are disabled (which is most likely), the printk
-	will never show on the console. We might need a polling method to flush
-	the dmesg buffer anyhow. */
-	
-retry:
-	_raw_spin_lock(&rw->lock);
-
-	if(rw->counter != 0) {
-		/* this basically never happens */
-		_raw_spin_unlock(&rw->lock);
-		
-		stuck--;
-		if ((unlikely(stuck <= 0)) && (rw->counter < 0)) {
-			pdc_printf(
-				"%s:%d: write_lock stuck on writer"
-				" in %s at %p(%d) %ld ticks\n",
-				bfile, bline, current->comm, inline_pc,
-				cpu, jiffies - started);
-			stuck = INIT_STUCK;
-			printed = 1;
-		}
-		else if (unlikely(stuck <= 0)) {
-			pdc_printf(
-				"%s:%d: write_lock stuck on reader"
-				" in %s at %p(%d) %ld ticks\n",
-				bfile, bline, current->comm, inline_pc,
-				cpu, jiffies - started);
-			stuck = INIT_STUCK;
-			printed = 1;
-		}
-		
-		while(rw->counter != 0);
-
-		goto retry;
-	}
-
-	/* got it.  now leave without unlocking */
-	rw->counter = -1; /* remember we are locked */
-
-	if (unlikely(printed)) {
-		pdc_printf(
-			"%s:%d: write_lock grabbed in %s at %p(%d) %ld ticks\n",
-			bfile, bline, current->comm, inline_pc,
-			cpu, jiffies - started);
-	}
-}
-
-int _dbg_write_trylock(rwlock_t *rw, const char *bfile, int bline)
-{
-#if 0
-	void *inline_pc = __builtin_return_address(0);
-	int cpu = smp_processor_id();
-#endif
-	
-	if(unlikely(in_interrupt())) {	/* acquiring write lock in interrupt context, bad idea */
-		pdc_printf("write_lock caller: %s:%d, IRQs enabled,\n", bfile, bline);
-		BUG();
-	}
-
-	/* Note: if interrupts are disabled (which is most likely), the printk
-	will never show on the console. We might need a polling method to flush
-	the dmesg buffer anyhow. */
-	
-	_raw_spin_lock(&rw->lock);
-
-	if(rw->counter != 0) {
-		/* this basically never happens */
-		_raw_spin_unlock(&rw->lock);
-		return 0;
-	}
-
-	/* got it.  now leave without unlocking */
-	rw->counter = -1; /* remember we are locked */
-#if 0
-	pdc_printf("%s:%d: try write_lock grabbed in %s at %p(%d)\n",
-		   bfile, bline, current->comm, inline_pc, cpu);
-#endif
-	return 1;
-}
-
-void _dbg_read_lock(rwlock_t * rw, const char *bfile, int bline)
-{
-#if 0
-	void *inline_pc = __builtin_return_address(0);
-	unsigned long started = jiffies;
-	int cpu = smp_processor_id();
-#endif
-	unsigned long flags;
-
-	local_irq_save(flags);
-	_raw_spin_lock(&rw->lock); 
-
-	rw->counter++;
-#if 0
-	pdc_printf(
-		"%s:%d: read_lock grabbed in %s at %p(%d) %ld ticks\n",
-		bfile, bline, current->comm, inline_pc,
-		cpu, jiffies - started);
-#endif
-	_raw_spin_unlock(&rw->lock);
-	local_irq_restore(flags);
-}
-
-#endif /* CONFIG_DEBUG_RWLOCK */
diff --git a/arch/ppc/lib/Makefile b/arch/ppc/lib/Makefile
index 1c380e67d43..f1e1fb4144f 100644
--- a/arch/ppc/lib/Makefile
+++ b/arch/ppc/lib/Makefile
@@ -4,6 +4,5 @@
 
 obj-y			:= checksum.o string.o strcase.o dec_and_lock.o div64.o
 
-obj-$(CONFIG_SMP)	+= locks.o
 obj-$(CONFIG_8xx)	+= rheap.o
 obj-$(CONFIG_CPM2)	+= rheap.o
diff --git a/arch/ppc/lib/dec_and_lock.c b/arch/ppc/lib/dec_and_lock.c
index 4ee888070d9..b18f0d9a00f 100644
--- a/arch/ppc/lib/dec_and_lock.c
+++ b/arch/ppc/lib/dec_and_lock.c
@@ -11,14 +11,7 @@
  * has a cmpxchg, and where atomic->value is an int holding
  * the value of the atomic (i.e. the high bits aren't used
  * for a lock or anything like that).
- *
- * N.B. ATOMIC_DEC_AND_LOCK gets defined in include/linux/spinlock.h
- * if spinlocks are empty and thus atomic_dec_and_lock is defined
- * to be atomic_dec_and_test - in that case we don't need it
- * defined here as well.
  */
-
-#ifndef ATOMIC_DEC_AND_LOCK
 int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 {
 	int counter;
@@ -43,4 +36,3 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 }
 
 EXPORT_SYMBOL(_atomic_dec_and_lock);
-#endif /* ATOMIC_DEC_AND_LOCK */
diff --git a/arch/ppc64/lib/dec_and_lock.c b/arch/ppc64/lib/dec_and_lock.c
index 6e8d8591708..7b9d4da5cf9 100644
--- a/arch/ppc64/lib/dec_and_lock.c
+++ b/arch/ppc64/lib/dec_and_lock.c
@@ -20,14 +20,7 @@
  * has a cmpxchg, and where atomic->value is an int holding
  * the value of the atomic (i.e. the high bits aren't used
  * for a lock or anything like that).
- *
- * N.B. ATOMIC_DEC_AND_LOCK gets defined in include/linux/spinlock.h
- * if spinlocks are empty and thus atomic_dec_and_lock is defined
- * to be atomic_dec_and_test - in that case we don't need it
- * defined here as well.
  */
-
-#ifndef ATOMIC_DEC_AND_LOCK
 int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 {
 	int counter;
@@ -52,4 +45,3 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 }
 
 EXPORT_SYMBOL(_atomic_dec_and_lock);
-#endif /* ATOMIC_DEC_AND_LOCK */
diff --git a/arch/ppc64/lib/locks.c b/arch/ppc64/lib/locks.c
index ef70ef91abe..033643ab69e 100644
--- a/arch/ppc64/lib/locks.c
+++ b/arch/ppc64/lib/locks.c
@@ -23,12 +23,12 @@
 /* waiting for a spinlock... */
 #if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)
 
-void __spin_yield(spinlock_t *lock)
+void __spin_yield(raw_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
 	struct paca_struct *holder_paca;
 
-	lock_value = lock->lock;
+	lock_value = lock->slock;
 	if (lock_value == 0)
 		return;
 	holder_cpu = lock_value & 0xffff;
@@ -38,7 +38,7 @@ void __spin_yield(spinlock_t *lock)
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
-	if (lock->lock != lock_value)
+	if (lock->slock != lock_value)
 		return;		/* something has changed */
 #ifdef CONFIG_PPC_ISERIES
 	HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc,
@@ -54,7 +54,7 @@ void __spin_yield(spinlock_t *lock)
  * This turns out to be the same for read and write locks, since
  * we only know the holder if it is write-locked.
  */
-void __rw_yield(rwlock_t *rw)
+void __rw_yield(raw_rwlock_t *rw)
 {
 	int lock_value;
 	unsigned int holder_cpu, yield_count;
@@ -82,9 +82,9 @@ void __rw_yield(rwlock_t *rw)
 }
 #endif
 
-void spin_unlock_wait(spinlock_t *lock)
+void __raw_spin_unlock_wait(raw_spinlock_t *lock)
 {
-	while (lock->lock) {
+	while (lock->slock) {
 		HMT_low();
 		if (SHARED_PROCESSOR)
 			__spin_yield(lock);
@@ -92,4 +92,4 @@ void spin_unlock_wait(spinlock_t *lock)
 	HMT_medium();
 }
 
-EXPORT_SYMBOL(spin_unlock_wait);
+EXPORT_SYMBOL(__raw_spin_unlock_wait);
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 888b5596c19..2dc14e9c832 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -36,7 +36,7 @@ _diag44(void)
 }
 
 void
-_raw_spin_lock_wait(spinlock_t *lp, unsigned int pc)
+_raw_spin_lock_wait(raw_spinlock_t *lp, unsigned int pc)
 {
 	int count = spin_retry;
 
@@ -53,7 +53,7 @@ _raw_spin_lock_wait(spinlock_t *lp, unsigned int pc)
 EXPORT_SYMBOL(_raw_spin_lock_wait);
 
 int
-_raw_spin_trylock_retry(spinlock_t *lp, unsigned int pc)
+_raw_spin_trylock_retry(raw_spinlock_t *lp, unsigned int pc)
 {
 	int count = spin_retry;
 
@@ -67,7 +67,7 @@ _raw_spin_trylock_retry(spinlock_t *lp, unsigned int pc)
 EXPORT_SYMBOL(_raw_spin_trylock_retry);
 
 void
-_raw_read_lock_wait(rwlock_t *rw)
+_raw_read_lock_wait(raw_rwlock_t *rw)
 {
 	unsigned int old;
 	int count = spin_retry;
@@ -86,7 +86,7 @@ _raw_read_lock_wait(rwlock_t *rw)
 EXPORT_SYMBOL(_raw_read_lock_wait);
 
 int
-_raw_read_trylock_retry(rwlock_t *rw)
+_raw_read_trylock_retry(raw_rwlock_t *rw)
 {
 	unsigned int old;
 	int count = spin_retry;
@@ -102,7 +102,7 @@ _raw_read_trylock_retry(rwlock_t *rw)
 EXPORT_SYMBOL(_raw_read_trylock_retry);
 
 void
-_raw_write_lock_wait(rwlock_t *rw)
+_raw_write_lock_wait(raw_rwlock_t *rw)
 {
 	int count = spin_retry;
 
@@ -119,7 +119,7 @@ _raw_write_lock_wait(rwlock_t *rw)
 EXPORT_SYMBOL(_raw_write_lock_wait);
 
 int
-_raw_write_trylock_retry(rwlock_t *rw)
+_raw_write_trylock_retry(raw_rwlock_t *rw)
 {
 	int count = spin_retry;
 
diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c
index 5d974a2b735..f8480933362 100644
--- a/arch/sparc/kernel/sparc_ksyms.c
+++ b/arch/sparc/kernel/sparc_ksyms.c
@@ -114,17 +114,7 @@ DOT_ALIAS2(unsigned, urem, unsigned, unsigned)
 /* used by various drivers */
 EXPORT_SYMBOL(sparc_cpu_model);
 EXPORT_SYMBOL(kernel_thread);
-#ifdef CONFIG_DEBUG_SPINLOCK
 #ifdef CONFIG_SMP
-EXPORT_SYMBOL(_do_spin_lock);
-EXPORT_SYMBOL(_do_spin_unlock);
-EXPORT_SYMBOL(_spin_trylock);
-EXPORT_SYMBOL(_do_read_lock);
-EXPORT_SYMBOL(_do_read_unlock);
-EXPORT_SYMBOL(_do_write_lock);
-EXPORT_SYMBOL(_do_write_unlock);
-#endif
-#else
 // XXX find what uses (or used) these.
 EXPORT_SYMBOL(___rw_read_enter);
 EXPORT_SYMBOL(___rw_read_exit);
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 2296ff9dc47..fa500694606 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -9,5 +9,3 @@ lib-y := mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \
 	 strncpy_from_user.o divdi3.o udivdi3.o strlen_user.o \
 	 copy_user.o locks.o atomic.o atomic32.o bitops.o \
 	 lshrdi3.o ashldi3.o rwsem.o muldi3.o bitext.o
-
-lib-$(CONFIG_DEBUG_SPINLOCK) +=	debuglocks.o
diff --git a/arch/sparc/lib/debuglocks.c b/arch/sparc/lib/debuglocks.c
deleted file mode 100644
index fb182352782..00000000000
--- a/arch/sparc/lib/debuglocks.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/* $Id: debuglocks.c,v 1.11 2001/09/20 00:35:31 davem Exp $
- * debuglocks.c: Debugging versions of SMP locking primitives.
- *
- * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 1998-99 Anton Blanchard (anton@progsoc.uts.edu.au)
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/threads.h>	/* For NR_CPUS */
-#include <linux/spinlock.h>
-#include <asm/psr.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_SMP
-
-/* Some notes on how these debugging routines work.  When a lock is acquired
- * an extra debugging member lock->owner_pc is set to the caller of the lock
- * acquisition routine.  Right before releasing a lock, the debugging program
- * counter is cleared to zero.
- *
- * Furthermore, since PC's are 4 byte aligned on Sparc, we stuff the CPU
- * number of the owner in the lowest two bits.
- */
-
-#define STORE_CALLER(A) __asm__ __volatile__("mov %%i7, %0" : "=r" (A));
-
-static inline void show(char *str, spinlock_t *lock, unsigned long caller)
-{
-	int cpu = smp_processor_id();
-
-	printk("%s(%p) CPU#%d stuck at %08lx, owner PC(%08lx):CPU(%lx)\n",str,
-		lock, cpu, caller, lock->owner_pc & ~3, lock->owner_pc & 3);
-}
-
-static inline void show_read(char *str, rwlock_t *lock, unsigned long caller)
-{
-	int cpu = smp_processor_id();
-
-	printk("%s(%p) CPU#%d stuck at %08lx, owner PC(%08lx):CPU(%lx)\n", str,
-		lock, cpu, caller, lock->owner_pc & ~3, lock->owner_pc & 3);
-}
-
-static inline void show_write(char *str, rwlock_t *lock, unsigned long caller)
-{
-	int cpu = smp_processor_id();
-	int i;
-
-	printk("%s(%p) CPU#%d stuck at %08lx, owner PC(%08lx):CPU(%lx)", str,
-		lock, cpu, caller, lock->owner_pc & ~3, lock->owner_pc & 3);
-
-	for(i = 0; i < NR_CPUS; i++)
-		printk(" reader[%d]=%08lx", i, lock->reader_pc[i]);
-
-	printk("\n");
-}
-
-#undef INIT_STUCK
-#define INIT_STUCK 100000000
-
-void _do_spin_lock(spinlock_t *lock, char *str)
-{
-	unsigned long caller;
-	unsigned long val;
-	int cpu = smp_processor_id();
-	int stuck = INIT_STUCK;
-
-	STORE_CALLER(caller);
-
-again:
-	__asm__ __volatile__("ldstub [%1], %0" : "=r" (val) : "r" (&(lock->lock)));
-	if(val) {
-		while(lock->lock) {
-			if (!--stuck) {
-				show(str, lock, caller);
-				stuck = INIT_STUCK;
-			}
-			barrier();
-		}
-		goto again;
-	}
-	lock->owner_pc = (cpu & 3) | (caller & ~3);
-}
-
-int _spin_trylock(spinlock_t *lock)
-{
-	unsigned long val;
-	unsigned long caller;
-	int cpu = smp_processor_id();
-
-	STORE_CALLER(caller);
-
-	__asm__ __volatile__("ldstub [%1], %0" : "=r" (val) : "r" (&(lock->lock)));
-	if(!val) {
-		/* We got it, record our identity for debugging. */
-		lock->owner_pc = (cpu & 3) | (caller & ~3);
-	}
-	return val == 0;
-}
-
-void _do_spin_unlock(spinlock_t *lock)
-{
-	lock->owner_pc = 0;
-	barrier();
-	lock->lock = 0;
-}
-
-void _do_read_lock(rwlock_t *rw, char *str)
-{
-	unsigned long caller;
-	unsigned long val;
-	int cpu = smp_processor_id();
-	int stuck = INIT_STUCK;
-
-	STORE_CALLER(caller);
-
-wlock_again:
-	__asm__ __volatile__("ldstub [%1 + 3], %0" : "=r" (val) : "r" (&(rw->lock)));
-	if(val) {
-		while(rw->lock & 0xff) {
-			if (!--stuck) {
-				show_read(str, rw, caller);
-				stuck = INIT_STUCK;
-			}
-			barrier();
-		}
-		goto wlock_again;
-	}
-
-	rw->reader_pc[cpu] = caller;
-	barrier();
-	rw->lock++;
-}
-
-void _do_read_unlock(rwlock_t *rw, char *str)
-{
-	unsigned long caller;
-	unsigned long val;
-	int cpu = smp_processor_id();
-	int stuck = INIT_STUCK;
-
-	STORE_CALLER(caller);
-
-wlock_again:
-	__asm__ __volatile__("ldstub [%1 + 3], %0" : "=r" (val) : "r" (&(rw->lock)));
-	if(val) {
-		while(rw->lock & 0xff) {
-			if (!--stuck) {
-				show_read(str, rw, caller);
-				stuck = INIT_STUCK;
-			}
-			barrier();
-		}
-		goto wlock_again;
-	}
-
-	rw->reader_pc[cpu] = 0;
-	barrier();
-	rw->lock -= 0x1ff;
-}
-
-void _do_write_lock(rwlock_t *rw, char *str)
-{
-	unsigned long caller;
-	unsigned long val;
-	int cpu = smp_processor_id();
-	int stuck = INIT_STUCK;
-
-	STORE_CALLER(caller);
-
-wlock_again:
-	__asm__ __volatile__("ldstub [%1 + 3], %0" : "=r" (val) : "r" (&(rw->lock)));
-	if(val) {
-wlock_wait:
-		while(rw->lock) {
-			if (!--stuck) {
-				show_write(str, rw, caller);
-				stuck = INIT_STUCK;
-			}
-			barrier();
-		}
-		goto wlock_again;
-	}
-
-	if (rw->lock & ~0xff) {
-		*(((unsigned char *)&rw->lock)+3) = 0;
-		barrier();
-		goto wlock_wait;
-	}
-
-	barrier();
-	rw->owner_pc = (cpu & 3) | (caller & ~3);
-}
-
-void _do_write_unlock(rwlock_t *rw)
-{
-	rw->owner_pc = 0;
-	barrier();
-	rw->lock = 0;
-}
-
-#endif /* SMP */
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 66255434128..7d10b039709 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -607,11 +607,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 	struct thread_info *t = p->thread_info;
 	char *child_trap_frame;
 
-#ifdef CONFIG_DEBUG_SPINLOCK
-	p->thread.smp_lock_count = 0;
-	p->thread.smp_lock_pc = 0;
-#endif
-
 	/* Calculate offset to stack_frame & pt_regs */
 	child_trap_frame = ((char *)t) + (THREAD_SIZE - (TRACEREG_SZ+STACKFRAME_SZ));
 	memcpy(child_trap_frame, (((struct sparc_stackf *)regs)-1), (TRACEREG_SZ+STACKFRAME_SZ));
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index 7d9a0f6c437..cbb5e59824e 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -115,17 +115,12 @@ EXPORT_PER_CPU_SYMBOL(__cpu_data);
 
 /* used by various drivers */
 #ifdef CONFIG_SMP
-#ifndef CONFIG_DEBUG_SPINLOCK
 /* Out of line rw-locking implementation. */
 EXPORT_SYMBOL(__read_lock);
 EXPORT_SYMBOL(__read_unlock);
 EXPORT_SYMBOL(__write_lock);
 EXPORT_SYMBOL(__write_unlock);
 EXPORT_SYMBOL(__write_trylock);
-/* Out of line spin-locking implementation. */
-EXPORT_SYMBOL(_raw_spin_lock);
-EXPORT_SYMBOL(_raw_spin_lock_flags);
-#endif
 
 /* Hard IRQ locking */
 EXPORT_SYMBOL(synchronize_irq);
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
index 40dbeec7e5d..d968aebe83b 100644
--- a/arch/sparc64/lib/Makefile
+++ b/arch/sparc64/lib/Makefile
@@ -14,7 +14,6 @@ lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
 	 copy_in_user.o user_fixup.o memmove.o \
 	 mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o
 
-lib-$(CONFIG_DEBUG_SPINLOCK) += debuglocks.o
 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
 
 obj-y += iomap.o
diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c
deleted file mode 100644
index f5f0b5586f0..00000000000
--- a/arch/sparc64/lib/debuglocks.c
+++ /dev/null
@@ -1,366 +0,0 @@
-/* $Id: debuglocks.c,v 1.9 2001/11/17 00:10:48 davem Exp $
- * debuglocks.c: Debugging versions of SMP locking primitives.
- *
- * Copyright (C) 1998 David S. Miller (davem@redhat.com)
- */
-
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_SMP
-
-static inline void show (char *str, spinlock_t *lock, unsigned long caller)
-{
-	int cpu = smp_processor_id();
-
-	printk("%s(%p) CPU#%d stuck at %08x, owner PC(%08x):CPU(%x)\n",
-	       str, lock, cpu, (unsigned int) caller,
-	       lock->owner_pc, lock->owner_cpu);
-}
-
-static inline void show_read (char *str, rwlock_t *lock, unsigned long caller)
-{
-	int cpu = smp_processor_id();
-
-	printk("%s(%p) CPU#%d stuck at %08x, writer PC(%08x):CPU(%x)\n",
-	       str, lock, cpu, (unsigned int) caller,
-	       lock->writer_pc, lock->writer_cpu);
-}
-
-static inline void show_write (char *str, rwlock_t *lock, unsigned long caller)
-{
-	int cpu = smp_processor_id();
-	int i;
-
-	printk("%s(%p) CPU#%d stuck at %08x\n",
-	       str, lock, cpu, (unsigned int) caller);
-	printk("Writer: PC(%08x):CPU(%x)\n",
-	       lock->writer_pc, lock->writer_cpu);
-	printk("Readers:");
-	for (i = 0; i < NR_CPUS; i++)
-		if (lock->reader_pc[i])
-			printk(" %d[%08x]", i, lock->reader_pc[i]);
-	printk("\n");
-}
-
-#undef INIT_STUCK
-#define INIT_STUCK 100000000
-
-void _do_spin_lock(spinlock_t *lock, char *str, unsigned long caller)
-{
-	unsigned long val;
-	int stuck = INIT_STUCK;
-	int cpu = get_cpu();
-	int shown = 0;
-
-again:
-	__asm__ __volatile__("ldstub [%1], %0"
-			     : "=r" (val)
-			     : "r" (&(lock->lock))
-			     : "memory");
-	membar_storeload_storestore();
-	if (val) {
-		while (lock->lock) {
-			if (!--stuck) {
-				if (shown++ <= 2)
-					show(str, lock, caller);
-				stuck = INIT_STUCK;
-			}
-			rmb();
-		}
-		goto again;
-	}
-	lock->owner_pc = ((unsigned int)caller);
-	lock->owner_cpu = cpu;
-	current->thread.smp_lock_count++;
-	current->thread.smp_lock_pc = ((unsigned int)caller);
-
-	put_cpu();
-}
-
-int _do_spin_trylock(spinlock_t *lock, unsigned long caller)
-{
-	unsigned long val;
-	int cpu = get_cpu();
-
-	__asm__ __volatile__("ldstub [%1], %0"
-			     : "=r" (val)
-			     : "r" (&(lock->lock))
-			     : "memory");
-	membar_storeload_storestore();
-	if (!val) {
-		lock->owner_pc = ((unsigned int)caller);
-		lock->owner_cpu = cpu;
-		current->thread.smp_lock_count++;
-		current->thread.smp_lock_pc = ((unsigned int)caller);
-	}
-
-	put_cpu();
-
-	return val == 0;
-}
-
-void _do_spin_unlock(spinlock_t *lock)
-{
-	lock->owner_pc = 0;
-	lock->owner_cpu = NO_PROC_ID;
-	membar_storestore_loadstore();
-	lock->lock = 0;
-	current->thread.smp_lock_count--;
-}
-
-/* Keep INIT_STUCK the same... */
-
-void _do_read_lock(rwlock_t *rw, char *str, unsigned long caller)
-{
-	unsigned long val;
-	int stuck = INIT_STUCK;
-	int cpu = get_cpu();
-	int shown = 0;
-
-wlock_again:
-	/* Wait for any writer to go away.  */
-	while (((long)(rw->lock)) < 0) {
-		if (!--stuck) {
-			if (shown++ <= 2)
-				show_read(str, rw, caller);
-			stuck = INIT_STUCK;
-		}
-		rmb();
-	}
-	/* Try once to increment the counter.  */
-	__asm__ __volatile__(
-"	ldx		[%0], %%g1\n"
-"	brlz,a,pn	%%g1, 2f\n"
-"	 mov		1, %0\n"
-"	add		%%g1, 1, %%g7\n"
-"	casx		[%0], %%g1, %%g7\n"
-"	sub		%%g1, %%g7, %0\n"
-"2:"	: "=r" (val)
-	: "0" (&(rw->lock))
-	: "g1", "g7", "memory");
-	membar_storeload_storestore();
-	if (val)
-		goto wlock_again;
-	rw->reader_pc[cpu] = ((unsigned int)caller);
-	current->thread.smp_lock_count++;
-	current->thread.smp_lock_pc = ((unsigned int)caller);
-
-	put_cpu();
-}
-
-void _do_read_unlock(rwlock_t *rw, char *str, unsigned long caller)
-{
-	unsigned long val;
-	int stuck = INIT_STUCK;
-	int cpu = get_cpu();
-	int shown = 0;
-
-	/* Drop our identity _first_. */
-	rw->reader_pc[cpu] = 0;
-	current->thread.smp_lock_count--;
-runlock_again:
-	/* Spin trying to decrement the counter using casx.  */
-	__asm__ __volatile__(
-"	membar	#StoreLoad | #LoadLoad\n"
-"	ldx	[%0], %%g1\n"
-"	sub	%%g1, 1, %%g7\n"
-"	casx	[%0], %%g1, %%g7\n"
-"	membar	#StoreLoad | #StoreStore\n"
-"	sub	%%g1, %%g7, %0\n"
-	: "=r" (val)
-	: "0" (&(rw->lock))
-	: "g1", "g7", "memory");
-	if (val) {
-		if (!--stuck) {
-			if (shown++ <= 2)
-				show_read(str, rw, caller);
-			stuck = INIT_STUCK;
-		}
-		goto runlock_again;
-	}
-
-	put_cpu();
-}
-
-void _do_write_lock(rwlock_t *rw, char *str, unsigned long caller)
-{
-	unsigned long val;
-	int stuck = INIT_STUCK;
-	int cpu = get_cpu();
-	int shown = 0;
-
-wlock_again:
-	/* Spin while there is another writer. */
-	while (((long)rw->lock) < 0) {
-		if (!--stuck) {
-			if (shown++ <= 2)
-				show_write(str, rw, caller);
-			stuck = INIT_STUCK;
-		}
-		rmb();
-	}
-
-	/* Try to acuire the write bit.  */
-	__asm__ __volatile__(
-"	mov	1, %%g3\n"
-"	sllx	%%g3, 63, %%g3\n"
-"	ldx	[%0], %%g1\n"
-"	brlz,pn	%%g1, 1f\n"
-"	 or	%%g1, %%g3, %%g7\n"
-"	casx	[%0], %%g1, %%g7\n"
-"	membar	#StoreLoad | #StoreStore\n"
-"	ba,pt	%%xcc, 2f\n"
-"	 sub	%%g1, %%g7, %0\n"
-"1:	mov	1, %0\n"
-"2:"	: "=r" (val)
-	: "0" (&(rw->lock))
-	: "g3", "g1", "g7", "memory");
-	if (val) {
-		/* We couldn't get the write bit. */
-		if (!--stuck) {
-			if (shown++ <= 2)
-				show_write(str, rw, caller);
-			stuck = INIT_STUCK;
-		}
-		goto wlock_again;
-	}
-	if ((rw->lock & ((1UL<<63)-1UL)) != 0UL) {
-		/* Readers still around, drop the write
-		 * lock, spin, and try again.
-		 */
-		if (!--stuck) {
-			if (shown++ <= 2)
-				show_write(str, rw, caller);
-			stuck = INIT_STUCK;
-		}
-		__asm__ __volatile__(
-"		mov	1, %%g3\n"
-"		sllx	%%g3, 63, %%g3\n"
-"1:		ldx	[%0], %%g1\n"
-"		andn	%%g1, %%g3, %%g7\n"
-"		casx	[%0], %%g1, %%g7\n"
-"		cmp	%%g1, %%g7\n"
-"		membar	#StoreLoad | #StoreStore\n"
-"		bne,pn	%%xcc, 1b\n"
-"		 nop"
-		: /* no outputs */
-		: "r" (&(rw->lock))
-		: "g3", "g1", "g7", "cc", "memory");
-		while(rw->lock != 0) {
-			if (!--stuck) {
-				if (shown++ <= 2)
-					show_write(str, rw, caller);
-				stuck = INIT_STUCK;
-			}
-			rmb();
-		}
-		goto wlock_again;
-	}
-
-	/* We have it, say who we are. */
-	rw->writer_pc = ((unsigned int)caller);
-	rw->writer_cpu = cpu;
-	current->thread.smp_lock_count++;
-	current->thread.smp_lock_pc = ((unsigned int)caller);
-
-	put_cpu();
-}
-
-void _do_write_unlock(rwlock_t *rw, unsigned long caller)
-{
-	unsigned long val;
-	int stuck = INIT_STUCK;
-	int shown = 0;
-
-	/* Drop our identity _first_ */
-	rw->writer_pc = 0;
-	rw->writer_cpu = NO_PROC_ID;
-	current->thread.smp_lock_count--;
-wlock_again:
-	__asm__ __volatile__(
-"	membar	#StoreLoad | #LoadLoad\n"
-"	mov	1, %%g3\n"
-"	sllx	%%g3, 63, %%g3\n"
-"	ldx	[%0], %%g1\n"
-"	andn	%%g1, %%g3, %%g7\n"
-"	casx	[%0], %%g1, %%g7\n"
-"	membar	#StoreLoad | #StoreStore\n"
-"	sub	%%g1, %%g7, %0\n"
-	: "=r" (val)
-	: "0" (&(rw->lock))
-	: "g3", "g1", "g7", "memory");
-	if (val) {
-		if (!--stuck) {
-			if (shown++ <= 2)
-				show_write("write_unlock", rw, caller);
-			stuck = INIT_STUCK;
-		}
-		goto wlock_again;
-	}
-}
-
-int _do_write_trylock(rwlock_t *rw, char *str, unsigned long caller)
-{
-	unsigned long val;
-	int cpu = get_cpu();
-
-	/* Try to acuire the write bit.  */
-	__asm__ __volatile__(
-"	mov	1, %%g3\n"
-"	sllx	%%g3, 63, %%g3\n"
-"	ldx	[%0], %%g1\n"
-"	brlz,pn	%%g1, 1f\n"
-"	 or	%%g1, %%g3, %%g7\n"
-"	casx	[%0], %%g1, %%g7\n"
-"	membar	#StoreLoad | #StoreStore\n"
-"	ba,pt	%%xcc, 2f\n"
-"	 sub	%%g1, %%g7, %0\n"
-"1:	mov	1, %0\n"
-"2:"	: "=r" (val)
-	: "0" (&(rw->lock))
-	: "g3", "g1", "g7", "memory");
-
-	if (val) {
-		put_cpu();
-		return 0;
-	}
-
-	if ((rw->lock & ((1UL<<63)-1UL)) != 0UL) {
-		/* Readers still around, drop the write
-		 * lock, return failure.
-		 */
-		__asm__ __volatile__(
-"		mov	1, %%g3\n"
-"		sllx	%%g3, 63, %%g3\n"
-"1:		ldx	[%0], %%g1\n"
-"		andn	%%g1, %%g3, %%g7\n"
-"		casx	[%0], %%g1, %%g7\n"
-"		cmp	%%g1, %%g7\n"
-"		membar	#StoreLoad | #StoreStore\n"
-"		bne,pn	%%xcc, 1b\n"
-"		 nop"
-		: /* no outputs */
-		: "r" (&(rw->lock))
-		: "g3", "g1", "g7", "cc", "memory");
-
-		put_cpu();
-
-		return 0;
-	}
-
-	/* We have it, say who we are. */
-	rw->writer_pc = ((unsigned int)caller);
-	rw->writer_cpu = cpu;
-	current->thread.smp_lock_count++;
-	current->thread.smp_lock_pc = ((unsigned int)caller);
-
-	put_cpu();
-
-	return 1;
-}
-
-#endif /* CONFIG_SMP */
diff --git a/fs/buffer.c b/fs/buffer.c
index 1c62203a490..6cbfceabd95 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -40,6 +40,7 @@
 #include <linux/cpu.h>
 #include <linux/bitops.h>
 #include <linux/mpage.h>
+#include <linux/bit_spinlock.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static void invalidate_bh_lrus(void);
diff --git a/include/asm-alpha/spinlock.h b/include/asm-alpha/spinlock.h
index 80780dba998..8197c69eff4 100644
--- a/include/asm-alpha/spinlock.h
+++ b/include/asm-alpha/spinlock.h
@@ -6,7 +6,6 @@
 #include <linux/kernel.h>
 #include <asm/current.h>
 
-
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
@@ -14,43 +13,18 @@
  * We make no fairness assumptions. They have a cost.
  */
 
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	int on_cpu;
-	int line_no;
-	void *previous;
-	struct task_struct * task;
-	const char *base_file;
-#endif
-} spinlock_t;
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define SPIN_LOCK_UNLOCKED	(spinlock_t){ 0, -1, 0, NULL, NULL, NULL }
-#else
-#define SPIN_LOCK_UNLOCKED	(spinlock_t){ 0 }
-#endif
-
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-#define spin_is_locked(x)	((x)->lock != 0)
-#define spin_unlock_wait(x)	do { barrier(); } while ((x)->lock)
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-extern void _raw_spin_unlock(spinlock_t * lock);
-extern void debug_spin_lock(spinlock_t * lock, const char *, int);
-extern int debug_spin_trylock(spinlock_t * lock, const char *, int);
-#define _raw_spin_lock(LOCK) \
-	debug_spin_lock(LOCK, __BASE_FILE__, __LINE__)
-#define _raw_spin_trylock(LOCK) \
-	debug_spin_trylock(LOCK, __BASE_FILE__, __LINE__)
-#else
-static inline void _raw_spin_unlock(spinlock_t * lock)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+#define __raw_spin_is_locked(x)	((x)->lock != 0)
+#define __raw_spin_unlock_wait(x) \
+		do { cpu_relax(); } while ((x)->lock)
+
+static inline void __raw_spin_unlock(raw_spinlock_t * lock)
 {
 	mb();
 	lock->lock = 0;
 }
 
-static inline void _raw_spin_lock(spinlock_t * lock)
+static inline void __raw_spin_lock(raw_spinlock_t * lock)
 {
 	long tmp;
 
@@ -70,80 +44,64 @@ static inline void _raw_spin_lock(spinlock_t * lock)
 	: "m"(lock->lock) : "memory");
 }
 
-static inline int _raw_spin_trylock(spinlock_t *lock)
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	return !test_and_set_bit(0, &lock->lock);
 }
-#endif /* CONFIG_DEBUG_SPINLOCK */
-
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
 
 /***********************************************************/
 
-typedef struct {
-	volatile unsigned int lock;
-} rwlock_t;
-
-#define RW_LOCK_UNLOCKED	(rwlock_t){ 0 }
-
-#define rwlock_init(x)		do { *(x) = RW_LOCK_UNLOCKED; } while(0)
-
-static inline int read_can_lock(rwlock_t *lock)
+static inline int __raw_read_can_lock(raw_rwlock_t *lock)
 {
 	return (lock->lock & 1) == 0;
 }
 
-static inline int write_can_lock(rwlock_t *lock)
+static inline int __raw_write_can_lock(raw_rwlock_t *lock)
 {
 	return lock->lock == 0;
 }
 
-#ifdef CONFIG_DEBUG_RWLOCK
-extern void _raw_write_lock(rwlock_t * lock);
-extern void _raw_read_lock(rwlock_t * lock);
-#else
-static inline void _raw_write_lock(rwlock_t * lock)
+static inline void __raw_read_lock(raw_rwlock_t *lock)
 {
 	long regx;
 
 	__asm__ __volatile__(
 	"1:	ldl_l	%1,%0\n"
-	"	bne	%1,6f\n"
-	"	lda	%1,1\n"
+	"	blbs	%1,6f\n"
+	"	subl	%1,2,%1\n"
 	"	stl_c	%1,%0\n"
 	"	beq	%1,6f\n"
 	"	mb\n"
 	".subsection 2\n"
 	"6:	ldl	%1,%0\n"
-	"	bne	%1,6b\n"
+	"	blbs	%1,6b\n"
 	"	br	1b\n"
 	".previous"
 	: "=m" (*lock), "=&r" (regx)
 	: "m" (*lock) : "memory");
 }
 
-static inline void _raw_read_lock(rwlock_t * lock)
+static inline void __raw_write_lock(raw_rwlock_t *lock)
 {
 	long regx;
 
 	__asm__ __volatile__(
 	"1:	ldl_l	%1,%0\n"
-	"	blbs	%1,6f\n"
-	"	subl	%1,2,%1\n"
+	"	bne	%1,6f\n"
+	"	lda	%1,1\n"
 	"	stl_c	%1,%0\n"
 	"	beq	%1,6f\n"
 	"	mb\n"
 	".subsection 2\n"
 	"6:	ldl	%1,%0\n"
-	"	blbs	%1,6b\n"
+	"	bne	%1,6b\n"
 	"	br	1b\n"
 	".previous"
 	: "=m" (*lock), "=&r" (regx)
 	: "m" (*lock) : "memory");
 }
-#endif /* CONFIG_DEBUG_RWLOCK */
 
-static inline int _raw_read_trylock(rwlock_t * lock)
+static inline int __raw_read_trylock(raw_rwlock_t * lock)
 {
 	long regx;
 	int success;
@@ -165,7 +123,7 @@ static inline int _raw_read_trylock(rwlock_t * lock)
 	return success;
 }
 
-static inline int _raw_write_trylock(rwlock_t * lock)
+static inline int __raw_write_trylock(raw_rwlock_t * lock)
 {
 	long regx;
 	int success;
@@ -187,13 +145,7 @@ static inline int _raw_write_trylock(rwlock_t * lock)
 	return success;
 }
 
-static inline void _raw_write_unlock(rwlock_t * lock)
-{
-	mb();
-	lock->lock = 0;
-}
-
-static inline void _raw_read_unlock(rwlock_t * lock)
+static inline void __raw_read_unlock(raw_rwlock_t * lock)
 {
 	long regx;
 	__asm__ __volatile__(
@@ -209,4 +161,10 @@ static inline void _raw_read_unlock(rwlock_t * lock)
 	: "m" (*lock) : "memory");
 }
 
+static inline void __raw_write_unlock(raw_rwlock_t * lock)
+{
+	mb();
+	lock->lock = 0;
+}
+
 #endif /* _ALPHA_SPINLOCK_H */
diff --git a/include/asm-alpha/spinlock_types.h b/include/asm-alpha/spinlock_types.h
new file mode 100644
index 00000000000..8141eb5ebf0
--- /dev/null
+++ b/include/asm-alpha/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef _ALPHA_SPINLOCK_TYPES_H
+#define _ALPHA_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0 }
+
+#endif
diff --git a/include/asm-arm/spinlock.h b/include/asm-arm/spinlock.h
index 1f906d09b68..cb4906b4555 100644
--- a/include/asm-arm/spinlock.h
+++ b/include/asm-arm/spinlock.h
@@ -16,21 +16,14 @@
  * Unlocked value: 0
  * Locked value: 1
  */
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
 
-#define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 }
+#define __raw_spin_is_locked(x)		((x)->lock != 0)
+#define __raw_spin_unlock_wait(lock) \
+	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
 
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while (0)
-#define spin_is_locked(x)	((x)->lock != 0)
-#define spin_unlock_wait(x)	do { barrier(); } while (spin_is_locked(x))
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 
-static inline void _raw_spin_lock(spinlock_t *lock)
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	unsigned long tmp;
 
@@ -47,7 +40,7 @@ static inline void _raw_spin_lock(spinlock_t *lock)
 	smp_mb();
 }
 
-static inline int _raw_spin_trylock(spinlock_t *lock)
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	unsigned long tmp;
 
@@ -67,7 +60,7 @@ static inline int _raw_spin_trylock(spinlock_t *lock)
 	}
 }
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 	smp_mb();
 
@@ -80,23 +73,14 @@ static inline void _raw_spin_unlock(spinlock_t *lock)
 
 /*
  * RWLOCKS
- */
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RW_LOCK_UNLOCKED	(rwlock_t) { 0 }
-#define rwlock_init(x)		do { *(x) = RW_LOCK_UNLOCKED; } while (0)
-#define rwlock_is_locked(x)	(*((volatile unsigned int *)(x)) != 0)
-
-/*
+ *
+ *
  * Write locks are easy - we just set bit 31.  When unlocking, we can
  * just write zero since the lock is exclusively held.
  */
-static inline void _raw_write_lock(rwlock_t *rw)
+#define rwlock_is_locked(x)	(*((volatile unsigned int *)(x)) != 0)
+
+static inline void __raw_write_lock(rwlock_t *rw)
 {
 	unsigned long tmp;
 
@@ -113,7 +97,7 @@ static inline void _raw_write_lock(rwlock_t *rw)
 	smp_mb();
 }
 
-static inline int _raw_write_trylock(rwlock_t *rw)
+static inline int __raw_write_trylock(rwlock_t *rw)
 {
 	unsigned long tmp;
 
@@ -133,7 +117,7 @@ static inline int _raw_write_trylock(rwlock_t *rw)
 	}
 }
 
-static inline void _raw_write_unlock(rwlock_t *rw)
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	smp_mb();
 
@@ -156,7 +140,7 @@ static inline void _raw_write_unlock(rwlock_t *rw)
  * currently active.  However, we know we won't have any write
  * locks.
  */
-static inline void _raw_read_lock(rwlock_t *rw)
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
 	unsigned long tmp, tmp2;
 
@@ -173,7 +157,7 @@ static inline void _raw_read_lock(rwlock_t *rw)
 	smp_mb();
 }
 
-static inline void _raw_read_unlock(rwlock_t *rw)
+static inline void __raw_read_unlock(rwlock_t *rw)
 {
 	unsigned long tmp, tmp2;
 
@@ -190,6 +174,6 @@ static inline void _raw_read_unlock(rwlock_t *rw)
 	: "cc");
 }
 
-#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+#define __raw_read_trylock(lock) generic__raw_read_trylock(lock)
 
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-arm/spinlock_types.h b/include/asm-arm/spinlock_types.h
new file mode 100644
index 00000000000..43e83f6d2ee
--- /dev/null
+++ b/include/asm-arm/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef __ASM_SPINLOCK_TYPES_H
+#define __ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0 }
+
+#endif
diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h
index f9ff31f4003..23604350cdf 100644
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -7,46 +7,21 @@
 #include <linux/config.h>
 #include <linux/compiler.h>
 
-asmlinkage int printk(const char * fmt, ...)
-	__attribute__ ((format (printf, 1, 2)));
-
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
- */
-
-typedef struct {
-	volatile unsigned int slock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
-#define SPINLOCK_MAGIC	0xdead4ead
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define SPINLOCK_MAGIC_INIT	, SPINLOCK_MAGIC
-#else
-#define SPINLOCK_MAGIC_INIT	/* */
-#endif
-
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
-
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-
-/*
+ *
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
  *
  * We make no fairness assumptions. They have a cost.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
  */
 
-#define spin_is_locked(x)	(*(volatile signed char *)(&(x)->slock) <= 0)
-#define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))
+#define __raw_spin_is_locked(x) \
+		(*(volatile signed char *)(&(x)->slock) <= 0)
 
-#define spin_lock_string \
+#define __raw_spin_lock_string \
 	"\n1:\t" \
 	"lock ; decb %0\n\t" \
 	"jns 3f\n" \
@@ -57,7 +32,7 @@ typedef struct {
 	"jmp 1b\n" \
 	"3:\n\t"
 
-#define spin_lock_string_flags \
+#define __raw_spin_lock_string_flags \
 	"\n1:\t" \
 	"lock ; decb %0\n\t" \
 	"jns 4f\n\t" \
@@ -73,86 +48,71 @@ typedef struct {
 	"jmp 1b\n" \
 	"4:\n\t"
 
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+	__asm__ __volatile__(
+		__raw_spin_lock_string
+		:"=m" (lock->slock) : : "memory");
+}
+
+static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+{
+	__asm__ __volatile__(
+		__raw_spin_lock_string_flags
+		:"=m" (lock->slock) : "r" (flags) : "memory");
+}
+
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+	char oldval;
+	__asm__ __volatile__(
+		"xchgb %b0,%1"
+		:"=q" (oldval), "=m" (lock->slock)
+		:"0" (0) : "memory");
+	return oldval > 0;
+}
+
 /*
- * This works. Despite all the confusion.
- * (except on PPro SMP or if we are using OOSTORE)
+ * __raw_spin_unlock based on writing $1 to the low byte.
+ * This method works. Despite all the confusion.
+ * (except on PPro SMP or if we are using OOSTORE, so we use xchgb there)
  * (PPro errata 66, 92)
  */
 
 #if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
 
-#define spin_unlock_string \
+#define __raw_spin_unlock_string \
 	"movb $1,%0" \
 		:"=m" (lock->slock) : : "memory"
 
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
-#endif
 	__asm__ __volatile__(
-		spin_unlock_string
+		__raw_spin_unlock_string
 	);
 }
 
 #else
 
-#define spin_unlock_string \
+#define __raw_spin_unlock_string \
 	"xchgb %b0, %1" \
 		:"=q" (oldval), "=m" (lock->slock) \
 		:"0" (oldval) : "memory"
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 	char oldval = 1;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
-#endif
-	__asm__ __volatile__(
-		spin_unlock_string
-	);
-}
 
-#endif
-
-static inline int _raw_spin_trylock(spinlock_t *lock)
-{
-	char oldval;
 	__asm__ __volatile__(
-		"xchgb %b0,%1"
-		:"=q" (oldval), "=m" (lock->slock)
-		:"0" (0) : "memory");
-	return oldval > 0;
+		__raw_spin_unlock_string
+	);
 }
 
-static inline void _raw_spin_lock(spinlock_t *lock)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
-	if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
-		printk("eip: %p\n", __builtin_return_address(0));
-		BUG();
-	}
 #endif
-	__asm__ __volatile__(
-		spin_lock_string
-		:"=m" (lock->slock) : : "memory");
-}
 
-static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
-	if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
-		printk("eip: %p\n", __builtin_return_address(0));
-		BUG();
-	}
-#endif
-	__asm__ __volatile__(
-		spin_lock_string_flags
-		:"=m" (lock->slock) : "r" (flags) : "memory");
-}
+#define __raw_spin_unlock_wait(lock) \
+	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
 
 /*
  * Read-write spinlocks, allowing multiple readers
@@ -163,72 +123,41 @@ static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
  * can "mix" irq-safe locks - any writer needs to get a
  * irq-safe write-lock, but readers can get non-irqsafe
  * read-locks.
+ *
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ *
+ * The inline assembly is non-obvious. Think about it.
+ *
+ * Changed to use the same technique as rw semaphores.  See
+ * semaphore.h for details.  -ben
+ *
+ * the helpers are in arch/i386/kernel/semaphore.c
  */
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RWLOCK_MAGIC	0xdeaf1eed
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define RWLOCK_MAGIC_INIT	, RWLOCK_MAGIC
-#else
-#define RWLOCK_MAGIC_INIT	/* */
-#endif
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
-
-#define rwlock_init(x)	do { *(x) = RW_LOCK_UNLOCKED; } while(0)
 
 /**
  * read_can_lock - would read_trylock() succeed?
  * @lock: the rwlock in question.
  */
-#define read_can_lock(x) ((int)(x)->lock > 0)
+#define __raw_read_can_lock(x)		((int)(x)->lock > 0)
 
 /**
  * write_can_lock - would write_trylock() succeed?
  * @lock: the rwlock in question.
  */
-#define write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
+#define __raw_write_can_lock(x)		((x)->lock == RW_LOCK_BIAS)
 
-/*
- * On x86, we implement read-write locks as a 32-bit counter
- * with the high bit (sign) being the "contended" bit.
- *
- * The inline assembly is non-obvious. Think about it.
- *
- * Changed to use the same technique as rw semaphores.  See
- * semaphore.h for details.  -ben
- */
-/* the spinlock helpers are in arch/i386/kernel/semaphore.c */
-
-static inline void _raw_read_lock(rwlock_t *rw)
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(rw->magic != RWLOCK_MAGIC);
-#endif
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(rw->magic != RWLOCK_MAGIC);
-#endif
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-#define _raw_read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
-#define _raw_write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
-
-static inline int _raw_read_trylock(rwlock_t *lock)
+static inline int __raw_read_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	atomic_dec(count);
@@ -238,7 +167,7 @@ static inline int _raw_read_trylock(rwlock_t *lock)
 	return 0;
 }
 
-static inline int _raw_write_trylock(rwlock_t *lock)
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
@@ -247,4 +176,15 @@ static inline int _raw_write_trylock(rwlock_t *lock)
 	return 0;
 }
 
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
+{
+	asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
+}
+
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
+{
+	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
+				 : "=m" (rw->lock) : : "memory");
+}
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-i386/spinlock_types.h b/include/asm-i386/spinlock_types.h
new file mode 100644
index 00000000000..59efe849f35
--- /dev/null
+++ b/include/asm-i386/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef __ASM_SPINLOCK_TYPES_H
+#define __ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int slock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 1 }
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
+
+#endif
diff --git a/include/asm-ia64/spinlock.h b/include/asm-ia64/spinlock.h
index d2430aa0d49..5b78611411c 100644
--- a/include/asm-ia64/spinlock.h
+++ b/include/asm-ia64/spinlock.h
@@ -17,28 +17,20 @@
 #include <asm/intrinsics.h>
 #include <asm/system.h>
 
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
-#define SPIN_LOCK_UNLOCKED			(spinlock_t) { 0 }
-#define spin_lock_init(x)			((x)->lock = 0)
+#define __raw_spin_lock_init(x)			((x)->lock = 0)
 
 #ifdef ASM_SUPPORTED
 /*
  * Try to get the lock.  If we fail to get the lock, make a non-standard call to
  * ia64_spinlock_contention().  We do not use a normal call because that would force all
- * callers of spin_lock() to be non-leaf routines.  Instead, ia64_spinlock_contention() is
- * carefully coded to touch only those registers that spin_lock() marks "clobbered".
+ * callers of __raw_spin_lock() to be non-leaf routines.  Instead, ia64_spinlock_contention() is
+ * carefully coded to touch only those registers that __raw_spin_lock() marks "clobbered".
  */
 
 #define IA64_SPINLOCK_CLOBBERS "ar.ccv", "ar.pfs", "p14", "p15", "r27", "r28", "r29", "r30", "b6", "memory"
 
 static inline void
-_raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
+__raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags)
 {
 	register volatile unsigned int *ptr asm ("r31") = &lock->lock;
 
@@ -94,17 +86,17 @@ _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
 #endif
 }
 
-#define _raw_spin_lock(lock) _raw_spin_lock_flags(lock, 0)
+#define __raw_spin_lock(lock) __raw_spin_lock_flags(lock, 0)
 
 /* Unlock by doing an ordered store and releasing the cacheline with nta */
-static inline void _raw_spin_unlock(spinlock_t *x) {
+static inline void __raw_spin_unlock(raw_spinlock_t *x) {
 	barrier();
 	asm volatile ("st4.rel.nta [%0] = r0\n\t" :: "r"(x));
 }
 
 #else /* !ASM_SUPPORTED */
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
-# define _raw_spin_lock(x)								\
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+# define __raw_spin_lock(x)								\
 do {											\
 	__u32 *ia64_spinlock_ptr = (__u32 *) (x);					\
 	__u64 ia64_spinlock_val;							\
@@ -117,29 +109,20 @@ do {											\
 		} while (ia64_spinlock_val);						\
 	}										\
 } while (0)
-#define _raw_spin_unlock(x)	do { barrier(); ((spinlock_t *) x)->lock = 0; } while (0)
+#define __raw_spin_unlock(x)	do { barrier(); ((raw_spinlock_t *) x)->lock = 0; } while (0)
 #endif /* !ASM_SUPPORTED */
 
-#define spin_is_locked(x)	((x)->lock != 0)
-#define _raw_spin_trylock(x)	(cmpxchg_acq(&(x)->lock, 0, 1) == 0)
-#define spin_unlock_wait(x)	do { barrier(); } while ((x)->lock)
-
-typedef struct {
-	volatile unsigned int read_counter	: 24;
-	volatile unsigned int write_lock	:  8;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
+#define __raw_spin_is_locked(x)		((x)->lock != 0)
+#define __raw_spin_trylock(x)		(cmpxchg_acq(&(x)->lock, 0, 1) == 0)
+#define __raw_spin_unlock_wait(lock) \
+	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
 
-#define rwlock_init(x)		do { *(x) = RW_LOCK_UNLOCKED; } while(0)
-#define read_can_lock(rw)	(*(volatile int *)(rw) >= 0)
-#define write_can_lock(rw)	(*(volatile int *)(rw) == 0)
+#define __raw_read_can_lock(rw)		(*(volatile int *)(rw) >= 0)
+#define __raw_write_can_lock(rw)	(*(volatile int *)(rw) == 0)
 
-#define _raw_read_lock(rw)								\
+#define __raw_read_lock(rw)								\
 do {											\
-	rwlock_t *__read_lock_ptr = (rw);						\
+	raw_rwlock_t *__read_lock_ptr = (rw);						\
 											\
 	while (unlikely(ia64_fetchadd(1, (int *) __read_lock_ptr, acq) < 0)) {		\
 		ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);			\
@@ -148,14 +131,14 @@ do {											\
 	}										\
 } while (0)
 
-#define _raw_read_unlock(rw)					\
+#define __raw_read_unlock(rw)					\
 do {								\
-	rwlock_t *__read_lock_ptr = (rw);			\
+	raw_rwlock_t *__read_lock_ptr = (rw);			\
 	ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);	\
 } while (0)
 
 #ifdef ASM_SUPPORTED
-#define _raw_write_lock(rw)							\
+#define __raw_write_lock(rw)							\
 do {										\
  	__asm__ __volatile__ (							\
 		"mov ar.ccv = r0\n"						\
@@ -170,7 +153,7 @@ do {										\
 		:: "r"(rw) : "ar.ccv", "p7", "r2", "r29", "memory");		\
 } while(0)
 
-#define _raw_write_trylock(rw)							\
+#define __raw_write_trylock(rw)							\
 ({										\
 	register long result;							\
 										\
@@ -182,7 +165,7 @@ do {										\
 	(result == 0);								\
 })
 
-static inline void _raw_write_unlock(rwlock_t *x)
+static inline void __raw_write_unlock(raw_rwlock_t *x)
 {
 	u8 *y = (u8 *)x;
 	barrier();
@@ -191,7 +174,7 @@ static inline void _raw_write_unlock(rwlock_t *x)
 
 #else /* !ASM_SUPPORTED */
 
-#define _raw_write_lock(l)								\
+#define __raw_write_lock(l)								\
 ({											\
 	__u64 ia64_val, ia64_set_val = ia64_dep_mi(-1, 0, 31, 1);			\
 	__u32 *ia64_write_lock_ptr = (__u32 *) (l);					\
@@ -202,7 +185,7 @@ static inline void _raw_write_unlock(rwlock_t *x)
 	} while (ia64_val);								\
 })
 
-#define _raw_write_trylock(rw)						\
+#define __raw_write_trylock(rw)						\
 ({									\
 	__u64 ia64_val;							\
 	__u64 ia64_set_val = ia64_dep_mi(-1, 0, 31,1);			\
@@ -210,7 +193,7 @@ static inline void _raw_write_unlock(rwlock_t *x)
 	(ia64_val == 0);						\
 })
 
-static inline void _raw_write_unlock(rwlock_t *x)
+static inline void __raw_write_unlock(raw_rwlock_t *x)
 {
 	barrier();
 	x->write_lock = 0;
@@ -218,6 +201,6 @@ static inline void _raw_write_unlock(rwlock_t *x)
 
 #endif /* !ASM_SUPPORTED */
 
-#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+#define __raw_read_trylock(lock) generic__raw_read_trylock(lock)
 
 #endif /*  _ASM_IA64_SPINLOCK_H */
diff --git a/include/asm-ia64/spinlock_types.h b/include/asm-ia64/spinlock_types.h
new file mode 100644
index 00000000000..474e46f1ab4
--- /dev/null
+++ b/include/asm-ia64/spinlock_types.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_IA64_SPINLOCK_TYPES_H
+#define _ASM_IA64_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile unsigned int read_counter	: 31;
+	volatile unsigned int write_lock	:  1;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0, 0 }
+
+#endif
diff --git a/include/asm-m32r/spinlock.h b/include/asm-m32r/spinlock.h
index 6608d8371c5..7de7def28da 100644
--- a/include/asm-m32r/spinlock.h
+++ b/include/asm-m32r/spinlock.h
@@ -14,57 +14,30 @@
 #include <asm/atomic.h>
 #include <asm/page.h>
 
-extern int printk(const char * fmt, ...)
-	__attribute__ ((format (printf, 1, 2)));
-
-#define RW_LOCK_BIAS		 0x01000000
-#define RW_LOCK_BIAS_STR	"0x01000000"
-
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
- */
-
-typedef struct {
-	volatile int slock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
-#define SPINLOCK_MAGIC	0xdead4ead
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define SPINLOCK_MAGIC_INIT	, SPINLOCK_MAGIC
-#else
-#define SPINLOCK_MAGIC_INIT	/* */
-#endif
-
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
-
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-
-/*
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ *
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
  *
  * We make no fairness assumptions. They have a cost.
  */
 
-#define spin_is_locked(x)	(*(volatile int *)(&(x)->slock) <= 0)
-#define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define __raw_spin_is_locked(x)		(*(volatile int *)(&(x)->slock) <= 0)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+#define __raw_spin_unlock_wait(x) \
+		do { cpu_relax(); } while (__raw_spin_is_locked(x))
 
 /**
- * _raw_spin_trylock - Try spin lock and return a result
+ * __raw_spin_trylock - Try spin lock and return a result
  * @lock: Pointer to the lock variable
  *
- * _raw_spin_trylock() tries to get the lock and returns a result.
+ * __raw_spin_trylock() tries to get the lock and returns a result.
  * On the m32r, the result value is 1 (= Success) or 0 (= Failure).
  */
-static inline int _raw_spin_trylock(spinlock_t *lock)
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	int oldval;
 	unsigned long tmp1, tmp2;
@@ -78,7 +51,7 @@ static inline int _raw_spin_trylock(spinlock_t *lock)
 	 * }
 	 */
 	__asm__ __volatile__ (
-		"# spin_trylock			\n\t"
+		"# __raw_spin_trylock		\n\t"
 		"ldi	%1, #0;			\n\t"
 		"mvfc	%2, psw;		\n\t"
 		"clrpsw	#0x40 -> nop;		\n\t"
@@ -97,16 +70,10 @@ static inline int _raw_spin_trylock(spinlock_t *lock)
 	return (oldval > 0);
 }
 
-static inline void _raw_spin_lock(spinlock_t *lock)
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	unsigned long tmp0, tmp1;
 
-#ifdef CONFIG_DEBUG_SPINLOCK
-	if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
-		printk("pc: %p\n", __builtin_return_address(0));
-		BUG();
-	}
-#endif
 	/*
 	 * lock->slock :  =1 : unlock
 	 *             : <=0 : lock
@@ -118,7 +85,7 @@ static inline void _raw_spin_lock(spinlock_t *lock)
 	 * }
 	 */
 	__asm__ __volatile__ (
-		"# spin_lock			\n\t"
+		"# __raw_spin_lock		\n\t"
 		".fillinsn			\n"
 		"1:				\n\t"
 		"mvfc	%1, psw;		\n\t"
@@ -145,12 +112,8 @@ static inline void _raw_spin_lock(spinlock_t *lock)
 	);
 }
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
-#endif
 	mb();
 	lock->slock = 1;
 }
@@ -164,59 +127,32 @@ static inline void _raw_spin_unlock(spinlock_t *lock)
  * can "mix" irq-safe locks - any writer needs to get a
  * irq-safe write-lock, but readers can get non-irqsafe
  * read-locks.
+ *
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ *
+ * The inline assembly is non-obvious. Think about it.
+ *
+ * Changed to use the same technique as rw semaphores.  See
+ * semaphore.h for details.  -ben
  */
-typedef struct {
-	volatile int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RWLOCK_MAGIC	0xdeaf1eed
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define RWLOCK_MAGIC_INIT	, RWLOCK_MAGIC
-#else
-#define RWLOCK_MAGIC_INIT	/* */
-#endif
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
-
-#define rwlock_init(x)	do { *(x) = RW_LOCK_UNLOCKED; } while(0)
 
 /**
  * read_can_lock - would read_trylock() succeed?
  * @lock: the rwlock in question.
  */
-#define read_can_lock(x) ((int)(x)->lock > 0)
+#define __raw_read_can_lock(x) ((int)(x)->lock > 0)
 
 /**
  * write_can_lock - would write_trylock() succeed?
  * @lock: the rwlock in question.
  */
-#define write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
-
-/*
- * On x86, we implement read-write locks as a 32-bit counter
- * with the high bit (sign) being the "contended" bit.
- *
- * The inline assembly is non-obvious. Think about it.
- *
- * Changed to use the same technique as rw semaphores.  See
- * semaphore.h for details.  -ben
- */
-/* the spinlock helpers are in arch/i386/kernel/semaphore.c */
+#define __raw_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
 
-static inline void _raw_read_lock(rwlock_t *rw)
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
 	unsigned long tmp0, tmp1;
 
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(rw->magic != RWLOCK_MAGIC);
-#endif
 	/*
 	 * rw->lock :  >0 : unlock
 	 *          : <=0 : lock
@@ -264,13 +200,10 @@ static inline void _raw_read_lock(rwlock_t *rw)
 	);
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
 	unsigned long tmp0, tmp1, tmp2;
 
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(rw->magic != RWLOCK_MAGIC);
-#endif
 	/*
 	 * rw->lock :  =RW_LOCK_BIAS_STR : unlock
 	 *          : !=RW_LOCK_BIAS_STR : lock
@@ -320,7 +253,7 @@ static inline void _raw_write_lock(rwlock_t *rw)
 	);
 }
 
-static inline void _raw_read_unlock(rwlock_t *rw)
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	unsigned long tmp0, tmp1;
 
@@ -342,7 +275,7 @@ static inline void _raw_read_unlock(rwlock_t *rw)
 	);
 }
 
-static inline void _raw_write_unlock(rwlock_t *rw)
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	unsigned long tmp0, tmp1, tmp2;
 
@@ -366,9 +299,9 @@ static inline void _raw_write_unlock(rwlock_t *rw)
 	);
 }
 
-#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+#define __raw_read_trylock(lock) generic__raw_read_trylock(lock)
 
-static inline int _raw_write_trylock(rwlock_t *lock)
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
diff --git a/include/asm-m32r/spinlock_types.h b/include/asm-m32r/spinlock_types.h
new file mode 100644
index 00000000000..7e9941c45f4
--- /dev/null
+++ b/include/asm-m32r/spinlock_types.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_M32R_SPINLOCK_TYPES_H
+#define _ASM_M32R_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile int slock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 1 }
+
+typedef struct {
+	volatile int lock;
+} raw_rwlock_t;
+
+#define RW_LOCK_BIAS			0x01000000
+#define RW_LOCK_BIAS_STR		"0x01000000"
+
+#define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
+
+#endif
diff --git a/include/asm-mips/spinlock.h b/include/asm-mips/spinlock.h
index 114d3eb98a6..4d0135b1115 100644
--- a/include/asm-mips/spinlock.h
+++ b/include/asm-mips/spinlock.h
@@ -16,20 +16,10 @@
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  */
 
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
-
-#define spin_lock_init(x)	do { (x)->lock = 0; } while(0)
-
-#define spin_is_locked(x)	((x)->lock != 0)
-#define spin_unlock_wait(x)	do { barrier(); } while ((x)->lock)
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define __raw_spin_is_locked(x)	((x)->lock != 0)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+#define __raw_spin_unlock_wait(x) \
+		do { cpu_relax(); } while ((x)->lock)
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
@@ -38,13 +28,13 @@ typedef struct {
  * We make no fairness assumptions.  They have a cost.
  */
 
-static inline void _raw_spin_lock(spinlock_t *lock)
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	unsigned int tmp;
 
 	if (R10000_LLSC_WAR) {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_spin_lock	\n"
+		"	.set	noreorder	# __raw_spin_lock	\n"
 		"1:	ll	%1, %2					\n"
 		"	bnez	%1, 1b					\n"
 		"	 li	%1, 1					\n"
@@ -58,7 +48,7 @@ static inline void _raw_spin_lock(spinlock_t *lock)
 		: "memory");
 	} else {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_spin_lock	\n"
+		"	.set	noreorder	# __raw_spin_lock	\n"
 		"1:	ll	%1, %2					\n"
 		"	bnez	%1, 1b					\n"
 		"	 li	%1, 1					\n"
@@ -72,10 +62,10 @@ static inline void _raw_spin_lock(spinlock_t *lock)
 	}
 }
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
-	"	.set	noreorder	# _raw_spin_unlock	\n"
+	"	.set	noreorder	# __raw_spin_unlock	\n"
 	"	sync						\n"
 	"	sw	$0, %0					\n"
 	"	.set\treorder					\n"
@@ -84,13 +74,13 @@ static inline void _raw_spin_unlock(spinlock_t *lock)
 	: "memory");
 }
 
-static inline unsigned int _raw_spin_trylock(spinlock_t *lock)
+static inline unsigned int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	unsigned int temp, res;
 
 	if (R10000_LLSC_WAR) {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_spin_trylock	\n"
+		"	.set	noreorder	# __raw_spin_trylock	\n"
 		"1:	ll	%0, %3					\n"
 		"	ori	%2, %0, 1				\n"
 		"	sc	%2, %1					\n"
@@ -104,7 +94,7 @@ static inline unsigned int _raw_spin_trylock(spinlock_t *lock)
 		: "memory");
 	} else {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_spin_trylock	\n"
+		"	.set	noreorder	# __raw_spin_trylock	\n"
 		"1:	ll	%0, %3					\n"
 		"	ori	%2, %0, 1				\n"
 		"	sc	%2, %1					\n"
@@ -129,24 +119,13 @@ static inline unsigned int _raw_spin_trylock(spinlock_t *lock)
  * read-locks.
  */
 
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
-
-#define rwlock_init(x)  do { *(x) = RW_LOCK_UNLOCKED; } while(0)
-
-static inline void _raw_read_lock(rwlock_t *rw)
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 
 	if (R10000_LLSC_WAR) {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_read_lock	\n"
+		"	.set	noreorder	# __raw_read_lock	\n"
 		"1:	ll	%1, %2					\n"
 		"	bltz	%1, 1b					\n"
 		"	 addu	%1, 1					\n"
@@ -160,7 +139,7 @@ static inline void _raw_read_lock(rwlock_t *rw)
 		: "memory");
 	} else {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_read_lock	\n"
+		"	.set	noreorder	# __raw_read_lock	\n"
 		"1:	ll	%1, %2					\n"
 		"	bltz	%1, 1b					\n"
 		"	 addu	%1, 1					\n"
@@ -177,13 +156,13 @@ static inline void _raw_read_lock(rwlock_t *rw)
 /* Note the use of sub, not subu which will make the kernel die with an
    overflow exception if we ever try to unlock an rwlock that is already
    unlocked or is being held by a writer.  */
-static inline void _raw_read_unlock(rwlock_t *rw)
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 
 	if (R10000_LLSC_WAR) {
 		__asm__ __volatile__(
-		"1:	ll	%1, %2		# _raw_read_unlock	\n"
+		"1:	ll	%1, %2		# __raw_read_unlock	\n"
 		"	sub	%1, 1					\n"
 		"	sc	%1, %0					\n"
 		"	beqzl	%1, 1b					\n"
@@ -193,7 +172,7 @@ static inline void _raw_read_unlock(rwlock_t *rw)
 		: "memory");
 	} else {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_read_unlock	\n"
+		"	.set	noreorder	# __raw_read_unlock	\n"
 		"1:	ll	%1, %2					\n"
 		"	sub	%1, 1					\n"
 		"	sc	%1, %0					\n"
@@ -206,13 +185,13 @@ static inline void _raw_read_unlock(rwlock_t *rw)
 	}
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 
 	if (R10000_LLSC_WAR) {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_write_lock	\n"
+		"	.set	noreorder	# __raw_write_lock	\n"
 		"1:	ll	%1, %2					\n"
 		"	bnez	%1, 1b					\n"
 		"	 lui	%1, 0x8000				\n"
@@ -226,7 +205,7 @@ static inline void _raw_write_lock(rwlock_t *rw)
 		: "memory");
 	} else {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_write_lock	\n"
+		"	.set	noreorder	# __raw_write_lock	\n"
 		"1:	ll	%1, %2					\n"
 		"	bnez	%1, 1b					\n"
 		"	 lui	%1, 0x8000				\n"
@@ -241,26 +220,26 @@ static inline void _raw_write_lock(rwlock_t *rw)
 	}
 }
 
-static inline void _raw_write_unlock(rwlock_t *rw)
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	__asm__ __volatile__(
-	"	sync			# _raw_write_unlock	\n"
+	"	sync			# __raw_write_unlock	\n"
 	"	sw	$0, %0					\n"
 	: "=m" (rw->lock)
 	: "m" (rw->lock)
 	: "memory");
 }
 
-#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+#define __raw_read_trylock(lock) generic__raw_read_trylock(lock)
 
-static inline int _raw_write_trylock(rwlock_t *rw)
+static inline int __raw_write_trylock(raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 	int ret;
 
 	if (R10000_LLSC_WAR) {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_write_trylock	\n"
+		"	.set	noreorder	# __raw_write_trylock	\n"
 		"	li	%2, 0					\n"
 		"1:	ll	%1, %3					\n"
 		"	bnez	%1, 2f					\n"
@@ -277,7 +256,7 @@ static inline int _raw_write_trylock(rwlock_t *rw)
 		: "memory");
 	} else {
 		__asm__ __volatile__(
-		"	.set	noreorder	# _raw_write_trylock	\n"
+		"	.set	noreorder	# __raw_write_trylock	\n"
 		"	li	%2, 0					\n"
 		"1:	ll	%1, %3					\n"
 		"	bnez	%1, 2f					\n"
diff --git a/include/asm-mips/spinlock_types.h b/include/asm-mips/spinlock_types.h
new file mode 100644
index 00000000000..ce26c5048b1
--- /dev/null
+++ b/include/asm-mips/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_SPINLOCK_TYPES_H
+#define _ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0 }
+
+#endif
diff --git a/include/asm-parisc/atomic.h b/include/asm-parisc/atomic.h
index e24f7579adb..048a2c7fd0c 100644
--- a/include/asm-parisc/atomic.h
+++ b/include/asm-parisc/atomic.h
@@ -24,19 +24,19 @@
 #  define ATOMIC_HASH_SIZE 4
 #  define ATOMIC_HASH(a) (&(__atomic_hash[ (((unsigned long) a)/L1_CACHE_BYTES) & (ATOMIC_HASH_SIZE-1) ]))
 
-extern spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned;
+extern raw_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned;
 
-/* Can't use _raw_spin_lock_irq because of #include problems, so
+/* Can't use raw_spin_lock_irq because of #include problems, so
  * this is the substitute */
 #define _atomic_spin_lock_irqsave(l,f) do {	\
-	spinlock_t *s = ATOMIC_HASH(l);		\
+	raw_spinlock_t *s = ATOMIC_HASH(l);		\
 	local_irq_save(f);			\
-	_raw_spin_lock(s);			\
+	__raw_spin_lock(s);			\
 } while(0)
 
 #define _atomic_spin_unlock_irqrestore(l,f) do {	\
-	spinlock_t *s = ATOMIC_HASH(l);			\
-	_raw_spin_unlock(s);				\
+	raw_spinlock_t *s = ATOMIC_HASH(l);			\
+	__raw_spin_unlock(s);				\
 	local_irq_restore(f);				\
 } while(0)
 
diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h
index 928e5ef850b..af7db694b22 100644
--- a/include/asm-parisc/bitops.h
+++ b/include/asm-parisc/bitops.h
@@ -2,7 +2,7 @@
 #define _PARISC_BITOPS_H
 
 #include <linux/compiler.h>
-#include <asm/system.h>
+#include <asm/spinlock.h>
 #include <asm/byteorder.h>
 #include <asm/atomic.h>
 
diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h
index 06732719d92..aa592d8c0e3 100644
--- a/include/asm-parisc/cacheflush.h
+++ b/include/asm-parisc/cacheflush.h
@@ -3,6 +3,7 @@
 
 #include <linux/config.h>
 #include <linux/mm.h>
+#include <asm/cache.h>	/* for flush_user_dcache_range_asm() proto */
 
 /* The usual comment is "Caches aren't brain-dead on the <architecture>".
  * Unfortunately, that doesn't apply to PA-RISC. */
diff --git a/include/asm-parisc/processor.h b/include/asm-parisc/processor.h
index 0b61f51d846..a9dfadd0565 100644
--- a/include/asm-parisc/processor.h
+++ b/include/asm-parisc/processor.h
@@ -11,6 +11,7 @@
 #ifndef __ASSEMBLY__
 #include <linux/config.h>
 #include <linux/threads.h>
+#include <linux/spinlock_types.h>
 
 #include <asm/hardware.h>
 #include <asm/page.h>
diff --git a/include/asm-parisc/spinlock.h b/include/asm-parisc/spinlock.h
index 679ea1c651e..43eaa6e742e 100644
--- a/include/asm-parisc/spinlock.h
+++ b/include/asm-parisc/spinlock.h
@@ -2,30 +2,25 @@
 #define __ASM_SPINLOCK_H
 
 #include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/spinlock_types.h>
 
 /* Note that PA-RISC has to use `1' to mean unlocked and `0' to mean locked
  * since it only has load-and-zero. Moreover, at least on some PA processors,
  * the semaphore address has to be 16-byte aligned.
  */
 
-#ifndef CONFIG_DEBUG_SPINLOCK
-
-#define __SPIN_LOCK_UNLOCKED	{ { 1, 1, 1, 1 } }
-#undef SPIN_LOCK_UNLOCKED
-#define SPIN_LOCK_UNLOCKED (spinlock_t) __SPIN_LOCK_UNLOCKED
-
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-
-static inline int spin_is_locked(spinlock_t *x)
+static inline int __raw_spin_is_locked(raw_spinlock_t *x)
 {
 	volatile unsigned int *a = __ldcw_align(x);
 	return *a == 0;
 }
 
-#define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+#define __raw_spin_unlock_wait(x) \
+		do { cpu_relax(); } while (__raw_spin_is_locked(x))
 
-static inline void _raw_spin_lock(spinlock_t *x)
+static inline void __raw_spin_lock(raw_spinlock_t *x)
 {
 	volatile unsigned int *a;
 
@@ -36,7 +31,7 @@ static inline void _raw_spin_lock(spinlock_t *x)
 	mb();
 }
 
-static inline void _raw_spin_unlock(spinlock_t *x)
+static inline void __raw_spin_unlock(raw_spinlock_t *x)
 {
 	volatile unsigned int *a;
 	mb();
@@ -45,7 +40,7 @@ static inline void _raw_spin_unlock(spinlock_t *x)
 	mb();
 }
 
-static inline int _raw_spin_trylock(spinlock_t *x)
+static inline int __raw_spin_trylock(raw_spinlock_t *x)
 {
 	volatile unsigned int *a;
 	int ret;
@@ -57,131 +52,38 @@ static inline int _raw_spin_trylock(spinlock_t *x)
 
 	return ret;
 }
-	
-#define spin_lock_own(LOCK, LOCATION)	((void)0)
-
-#else /* !(CONFIG_DEBUG_SPINLOCK) */
-
-#define SPINLOCK_MAGIC	0x1D244B3C
-
-#define __SPIN_LOCK_UNLOCKED	{ { 1, 1, 1, 1 }, SPINLOCK_MAGIC, 10, __FILE__ , NULL, 0, -1, NULL, NULL }
-#undef SPIN_LOCK_UNLOCKED
-#define SPIN_LOCK_UNLOCKED (spinlock_t) __SPIN_LOCK_UNLOCKED
-
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-
-#define CHECK_LOCK(x)							\
-	do {								\
-	 	if (unlikely((x)->magic != SPINLOCK_MAGIC)) {			\
-			printk(KERN_ERR "%s:%d: spin_is_locked"		\
-			" on uninitialized spinlock %p.\n",		\
-				__FILE__, __LINE__, (x)); 		\
-		} 							\
-	} while(0)
-
-#define spin_is_locked(x)						\
-	({								\
-	 	CHECK_LOCK(x);						\
-		volatile unsigned int *a = __ldcw_align(x);		\
-		if (unlikely((*a == 0) && (x)->babble)) {				\
-			(x)->babble--;					\
-			printk("KERN_WARNING				\
-				%s:%d: spin_is_locked(%s/%p) already"	\
-				" locked by %s:%d in %s at %p(%d)\n",	\
-				__FILE__,__LINE__, (x)->module,	(x),	\
-				(x)->bfile, (x)->bline, (x)->task->comm,\
-				(x)->previous, (x)->oncpu);		\
-		}							\
-		*a == 0;						\
-	})
-
-#define spin_unlock_wait(x)						\
-	do {								\
-	 	CHECK_LOCK(x);						\
-		volatile unsigned int *a = __ldcw_align(x);		\
-		if (unlikely((*a == 0) && (x)->babble)) {				\
-			(x)->babble--;					\
-			printk("KERN_WARNING				\
-				%s:%d: spin_unlock_wait(%s/%p)"		\
-				" owned by %s:%d in %s at %p(%d)\n",	\
-				__FILE__,__LINE__, (x)->module, (x),	\
-				(x)->bfile, (x)->bline, (x)->task->comm,\
-				(x)->previous, (x)->oncpu);		\
-		}							\
-		barrier();						\
-	} while (*((volatile unsigned char *)(__ldcw_align(x))) == 0)
-
-extern void _dbg_spin_lock(spinlock_t *lock, const char *base_file, int line_no);
-extern void _dbg_spin_unlock(spinlock_t *lock, const char *, int);
-extern int _dbg_spin_trylock(spinlock_t * lock, const char *, int);
-
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
-
-#define _raw_spin_unlock(lock)	_dbg_spin_unlock(lock, __FILE__, __LINE__)
-#define _raw_spin_lock(lock) _dbg_spin_lock(lock, __FILE__, __LINE__)
-#define _raw_spin_trylock(lock) _dbg_spin_trylock(lock, __FILE__, __LINE__)
-
-/* just in case we need it */
-#define spin_lock_own(LOCK, LOCATION)					\
-do {									\
-	volatile unsigned int *a = __ldcw_align(LOCK);			\
-	if (!((*a == 0) && ((LOCK)->oncpu == smp_processor_id())))	\
-		printk("KERN_WARNING					\
-			%s: called on %d from %p but lock %s on %d\n",	\
-			LOCATION, smp_processor_id(),			\
-			__builtin_return_address(0),			\
-			(*a == 0) ? "taken" : "freed", (LOCK)->on_cpu);	\
-} while (0)
-
-#endif /* !(CONFIG_DEBUG_SPINLOCK) */
 
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
  */
-typedef struct {
-	spinlock_t lock;
-	volatile int counter;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { __SPIN_LOCK_UNLOCKED, 0 }
-
-#define rwlock_init(lp)	do { *(lp) = RW_LOCK_UNLOCKED; } while (0)
 
-#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+#define __raw_read_trylock(lock) generic__raw_read_trylock(lock)
 
 /* read_lock, read_unlock are pretty straightforward.  Of course it somehow
  * sucks we end up saving/restoring flags twice for read_lock_irqsave aso. */
 
-#ifdef CONFIG_DEBUG_RWLOCK
-extern void _dbg_read_lock(rwlock_t * rw, const char *bfile, int bline);
-#define _raw_read_lock(rw) _dbg_read_lock(rw, __FILE__, __LINE__)
-#else
-static  __inline__ void _raw_read_lock(rwlock_t *rw)
+static  __inline__ void __raw_read_lock(raw_rwlock_t *rw)
 {
 	unsigned long flags;
 	local_irq_save(flags);
-	_raw_spin_lock(&rw->lock); 
+	__raw_spin_lock(&rw->lock);
 
 	rw->counter++;
 
-	_raw_spin_unlock(&rw->lock);
+	__raw_spin_unlock(&rw->lock);
 	local_irq_restore(flags);
 }
-#endif	/* CONFIG_DEBUG_RWLOCK */
 
-static  __inline__ void _raw_read_unlock(rwlock_t *rw)
+static  __inline__ void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	unsigned long flags;
 	local_irq_save(flags);
-	_raw_spin_lock(&rw->lock); 
+	__raw_spin_lock(&rw->lock);
 
 	rw->counter--;
 
-	_raw_spin_unlock(&rw->lock);
+	__raw_spin_unlock(&rw->lock);
 	local_irq_restore(flags);
 }
 
@@ -194,20 +96,17 @@ static  __inline__ void _raw_read_unlock(rwlock_t *rw)
  * writers) in interrupt handlers someone fucked up and we'd dead-lock
  * sooner or later anyway.   prumpf */
 
-#ifdef CONFIG_DEBUG_RWLOCK
-extern void _dbg_write_lock(rwlock_t * rw, const char *bfile, int bline);
-#define _raw_write_lock(rw) _dbg_write_lock(rw, __FILE__, __LINE__)
-#else
-static  __inline__ void _raw_write_lock(rwlock_t *rw)
+static  __inline__ void __raw_write_lock(raw_rwlock_t *rw)
 {
 retry:
-	_raw_spin_lock(&rw->lock);
+	__raw_spin_lock(&rw->lock);
 
 	if(rw->counter != 0) {
 		/* this basically never happens */
-		_raw_spin_unlock(&rw->lock);
+		__raw_spin_unlock(&rw->lock);
 
-		while(rw->counter != 0);
+		while (rw->counter != 0)
+			cpu_relax();
 
 		goto retry;
 	}
@@ -215,26 +114,21 @@ retry:
 	/* got it.  now leave without unlocking */
 	rw->counter = -1; /* remember we are locked */
 }
-#endif /* CONFIG_DEBUG_RWLOCK */
 
 /* write_unlock is absolutely trivial - we don't have to wait for anything */
 
-static  __inline__ void _raw_write_unlock(rwlock_t *rw)
+static  __inline__ void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	rw->counter = 0;
-	_raw_spin_unlock(&rw->lock);
+	__raw_spin_unlock(&rw->lock);
 }
 
-#ifdef CONFIG_DEBUG_RWLOCK
-extern int _dbg_write_trylock(rwlock_t * rw, const char *bfile, int bline);
-#define _raw_write_trylock(rw) _dbg_write_trylock(rw, __FILE__, __LINE__)
-#else
-static  __inline__ int _raw_write_trylock(rwlock_t *rw)
+static  __inline__ int __raw_write_trylock(raw_rwlock_t *rw)
 {
-	_raw_spin_lock(&rw->lock);
+	__raw_spin_lock(&rw->lock);
 	if (rw->counter != 0) {
 		/* this basically never happens */
-		_raw_spin_unlock(&rw->lock);
+		__raw_spin_unlock(&rw->lock);
 
 		return 0;
 	}
@@ -243,14 +137,13 @@ static  __inline__ int _raw_write_trylock(rwlock_t *rw)
 	rw->counter = -1; /* remember we are locked */
 	return 1;
 }
-#endif /* CONFIG_DEBUG_RWLOCK */
 
-static __inline__ int is_read_locked(rwlock_t *rw)
+static __inline__ int __raw_is_read_locked(raw_rwlock_t *rw)
 {
 	return rw->counter > 0;
 }
 
-static __inline__ int is_write_locked(rwlock_t *rw)
+static __inline__ int __raw_is_write_locked(raw_rwlock_t *rw)
 {
 	return rw->counter < 0;
 }
diff --git a/include/asm-parisc/spinlock_types.h b/include/asm-parisc/spinlock_types.h
new file mode 100644
index 00000000000..785bba822fb
--- /dev/null
+++ b/include/asm-parisc/spinlock_types.h
@@ -0,0 +1,21 @@
+#ifndef __ASM_SPINLOCK_TYPES_H
+#define __ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int lock[4];
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ { 1, 1, 1, 1 } }
+
+typedef struct {
+	raw_spinlock_t lock;
+	volatile int counter;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ __RAW_SPIN_LOCK_UNLOCKED, 0 }
+
+#endif
diff --git a/include/asm-parisc/system.h b/include/asm-parisc/system.h
index 81c54333903..26ff844a21c 100644
--- a/include/asm-parisc/system.h
+++ b/include/asm-parisc/system.h
@@ -160,29 +160,7 @@ static inline void set_eiem(unsigned long val)
 })
 
 #ifdef CONFIG_SMP
-/*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
- */
-
-typedef struct {
-	volatile unsigned int lock[4];
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned long magic;
-	volatile unsigned int babble;
-	const char *module;
-	char *bfile;
-	int bline;
-	int oncpu;
-	void *previous;
-	struct task_struct * task;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
-#define __lock_aligned __attribute__((__section__(".data.lock_aligned")))
-
+# define __lock_aligned __attribute__((__section__(".data.lock_aligned")))
 #endif
 
 #define KERNEL_START (0x10100000 - 0x1000)
diff --git a/include/asm-ppc/spinlock.h b/include/asm-ppc/spinlock.h
index 909199aae10..20edcf2a6e0 100644
--- a/include/asm-ppc/spinlock.h
+++ b/include/asm-ppc/spinlock.h
@@ -5,41 +5,21 @@
 
 /*
  * Simple spin lock operations.
+ *
+ * (the type definitions are in asm/raw_spinlock_types.h)
  */
 
-typedef struct {
-	volatile unsigned long lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	volatile unsigned long owner_pc;
-	volatile unsigned long owner_cpu;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
-#ifdef __KERNEL__
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define SPINLOCK_DEBUG_INIT     , 0, 0
-#else
-#define SPINLOCK_DEBUG_INIT     /* */
-#endif
-
-#define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 SPINLOCK_DEBUG_INIT }
-
-#define spin_lock_init(x) 	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-#define spin_is_locked(x)	((x)->lock != 0)
-#define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
-
-#ifndef CONFIG_DEBUG_SPINLOCK
-
-static inline void _raw_spin_lock(spinlock_t *lock)
+#define __raw_spin_is_locked(x)		((x)->lock != 0)
+#define __raw_spin_unlock_wait(lock) \
+	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	unsigned long tmp;
 
 	__asm__ __volatile__(
-	"b	1f		# spin_lock\n\
+	"b	1f		# __raw_spin_lock\n\
 2:	lwzx	%0,0,%1\n\
 	cmpwi	0,%0,0\n\
 	bne+	2b\n\
@@ -55,21 +35,13 @@ static inline void _raw_spin_lock(spinlock_t *lock)
 	: "cr0", "memory");
 }
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-	__asm__ __volatile__("eieio		# spin_unlock": : :"memory");
+	__asm__ __volatile__("eieio	# __raw_spin_unlock": : :"memory");
 	lock->lock = 0;
 }
 
-#define _raw_spin_trylock(l) (!test_and_set_bit(0,&(l)->lock))
-
-#else
-
-extern void _raw_spin_lock(spinlock_t *lock);
-extern void _raw_spin_unlock(spinlock_t *lock);
-extern int _raw_spin_trylock(spinlock_t *lock);
-
-#endif
+#define __raw_spin_trylock(l) (!test_and_set_bit(0,&(l)->lock))
 
 /*
  * Read-write spinlocks, allowing multiple readers
@@ -81,22 +53,11 @@ extern int _raw_spin_trylock(spinlock_t *lock);
  * irq-safe write-lock, but readers can get non-irqsafe
  * read-locks.
  */
-typedef struct {
-	volatile signed int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
 
-#define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
-#define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0)
+#define __raw_read_can_lock(rw)	((rw)->lock >= 0)
+#define __raw_write_can_lock(rw)	(!(rw)->lock)
 
-#define read_can_lock(rw)	((rw)->lock >= 0)
-#define write_can_lock(rw)	(!(rw)->lock)
-
-#ifndef CONFIG_DEBUG_SPINLOCK
-
-static __inline__ int _raw_read_trylock(rwlock_t *rw)
+static __inline__ int __raw_read_trylock(raw_rwlock_t *rw)
 {
 	signed int tmp;
 
@@ -116,7 +77,7 @@ static __inline__ int _raw_read_trylock(rwlock_t *rw)
 	return tmp > 0;
 }
 
-static __inline__ void _raw_read_lock(rwlock_t *rw)
+static __inline__ void __raw_read_lock(raw_rwlock_t *rw)
 {
 	signed int tmp;
 
@@ -137,7 +98,7 @@ static __inline__ void _raw_read_lock(rwlock_t *rw)
 	: "cr0", "memory");
 }
 
-static __inline__ void _raw_read_unlock(rwlock_t *rw)
+static __inline__ void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	signed int tmp;
 
@@ -153,7 +114,7 @@ static __inline__ void _raw_read_unlock(rwlock_t *rw)
 	: "cr0", "memory");
 }
 
-static __inline__ int _raw_write_trylock(rwlock_t *rw)
+static __inline__ int __raw_write_trylock(raw_rwlock_t *rw)
 {
 	signed int tmp;
 
@@ -173,7 +134,7 @@ static __inline__ int _raw_write_trylock(rwlock_t *rw)
 	return tmp == 0;
 }
 
-static __inline__ void _raw_write_lock(rwlock_t *rw)
+static __inline__ void __raw_write_lock(raw_rwlock_t *rw)
 {
 	signed int tmp;
 
@@ -194,22 +155,10 @@ static __inline__ void _raw_write_lock(rwlock_t *rw)
 	: "cr0", "memory");
 }
 
-static __inline__ void _raw_write_unlock(rwlock_t *rw)
+static __inline__ void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	__asm__ __volatile__("eieio		# write_unlock": : :"memory");
 	rw->lock = 0;
 }
 
-#else
-
-extern void _raw_read_lock(rwlock_t *rw);
-extern void _raw_read_unlock(rwlock_t *rw);
-extern void _raw_write_lock(rwlock_t *rw);
-extern void _raw_write_unlock(rwlock_t *rw);
-extern int _raw_read_trylock(rwlock_t *rw);
-extern int _raw_write_trylock(rwlock_t *rw);
-
-#endif
-
 #endif /* __ASM_SPINLOCK_H */
-#endif /* __KERNEL__ */
diff --git a/include/asm-ppc/spinlock_types.h b/include/asm-ppc/spinlock_types.h
new file mode 100644
index 00000000000..7919ccc75b8
--- /dev/null
+++ b/include/asm-ppc/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef __ASM_SPINLOCK_TYPES_H
+#define __ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned long lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile signed int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0 }
+
+#endif
diff --git a/include/asm-ppc64/spinlock.h b/include/asm-ppc64/spinlock.h
index acd11564dd7..14cb895bb60 100644
--- a/include/asm-ppc64/spinlock.h
+++ b/include/asm-ppc64/spinlock.h
@@ -15,36 +15,42 @@
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
  */
 #include <linux/config.h>
 #include <asm/paca.h>
 #include <asm/hvcall.h>
 #include <asm/iSeries/HvCall.h>
 
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
+#define __raw_spin_is_locked(x)		((x)->slock != 0)
 
-typedef struct {
-	volatile signed int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
+/*
+ * This returns the old value in the lock, so we succeeded
+ * in getting the lock if the return value is 0.
+ */
+static __inline__ unsigned long __spin_trylock(raw_spinlock_t *lock)
+{
+	unsigned long tmp, tmp2;
 
-#ifdef __KERNEL__
-#define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 }
+	__asm__ __volatile__(
+"	lwz		%1,%3(13)		# __spin_trylock\n\
+1:	lwarx		%0,0,%2\n\
+	cmpwi		0,%0,0\n\
+	bne-		2f\n\
+	stwcx.		%1,0,%2\n\
+	bne-		1b\n\
+	isync\n\
+2:"	: "=&r" (tmp), "=&r" (tmp2)
+	: "r" (&lock->slock), "i" (offsetof(struct paca_struct, lock_token))
+	: "cr0", "memory");
 
-#define spin_is_locked(x)	((x)->lock != 0)
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
+	return tmp;
+}
 
-static __inline__ void _raw_spin_unlock(spinlock_t *lock)
+static int __inline__ __raw_spin_trylock(raw_spinlock_t *lock)
 {
-	__asm__ __volatile__("lwsync	# spin_unlock": : :"memory");
-	lock->lock = 0;
+	return __spin_trylock(lock) == 0;
 }
 
 /*
@@ -64,44 +70,15 @@ static __inline__ void _raw_spin_unlock(spinlock_t *lock)
 #if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (get_paca()->lppaca.shared_proc)
-extern void __spin_yield(spinlock_t *lock);
-extern void __rw_yield(rwlock_t *lock);
+extern void __spin_yield(raw_spinlock_t *lock);
+extern void __rw_yield(raw_rwlock_t *lock);
 #else /* SPLPAR || ISERIES */
 #define __spin_yield(x)	barrier()
 #define __rw_yield(x)	barrier()
 #define SHARED_PROCESSOR	0
 #endif
-extern void spin_unlock_wait(spinlock_t *lock);
-
-/*
- * This returns the old value in the lock, so we succeeded
- * in getting the lock if the return value is 0.
- */
-static __inline__ unsigned long __spin_trylock(spinlock_t *lock)
-{
-	unsigned long tmp, tmp2;
-
-	__asm__ __volatile__(
-"	lwz		%1,%3(13)		# __spin_trylock\n\
-1:	lwarx		%0,0,%2\n\
-	cmpwi		0,%0,0\n\
-	bne-		2f\n\
-	stwcx.		%1,0,%2\n\
-	bne-		1b\n\
-	isync\n\
-2:"	: "=&r" (tmp), "=&r" (tmp2)
-	: "r" (&lock->lock), "i" (offsetof(struct paca_struct, lock_token))
-	: "cr0", "memory");
-
-	return tmp;
-}
-
-static int __inline__ _raw_spin_trylock(spinlock_t *lock)
-{
-	return __spin_trylock(lock) == 0;
-}
 
-static void __inline__ _raw_spin_lock(spinlock_t *lock)
+static void __inline__ __raw_spin_lock(raw_spinlock_t *lock)
 {
 	while (1) {
 		if (likely(__spin_trylock(lock) == 0))
@@ -110,12 +87,12 @@ static void __inline__ _raw_spin_lock(spinlock_t *lock)
 			HMT_low();
 			if (SHARED_PROCESSOR)
 				__spin_yield(lock);
-		} while (unlikely(lock->lock != 0));
+		} while (unlikely(lock->slock != 0));
 		HMT_medium();
 	}
 }
 
-static void __inline__ _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags)
+static void __inline__ __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
 {
 	unsigned long flags_dis;
 
@@ -128,12 +105,20 @@ static void __inline__ _raw_spin_lock_flags(spinlock_t *lock, unsigned long flag
 			HMT_low();
 			if (SHARED_PROCESSOR)
 				__spin_yield(lock);
-		} while (unlikely(lock->lock != 0));
+		} while (unlikely(lock->slock != 0));
 		HMT_medium();
 		local_irq_restore(flags_dis);
 	}
 }
 
+static __inline__ void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+	__asm__ __volatile__("lwsync	# __raw_spin_unlock": : :"memory");
+	lock->slock = 0;
+}
+
+extern void __raw_spin_unlock_wait(raw_spinlock_t *lock);
+
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -144,24 +129,15 @@ static void __inline__ _raw_spin_lock_flags(spinlock_t *lock, unsigned long flag
  * irq-safe write-lock, but readers can get non-irqsafe
  * read-locks.
  */
-#define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
 
-#define rwlock_init(x)		do { *(x) = RW_LOCK_UNLOCKED; } while(0)
-
-#define read_can_lock(rw)	((rw)->lock >= 0)
-#define write_can_lock(rw)	(!(rw)->lock)
-
-static __inline__ void _raw_write_unlock(rwlock_t *rw)
-{
-	__asm__ __volatile__("lwsync		# write_unlock": : :"memory");
-	rw->lock = 0;
-}
+#define __raw_read_can_lock(rw)		((rw)->lock >= 0)
+#define __raw_write_can_lock(rw)	(!(rw)->lock)
 
 /*
  * This returns the old value in the lock + 1,
  * so we got a read lock if the return value is > 0.
  */
-static long __inline__ __read_trylock(rwlock_t *rw)
+static long __inline__ __read_trylock(raw_rwlock_t *rw)
 {
 	long tmp;
 
@@ -180,45 +156,11 @@ static long __inline__ __read_trylock(rwlock_t *rw)
 	return tmp;
 }
 
-static int __inline__ _raw_read_trylock(rwlock_t *rw)
-{
-	return __read_trylock(rw) > 0;
-}
-
-static void __inline__ _raw_read_lock(rwlock_t *rw)
-{
-	while (1) {
-		if (likely(__read_trylock(rw) > 0))
-			break;
-		do {
-			HMT_low();
-			if (SHARED_PROCESSOR)
-				__rw_yield(rw);
-		} while (unlikely(rw->lock < 0));
-		HMT_medium();
-	}
-}
-
-static void __inline__ _raw_read_unlock(rwlock_t *rw)
-{
-	long tmp;
-
-	__asm__ __volatile__(
-	"eieio				# read_unlock\n\
-1:	lwarx		%0,0,%1\n\
-	addic		%0,%0,-1\n\
-	stwcx.		%0,0,%1\n\
-	bne-		1b"
-	: "=&r"(tmp)
-	: "r"(&rw->lock)
-	: "cr0", "memory");
-}
-
 /*
  * This returns the old value in the lock,
  * so we got the write lock if the return value is 0.
  */
-static __inline__ long __write_trylock(rwlock_t *rw)
+static __inline__ long __write_trylock(raw_rwlock_t *rw)
 {
 	long tmp, tmp2;
 
@@ -237,12 +179,21 @@ static __inline__ long __write_trylock(rwlock_t *rw)
 	return tmp;
 }
 
-static int __inline__ _raw_write_trylock(rwlock_t *rw)
+static void __inline__ __raw_read_lock(raw_rwlock_t *rw)
 {
-	return __write_trylock(rw) == 0;
+	while (1) {
+		if (likely(__read_trylock(rw) > 0))
+			break;
+		do {
+			HMT_low();
+			if (SHARED_PROCESSOR)
+				__rw_yield(rw);
+		} while (unlikely(rw->lock < 0));
+		HMT_medium();
+	}
 }
 
-static void __inline__ _raw_write_lock(rwlock_t *rw)
+static void __inline__ __raw_write_lock(raw_rwlock_t *rw)
 {
 	while (1) {
 		if (likely(__write_trylock(rw) == 0))
@@ -256,5 +207,35 @@ static void __inline__ _raw_write_lock(rwlock_t *rw)
 	}
 }
 
-#endif /* __KERNEL__ */
+static int __inline__ __raw_read_trylock(raw_rwlock_t *rw)
+{
+	return __read_trylock(rw) > 0;
+}
+
+static int __inline__ __raw_write_trylock(raw_rwlock_t *rw)
+{
+	return __write_trylock(rw) == 0;
+}
+
+static void __inline__ __raw_read_unlock(raw_rwlock_t *rw)
+{
+	long tmp;
+
+	__asm__ __volatile__(
+	"eieio				# read_unlock\n\
+1:	lwarx		%0,0,%1\n\
+	addic		%0,%0,-1\n\
+	stwcx.		%0,0,%1\n\
+	bne-		1b"
+	: "=&r"(tmp)
+	: "r"(&rw->lock)
+	: "cr0", "memory");
+}
+
+static __inline__ void __raw_write_unlock(raw_rwlock_t *rw)
+{
+	__asm__ __volatile__("lwsync	# write_unlock": : :"memory");
+	rw->lock = 0;
+}
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-ppc64/spinlock_types.h b/include/asm-ppc64/spinlock_types.h
new file mode 100644
index 00000000000..a37c8eabb9f
--- /dev/null
+++ b/include/asm-ppc64/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef __ASM_SPINLOCK_TYPES_H
+#define __ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int slock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile signed int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0 }
+
+#endif
diff --git a/include/asm-s390/spinlock.h b/include/asm-s390/spinlock.h
index 321b23bba1e..273dbecf8ac 100644
--- a/include/asm-s390/spinlock.h
+++ b/include/asm-s390/spinlock.h
@@ -27,25 +27,19 @@ _raw_compare_and_swap(volatile unsigned int *lock,
  * on the local processor, one does not.
  *
  * We make no fairness assumptions. They have a cost.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
  */
 
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} __attribute__ ((aligned (4))) spinlock_t;
-
-#define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 }
-#define spin_lock_init(lp)	do { (lp)->lock = 0; } while(0)
-#define spin_unlock_wait(lp)	do { barrier(); } while(((volatile spinlock_t *)(lp))->lock)
-#define spin_is_locked(x)	((x)->lock != 0)
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define __raw_spin_is_locked(x) ((x)->lock != 0)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+#define __raw_spin_unlock_wait(lock) \
+	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
 
-extern void _raw_spin_lock_wait(spinlock_t *lp, unsigned int pc);
-extern int _raw_spin_trylock_retry(spinlock_t *lp, unsigned int pc);
+extern void _raw_spin_lock_wait(raw_spinlock_t *lp, unsigned int pc);
+extern int _raw_spin_trylock_retry(raw_spinlock_t *lp, unsigned int pc);
 
-static inline void _raw_spin_lock(spinlock_t *lp)
+static inline void __raw_spin_lock(raw_spinlock_t *lp)
 {
 	unsigned long pc = 1 | (unsigned long) __builtin_return_address(0);
 
@@ -53,7 +47,7 @@ static inline void _raw_spin_lock(spinlock_t *lp)
 		_raw_spin_lock_wait(lp, pc);
 }
 
-static inline int _raw_spin_trylock(spinlock_t *lp)
+static inline int __raw_spin_trylock(raw_spinlock_t *lp)
 {
 	unsigned long pc = 1 | (unsigned long) __builtin_return_address(0);
 
@@ -62,7 +56,7 @@ static inline int _raw_spin_trylock(spinlock_t *lp)
 	return _raw_spin_trylock_retry(lp, pc);
 }
 
-static inline void _raw_spin_unlock(spinlock_t *lp)
+static inline void __raw_spin_unlock(raw_spinlock_t *lp)
 {
 	_raw_compare_and_swap(&lp->lock, lp->lock, 0);
 }
@@ -77,36 +71,25 @@ static inline void _raw_spin_unlock(spinlock_t *lp)
  * irq-safe write-lock, but readers can get non-irqsafe
  * read-locks.
  */
-typedef struct {
-	volatile unsigned int lock;
-	volatile unsigned long owner_pc;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
-
-#define rwlock_init(x)	do { *(x) = RW_LOCK_UNLOCKED; } while(0)
 
 /**
  * read_can_lock - would read_trylock() succeed?
  * @lock: the rwlock in question.
  */
-#define read_can_lock(x) ((int)(x)->lock >= 0)
+#define __raw_read_can_lock(x) ((int)(x)->lock >= 0)
 
 /**
  * write_can_lock - would write_trylock() succeed?
  * @lock: the rwlock in question.
  */
-#define write_can_lock(x) ((x)->lock == 0)
+#define __raw_write_can_lock(x) ((x)->lock == 0)
 
-extern void _raw_read_lock_wait(rwlock_t *lp);
-extern int _raw_read_trylock_retry(rwlock_t *lp);
-extern void _raw_write_lock_wait(rwlock_t *lp);
-extern int _raw_write_trylock_retry(rwlock_t *lp);
+extern void _raw_read_lock_wait(raw_rwlock_t *lp);
+extern int _raw_read_trylock_retry(raw_rwlock_t *lp);
+extern void _raw_write_lock_wait(raw_rwlock_t *lp);
+extern int _raw_write_trylock_retry(raw_rwlock_t *lp);
 
-static inline void _raw_read_lock(rwlock_t *rw)
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
 	unsigned int old;
 	old = rw->lock & 0x7fffffffU;
@@ -114,7 +97,7 @@ static inline void _raw_read_lock(rwlock_t *rw)
 		_raw_read_lock_wait(rw);
 }
 
-static inline void _raw_read_unlock(rwlock_t *rw)
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	unsigned int old, cmp;
 
@@ -125,18 +108,18 @@ static inline void _raw_read_unlock(rwlock_t *rw)
 	} while (cmp != old);
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
 	if (unlikely(_raw_compare_and_swap(&rw->lock, 0, 0x80000000) != 0))
 		_raw_write_lock_wait(rw);
 }
 
-static inline void _raw_write_unlock(rwlock_t *rw)
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	_raw_compare_and_swap(&rw->lock, 0x80000000, 0);
 }
 
-static inline int _raw_read_trylock(rwlock_t *rw)
+static inline int __raw_read_trylock(raw_rwlock_t *rw)
 {
 	unsigned int old;
 	old = rw->lock & 0x7fffffffU;
@@ -145,7 +128,7 @@ static inline int _raw_read_trylock(rwlock_t *rw)
 	return _raw_read_trylock_retry(rw);
 }
 
-static inline int _raw_write_trylock(rwlock_t *rw)
+static inline int __raw_write_trylock(raw_rwlock_t *rw)
 {
 	if (likely(_raw_compare_and_swap(&rw->lock, 0, 0x80000000) == 0))
 		return 1;
diff --git a/include/asm-s390/spinlock_types.h b/include/asm-s390/spinlock_types.h
new file mode 100644
index 00000000000..f79a2216204
--- /dev/null
+++ b/include/asm-s390/spinlock_types.h
@@ -0,0 +1,21 @@
+#ifndef __ASM_SPINLOCK_TYPES_H
+#define __ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int lock;
+} __attribute__ ((aligned (4))) raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile unsigned int lock;
+	volatile unsigned int owner_pc;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0, 0 }
+
+#endif
diff --git a/include/asm-sh/spinlock.h b/include/asm-sh/spinlock.h
index e770b55649e..846322d4c35 100644
--- a/include/asm-sh/spinlock.h
+++ b/include/asm-sh/spinlock.h
@@ -15,20 +15,11 @@
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  */
-typedef struct {
-	volatile unsigned long lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
 
-#define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 }
-
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-
-#define spin_is_locked(x)	((x)->lock != 0)
-#define spin_unlock_wait(x)	do { barrier(); } while (spin_is_locked(x))
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define __raw_spin_is_locked(x)	((x)->lock != 0)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+#define __raw_spin_unlock_wait(x) \
+	do { cpu_relax(); } while (__raw_spin_is_locked(x))
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
@@ -36,7 +27,7 @@ typedef struct {
  *
  * We make no fairness assumptions.  They have a cost.
  */
-static inline void _raw_spin_lock(spinlock_t *lock)
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	__asm__ __volatile__ (
 		"1:\n\t"
@@ -49,14 +40,14 @@ static inline void _raw_spin_lock(spinlock_t *lock)
 	);
 }
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 	assert_spin_locked(lock);
 
 	lock->lock = 0;
 }
 
-#define _raw_spin_trylock(x) (!test_and_set_bit(0, &(x)->lock))
+#define __raw_spin_trylock(x) (!test_and_set_bit(0, &(x)->lock))
 
 /*
  * Read-write spinlocks, allowing multiple readers but only one writer.
@@ -66,51 +57,40 @@ static inline void _raw_spin_unlock(spinlock_t *lock)
  * needs to get a irq-safe write-lock, but readers can get non-irqsafe
  * read-locks.
  */
-typedef struct {
-	spinlock_t lock;
-	atomic_t counter;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RW_LOCK_BIAS		0x01000000
-#define RW_LOCK_UNLOCKED	(rwlock_t) { { 0 }, { RW_LOCK_BIAS } }
-#define rwlock_init(x)		do { *(x) = RW_LOCK_UNLOCKED; } while (0)
-
-static inline void _raw_read_lock(rwlock_t *rw)
+
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
-	_raw_spin_lock(&rw->lock);
+	__raw_spin_lock(&rw->lock);
 
 	atomic_inc(&rw->counter);
 
-	_raw_spin_unlock(&rw->lock);
+	__raw_spin_unlock(&rw->lock);
 }
 
-static inline void _raw_read_unlock(rwlock_t *rw)
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
 {
-	_raw_spin_lock(&rw->lock);
+	__raw_spin_lock(&rw->lock);
 
 	atomic_dec(&rw->counter);
 
-	_raw_spin_unlock(&rw->lock);
+	__raw_spin_unlock(&rw->lock);
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
-	_raw_spin_lock(&rw->lock);
+	__raw_spin_lock(&rw->lock);
 	atomic_set(&rw->counter, -1);
 }
 
-static inline void _raw_write_unlock(rwlock_t *rw)
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	atomic_set(&rw->counter, 0);
-	_raw_spin_unlock(&rw->lock);
+	__raw_spin_unlock(&rw->lock);
 }
 
-#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+#define __raw_read_trylock(lock) generic__raw_read_trylock(lock)
 
-static inline int _raw_write_trylock(rwlock_t *rw)
+static inline int __raw_write_trylock(raw_rwlock_t *rw)
 {
 	if (atomic_sub_and_test(RW_LOCK_BIAS, &rw->counter))
 		return 1;
@@ -121,4 +101,3 @@ static inline int _raw_write_trylock(rwlock_t *rw)
 }
 
 #endif /* __ASM_SH_SPINLOCK_H */
-
diff --git a/include/asm-sh/spinlock_types.h b/include/asm-sh/spinlock_types.h
new file mode 100644
index 00000000000..8c41b6c3aac
--- /dev/null
+++ b/include/asm-sh/spinlock_types.h
@@ -0,0 +1,22 @@
+#ifndef __ASM_SH_SPINLOCK_TYPES_H
+#define __ASM_SH_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned long lock;
+} raw_spinlock_t;
+
+#define __SPIN_LOCK_UNLOCKED		{ 0 }
+
+typedef struct {
+	raw_spinlock_t lock;
+	atomic_t counter;
+} raw_rwlock_t;
+
+#define RW_LOCK_BIAS			0x01000000
+#define __RAW_RW_LOCK_UNLOCKED		{ { 0 }, { RW_LOCK_BIAS } }
+
+#endif
diff --git a/include/asm-sparc/spinlock.h b/include/asm-sparc/spinlock.h
index 0cbd87ad491..111727a2bb4 100644
--- a/include/asm-sparc/spinlock.h
+++ b/include/asm-sparc/spinlock.h
@@ -12,96 +12,12 @@
 
 #include <asm/psr.h>
 
-#ifdef CONFIG_DEBUG_SPINLOCK
-struct _spinlock_debug {
-	unsigned char lock;
-	unsigned long owner_pc;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-};
-typedef struct _spinlock_debug spinlock_t;
-
-#define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0, 0 }
-#define spin_lock_init(lp)	do { *(lp)= SPIN_LOCK_UNLOCKED; } while(0)
-#define spin_is_locked(lp)  (*((volatile unsigned char *)(&((lp)->lock))) != 0)
-#define spin_unlock_wait(lp)	do { barrier(); } while(*(volatile unsigned char *)(&(lp)->lock))
-
-extern void _do_spin_lock(spinlock_t *lock, char *str);
-extern int _spin_trylock(spinlock_t *lock);
-extern void _do_spin_unlock(spinlock_t *lock);
-
-#define _raw_spin_trylock(lp)	_spin_trylock(lp)
-#define _raw_spin_lock(lock)	_do_spin_lock(lock, "spin_lock")
-#define _raw_spin_unlock(lock)	_do_spin_unlock(lock)
-
-struct _rwlock_debug {
-	volatile unsigned int lock;
-	unsigned long owner_pc;
-	unsigned long reader_pc[NR_CPUS];
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-};
-typedef struct _rwlock_debug rwlock_t;
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, {0} }
-
-#define rwlock_init(lp)	do { *(lp)= RW_LOCK_UNLOCKED; } while(0)
-
-extern void _do_read_lock(rwlock_t *rw, char *str);
-extern void _do_read_unlock(rwlock_t *rw, char *str);
-extern void _do_write_lock(rwlock_t *rw, char *str);
-extern void _do_write_unlock(rwlock_t *rw);
-
-#define _raw_read_lock(lock)	\
-do {	unsigned long flags; \
-	local_irq_save(flags); \
-	_do_read_lock(lock, "read_lock"); \
-	local_irq_restore(flags); \
-} while(0)
-
-#define _raw_read_unlock(lock) \
-do {	unsigned long flags; \
-	local_irq_save(flags); \
-	_do_read_unlock(lock, "read_unlock"); \
-	local_irq_restore(flags); \
-} while(0)
-
-#define _raw_write_lock(lock) \
-do {	unsigned long flags; \
-	local_irq_save(flags); \
-	_do_write_lock(lock, "write_lock"); \
-	local_irq_restore(flags); \
-} while(0)
-
-#define _raw_write_unlock(lock) \
-do {	unsigned long flags; \
-	local_irq_save(flags); \
-	_do_write_unlock(lock); \
-	local_irq_restore(flags); \
-} while(0)
-
-#else /* !CONFIG_DEBUG_SPINLOCK */
-
-typedef struct {
-	unsigned char lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
-#define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 }
-
-#define spin_lock_init(lock)   (*((unsigned char *)(lock)) = 0)
-#define spin_is_locked(lock)    (*((volatile unsigned char *)(lock)) != 0)
+#define __raw_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0)
 
-#define spin_unlock_wait(lock) \
-do { \
-	barrier(); \
-} while(*((volatile unsigned char *)lock))
+#define __raw_spin_unlock_wait(lock) \
+	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
 
-extern __inline__ void _raw_spin_lock(spinlock_t *lock)
+extern __inline__ void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
 	"\n1:\n\t"
@@ -121,7 +37,7 @@ extern __inline__ void _raw_spin_lock(spinlock_t *lock)
 	: "g2", "memory", "cc");
 }
 
-extern __inline__ int _raw_spin_trylock(spinlock_t *lock)
+extern __inline__ int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	unsigned int result;
 	__asm__ __volatile__("ldstub [%1], %0"
@@ -131,7 +47,7 @@ extern __inline__ int _raw_spin_trylock(spinlock_t *lock)
 	return (result == 0);
 }
 
-extern __inline__ void _raw_spin_unlock(spinlock_t *lock)
+extern __inline__ void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 	__asm__ __volatile__("stb %%g0, [%0]" : : "r" (lock) : "memory");
 }
@@ -147,23 +63,11 @@ extern __inline__ void _raw_spin_unlock(spinlock_t *lock)
  *
  * XXX This might create some problems with my dual spinlock
  * XXX scheme, deadlocks etc. -DaveM
- */
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
-
-#define rwlock_init(lp)	do { *(lp)= RW_LOCK_UNLOCKED; } while(0)
-
-
-/* Sort of like atomic_t's on Sparc, but even more clever.
+ *
+ * Sort of like atomic_t's on Sparc, but even more clever.
  *
  *	------------------------------------
- *	| 24-bit counter           | wlock |  rwlock_t
+ *	| 24-bit counter           | wlock |  raw_rwlock_t
  *	------------------------------------
  *	 31                       8 7     0
  *
@@ -174,9 +78,9 @@ typedef struct {
  *
  * Unfortunately this scheme limits us to ~16,000,000 cpus.
  */
-extern __inline__ void _read_lock(rwlock_t *rw)
+extern __inline__ void __read_lock(raw_rwlock_t *rw)
 {
-	register rwlock_t *lp asm("g1");
+	register raw_rwlock_t *lp asm("g1");
 	lp = rw;
 	__asm__ __volatile__(
 	"mov	%%o7, %%g4\n\t"
@@ -187,16 +91,16 @@ extern __inline__ void _read_lock(rwlock_t *rw)
 	: "g2", "g4", "memory", "cc");
 }
 
-#define _raw_read_lock(lock) \
+#define __raw_read_lock(lock) \
 do {	unsigned long flags; \
 	local_irq_save(flags); \
-	_read_lock(lock); \
+	__raw_read_lock(lock); \
 	local_irq_restore(flags); \
 } while(0)
 
-extern __inline__ void _read_unlock(rwlock_t *rw)
+extern __inline__ void __read_unlock(raw_rwlock_t *rw)
 {
-	register rwlock_t *lp asm("g1");
+	register raw_rwlock_t *lp asm("g1");
 	lp = rw;
 	__asm__ __volatile__(
 	"mov	%%o7, %%g4\n\t"
@@ -207,16 +111,16 @@ extern __inline__ void _read_unlock(rwlock_t *rw)
 	: "g2", "g4", "memory", "cc");
 }
 
-#define _raw_read_unlock(lock) \
+#define __raw_read_unlock(lock) \
 do {	unsigned long flags; \
 	local_irq_save(flags); \
-	_read_unlock(lock); \
+	__raw_read_unlock(lock); \
 	local_irq_restore(flags); \
 } while(0)
 
-extern __inline__ void _raw_write_lock(rwlock_t *rw)
+extern __inline__ void __raw_write_lock(raw_rwlock_t *rw)
 {
-	register rwlock_t *lp asm("g1");
+	register raw_rwlock_t *lp asm("g1");
 	lp = rw;
 	__asm__ __volatile__(
 	"mov	%%o7, %%g4\n\t"
@@ -227,11 +131,9 @@ extern __inline__ void _raw_write_lock(rwlock_t *rw)
 	: "g2", "g4", "memory", "cc");
 }
 
-#define _raw_write_unlock(rw)	do { (rw)->lock = 0; } while(0)
-
-#endif /* CONFIG_DEBUG_SPINLOCK */
+#define __raw_write_unlock(rw)	do { (rw)->lock = 0; } while(0)
 
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 
 #endif /* !(__ASSEMBLY__) */
 
diff --git a/include/asm-sparc/spinlock_types.h b/include/asm-sparc/spinlock_types.h
new file mode 100644
index 00000000000..0a0fb116c4e
--- /dev/null
+++ b/include/asm-sparc/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef __SPARC_SPINLOCK_TYPES_H
+#define __SPARC_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	unsigned char lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0 }
+
+#endif
diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h
index a02c4370eb4..ec85d12d73b 100644
--- a/include/asm-sparc64/spinlock.h
+++ b/include/asm-sparc64/spinlock.h
@@ -29,24 +29,13 @@
  * must be pre-V9 branches.
  */
 
-#ifndef CONFIG_DEBUG_SPINLOCK
+#define __raw_spin_is_locked(lp)	((lp)->lock != 0)
 
-typedef struct {
-	volatile unsigned char lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-#define SPIN_LOCK_UNLOCKED	(spinlock_t) {0,}
+#define __raw_spin_unlock_wait(lp)	\
+	do {	rmb();			\
+	} while((lp)->lock)
 
-#define spin_lock_init(lp)	do { *(lp)= SPIN_LOCK_UNLOCKED; } while(0)
-#define spin_is_locked(lp)  ((lp)->lock != 0)
-
-#define spin_unlock_wait(lp)	\
-do {	rmb();			\
-} while((lp)->lock)
-
-static inline void _raw_spin_lock(spinlock_t *lock)
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	unsigned long tmp;
 
@@ -67,7 +56,7 @@ static inline void _raw_spin_lock(spinlock_t *lock)
 	: "memory");
 }
 
-static inline int _raw_spin_trylock(spinlock_t *lock)
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	unsigned long result;
 
@@ -81,7 +70,7 @@ static inline int _raw_spin_trylock(spinlock_t *lock)
 	return (result == 0UL);
 }
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
 "	membar		#StoreStore | #LoadStore\n"
@@ -91,7 +80,7 @@ static inline void _raw_spin_unlock(spinlock_t *lock)
 	: "memory");
 }
 
-static inline void _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags)
+static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
 {
 	unsigned long tmp1, tmp2;
 
@@ -115,51 +104,9 @@ static inline void _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags)
 	: "memory");
 }
 
-#else /* !(CONFIG_DEBUG_SPINLOCK) */
-
-typedef struct {
-	volatile unsigned char lock;
-	unsigned int owner_pc, owner_cpu;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0, 0, 0xff }
-#define spin_lock_init(lp)	do { *(lp)= SPIN_LOCK_UNLOCKED; } while(0)
-#define spin_is_locked(__lock)	((__lock)->lock != 0)
-#define spin_unlock_wait(__lock)	\
-do { \
-	rmb(); \
-} while((__lock)->lock)
-
-extern void _do_spin_lock(spinlock_t *lock, char *str, unsigned long caller);
-extern void _do_spin_unlock(spinlock_t *lock);
-extern int _do_spin_trylock(spinlock_t *lock, unsigned long caller);
-
-#define _raw_spin_trylock(lp)	\
-	_do_spin_trylock(lp, (unsigned long) __builtin_return_address(0))
-#define _raw_spin_lock(lock)	\
-	_do_spin_lock(lock, "spin_lock", \
-		      (unsigned long) __builtin_return_address(0))
-#define _raw_spin_unlock(lock)	_do_spin_unlock(lock)
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
-
-#endif /* CONFIG_DEBUG_SPINLOCK */
-
 /* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
 
-#ifndef CONFIG_DEBUG_SPINLOCK
-
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-#define RW_LOCK_UNLOCKED	(rwlock_t) {0,}
-#define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0)
-
-static void inline __read_lock(rwlock_t *lock)
+static void inline __read_lock(raw_rwlock_t *lock)
 {
 	unsigned long tmp1, tmp2;
 
@@ -184,7 +131,7 @@ static void inline __read_lock(rwlock_t *lock)
 	: "memory");
 }
 
-static void inline __read_unlock(rwlock_t *lock)
+static void inline __read_unlock(raw_rwlock_t *lock)
 {
 	unsigned long tmp1, tmp2;
 
@@ -201,7 +148,7 @@ static void inline __read_unlock(rwlock_t *lock)
 	: "memory");
 }
 
-static void inline __write_lock(rwlock_t *lock)
+static void inline __write_lock(raw_rwlock_t *lock)
 {
 	unsigned long mask, tmp1, tmp2;
 
@@ -228,7 +175,7 @@ static void inline __write_lock(rwlock_t *lock)
 	: "memory");
 }
 
-static void inline __write_unlock(rwlock_t *lock)
+static void inline __write_unlock(raw_rwlock_t *lock)
 {
 	__asm__ __volatile__(
 "	membar		#LoadStore | #StoreStore\n"
@@ -238,7 +185,7 @@ static void inline __write_unlock(rwlock_t *lock)
 	: "memory");
 }
 
-static int inline __write_trylock(rwlock_t *lock)
+static int inline __write_trylock(raw_rwlock_t *lock)
 {
 	unsigned long mask, tmp1, tmp2, result;
 
@@ -263,78 +210,15 @@ static int inline __write_trylock(rwlock_t *lock)
 	return result;
 }
 
-#define _raw_read_lock(p)	__read_lock(p)
-#define _raw_read_unlock(p)	__read_unlock(p)
-#define _raw_write_lock(p)	__write_lock(p)
-#define _raw_write_unlock(p)	__write_unlock(p)
-#define _raw_write_trylock(p)	__write_trylock(p)
-
-#else /* !(CONFIG_DEBUG_SPINLOCK) */
-
-typedef struct {
-	volatile unsigned long lock;
-	unsigned int writer_pc, writer_cpu;
-	unsigned int reader_pc[NR_CPUS];
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-#define RW_LOCK_UNLOCKED	(rwlock_t) { 0, 0, 0xff, { } }
-#define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0)
-
-extern void _do_read_lock(rwlock_t *rw, char *str, unsigned long caller);
-extern void _do_read_unlock(rwlock_t *rw, char *str, unsigned long caller);
-extern void _do_write_lock(rwlock_t *rw, char *str, unsigned long caller);
-extern void _do_write_unlock(rwlock_t *rw, unsigned long caller);
-extern int _do_write_trylock(rwlock_t *rw, char *str, unsigned long caller);
-
-#define _raw_read_lock(lock) \
-do {	unsigned long flags; \
-	local_irq_save(flags); \
-	_do_read_lock(lock, "read_lock", \
-		      (unsigned long) __builtin_return_address(0)); \
-	local_irq_restore(flags); \
-} while(0)
-
-#define _raw_read_unlock(lock) \
-do {	unsigned long flags; \
-	local_irq_save(flags); \
-	_do_read_unlock(lock, "read_unlock", \
-		      (unsigned long) __builtin_return_address(0)); \
-	local_irq_restore(flags); \
-} while(0)
-
-#define _raw_write_lock(lock) \
-do {	unsigned long flags; \
-	local_irq_save(flags); \
-	_do_write_lock(lock, "write_lock", \
-		      (unsigned long) __builtin_return_address(0)); \
-	local_irq_restore(flags); \
-} while(0)
-
-#define _raw_write_unlock(lock) \
-do {	unsigned long flags; \
-	local_irq_save(flags); \
-	_do_write_unlock(lock, \
-		      (unsigned long) __builtin_return_address(0)); \
-	local_irq_restore(flags); \
-} while(0)
-
-#define _raw_write_trylock(lock) \
-({	unsigned long flags; \
-	int val; \
-	local_irq_save(flags); \
-	val = _do_write_trylock(lock, "write_trylock", \
-				(unsigned long) __builtin_return_address(0)); \
-	local_irq_restore(flags); \
-	val; \
-})
-
-#endif /* CONFIG_DEBUG_SPINLOCK */
-
-#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
-#define read_can_lock(rw)	(!((rw)->lock & 0x80000000UL))
-#define write_can_lock(rw)	(!(rw)->lock)
+#define __raw_read_lock(p)	__read_lock(p)
+#define __raw_read_unlock(p)	__read_unlock(p)
+#define __raw_write_lock(p)	__write_lock(p)
+#define __raw_write_unlock(p)	__write_unlock(p)
+#define __raw_write_trylock(p)	__write_trylock(p)
+
+#define __raw_read_trylock(lock)	generic__raw_read_trylock(lock)
+#define __raw_read_can_lock(rw)		(!((rw)->lock & 0x80000000UL))
+#define __raw_write_can_lock(rw)	(!(rw)->lock)
 
 #endif /* !(__ASSEMBLY__) */
 
diff --git a/include/asm-sparc64/spinlock_types.h b/include/asm-sparc64/spinlock_types.h
new file mode 100644
index 00000000000..e128112a0d7
--- /dev/null
+++ b/include/asm-sparc64/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef __SPARC64_SPINLOCK_TYPES_H
+#define __SPARC64_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned char lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ 0 }
+
+#endif
diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index 5aeb57a3baa..69636831ad2 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -6,47 +6,21 @@
 #include <asm/page.h>
 #include <linux/config.h>
 
-extern int printk(const char * fmt, ...)
-	__attribute__ ((format (printf, 1, 2)));
-
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
- */
-
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
-#define SPINLOCK_MAGIC	0xdead4ead
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define SPINLOCK_MAGIC_INIT	, SPINLOCK_MAGIC
-#else
-#define SPINLOCK_MAGIC_INIT	/* */
-#endif
-
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
-
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-
-/*
+ *
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
  *
  * We make no fairness assumptions. They have a cost.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
  */
 
-#define spin_is_locked(x)	(*(volatile signed char *)(&(x)->lock) <= 0)
-#define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define __raw_spin_is_locked(x) \
+		(*(volatile signed char *)(&(x)->slock) <= 0)
 
-#define spin_lock_string \
+#define __raw_spin_lock_string \
 	"\n1:\t" \
 	"lock ; decb %0\n\t" \
 	"js 2f\n" \
@@ -58,74 +32,40 @@ typedef struct {
 	"jmp 1b\n" \
 	LOCK_SECTION_END
 
-/*
- * This works. Despite all the confusion.
- * (except on PPro SMP or if we are using OOSTORE)
- * (PPro errata 66, 92)
- */
- 
-#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
-
-#define spin_unlock_string \
+#define __raw_spin_unlock_string \
 	"movb $1,%0" \
-		:"=m" (lock->lock) : : "memory"
-
-
-static inline void _raw_spin_unlock(spinlock_t *lock)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	assert_spin_locked(lock);
-#endif
-	__asm__ __volatile__(
-		spin_unlock_string
-	);
-}
-
-#else
-
-#define spin_unlock_string \
-	"xchgb %b0, %1" \
-		:"=q" (oldval), "=m" (lock->lock) \
-		:"0" (oldval) : "memory"
+		:"=m" (lock->slock) : : "memory"
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
-	char oldval = 1;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	assert_spin_locked(lock);
-#endif
 	__asm__ __volatile__(
-		spin_unlock_string
-	);
+		__raw_spin_lock_string
+		:"=m" (lock->slock) : : "memory");
 }
 
-#endif
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 
-static inline int _raw_spin_trylock(spinlock_t *lock)
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	char oldval;
+
 	__asm__ __volatile__(
 		"xchgb %b0,%1"
-		:"=q" (oldval), "=m" (lock->lock)
+		:"=q" (oldval), "=m" (lock->slock)
 		:"0" (0) : "memory");
+
 	return oldval > 0;
 }
 
-static inline void _raw_spin_lock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK
-	if (lock->magic != SPINLOCK_MAGIC) {
-		printk("eip: %p\n", __builtin_return_address(0));
-		BUG();
-	}
-#endif
 	__asm__ __volatile__(
-		spin_lock_string
-		:"=m" (lock->lock) : : "memory");
+		__raw_spin_unlock_string
+	);
 }
 
+#define __raw_spin_unlock_wait(lock) \
+	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
 
 /*
  * Read-write spinlocks, allowing multiple readers
@@ -136,33 +76,7 @@ static inline void _raw_spin_lock(spinlock_t *lock)
  * can "mix" irq-safe locks - any writer needs to get a
  * irq-safe write-lock, but readers can get non-irqsafe
  * read-locks.
- */
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
-#define RWLOCK_MAGIC	0xdeaf1eed
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define RWLOCK_MAGIC_INIT	, RWLOCK_MAGIC
-#else
-#define RWLOCK_MAGIC_INIT	/* */
-#endif
-
-#define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
-
-#define rwlock_init(x)	do { *(x) = RW_LOCK_UNLOCKED; } while(0)
-
-#define read_can_lock(x)	((int)(x)->lock > 0)
-#define write_can_lock(x)	((x)->lock == RW_LOCK_BIAS)
-
-/*
+ *
  * On x86, we implement read-write locks as a 32-bit counter
  * with the high bit (sign) being the "contended" bit.
  *
@@ -170,29 +84,24 @@ typedef struct {
  *
  * Changed to use the same technique as rw semaphores.  See
  * semaphore.h for details.  -ben
+ *
+ * the helpers are in arch/i386/kernel/semaphore.c
  */
-/* the spinlock helpers are in arch/i386/kernel/semaphore.c */
 
-static inline void _raw_read_lock(rwlock_t *rw)
+#define __raw_read_can_lock(x)		((int)(x)->lock > 0)
+#define __raw_write_can_lock(x)		((x)->lock == RW_LOCK_BIAS)
+
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(rw->magic != RWLOCK_MAGIC);
-#endif
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(rw->magic != RWLOCK_MAGIC);
-#endif
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-#define _raw_read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
-#define _raw_write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
-
-static inline int _raw_read_trylock(rwlock_t *lock)
+static inline int __raw_read_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	atomic_dec(count);
@@ -202,7 +111,7 @@ static inline int _raw_read_trylock(rwlock_t *lock)
 	return 0;
 }
 
-static inline int _raw_write_trylock(rwlock_t *lock)
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
@@ -211,4 +120,15 @@ static inline int _raw_write_trylock(rwlock_t *lock)
 	return 0;
 }
 
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
+{
+	asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
+}
+
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
+{
+	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0"
+				: "=m" (rw->lock) : : "memory");
+}
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-x86_64/spinlock_types.h b/include/asm-x86_64/spinlock_types.h
new file mode 100644
index 00000000000..59efe849f35
--- /dev/null
+++ b/include/asm-x86_64/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef __ASM_SPINLOCK_TYPES_H
+#define __ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+	volatile unsigned int slock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 1 }
+
+typedef struct {
+	volatile unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
+
+#endif
diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
new file mode 100644
index 00000000000..6b20af0bbb7
--- /dev/null
+++ b/include/linux/bit_spinlock.h
@@ -0,0 +1,77 @@
+#ifndef __LINUX_BIT_SPINLOCK_H
+#define __LINUX_BIT_SPINLOCK_H
+
+/*
+ *  bit-based spin_lock()
+ *
+ * Don't use this unless you really need to: spin_lock() and spin_unlock()
+ * are significantly faster.
+ */
+static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+{
+	/*
+	 * Assuming the lock is uncontended, this never enters
+	 * the body of the outer loop. If it is contended, then
+	 * within the inner loop a non-atomic test is used to
+	 * busywait with less bus contention for a good time to
+	 * attempt to acquire the lock bit.
+	 */
+	preempt_disable();
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	while (test_and_set_bit(bitnum, addr)) {
+		while (test_bit(bitnum, addr)) {
+			preempt_enable();
+			cpu_relax();
+			preempt_disable();
+		}
+	}
+#endif
+	__acquire(bitlock);
+}
+
+/*
+ * Return true if it was acquired
+ */
+static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+{
+	preempt_disable();
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	if (test_and_set_bit(bitnum, addr)) {
+		preempt_enable();
+		return 0;
+	}
+#endif
+	__acquire(bitlock);
+	return 1;
+}
+
+/*
+ *  bit-based spin_unlock()
+ */
+static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	BUG_ON(!test_bit(bitnum, addr));
+	smp_mb__before_clear_bit();
+	clear_bit(bitnum, addr);
+#endif
+	preempt_enable();
+	__release(bitlock);
+}
+
+/*
+ * Return true if the lock is held.
+ */
+static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	return test_bit(bitnum, addr);
+#elif defined CONFIG_PREEMPT
+	return preempt_count();
+#else
+	return 1;
+#endif
+}
+
+#endif /* __LINUX_BIT_SPINLOCK_H */
+
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 84321a4cac9..de097269bd7 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -28,6 +28,7 @@
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
+#include <linux/bit_spinlock.h>
 #include <asm/semaphore.h>
 #endif
 
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index d6ba068719b..cdc99a27840 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -2,7 +2,48 @@
 #define __LINUX_SPINLOCK_H
 
 /*
- * include/linux/spinlock.h - generic locking declarations
+ * include/linux/spinlock.h - generic spinlock/rwlock declarations
+ *
+ * here's the role of the various spinlock/rwlock related include files:
+ *
+ * on SMP builds:
+ *
+ *  asm/spinlock_types.h: contains the raw_spinlock_t/raw_rwlock_t and the
+ *                        initializers
+ *
+ *  linux/spinlock_types.h:
+ *                        defines the generic type and initializers
+ *
+ *  asm/spinlock.h:       contains the __raw_spin_*()/etc. lowlevel
+ *                        implementations, mostly inline assembly code
+ *
+ *   (also included on UP-debug builds:)
+ *
+ *  linux/spinlock_api_smp.h:
+ *                        contains the prototypes for the _spin_*() APIs.
+ *
+ *  linux/spinlock.h:     builds the final spin_*() APIs.
+ *
+ * on UP builds:
+ *
+ *  linux/spinlock_type_up.h:
+ *                        contains the generic, simplified UP spinlock type.
+ *                        (which is an empty structure on non-debug builds)
+ *
+ *  linux/spinlock_types.h:
+ *                        defines the generic type and initializers
+ *
+ *  linux/spinlock_up.h:
+ *                        contains the __raw_spin_*()/etc. version of UP
+ *                        builds. (which are NOPs on non-debug, non-preempt
+ *                        builds)
+ *
+ *   (included on UP-non-debug builds:)
+ *
+ *  linux/spinlock_api_up.h:
+ *                        builds the _spin_*() APIs.
+ *
+ *  linux/spinlock.h:     builds the final spin_*() APIs.
  */
 
 #include <linux/config.h>
@@ -13,7 +54,6 @@
 #include <linux/kernel.h>
 #include <linux/stringify.h>
 
-#include <asm/processor.h>	/* for cpu relax */
 #include <asm/system.h>
 
 /*
@@ -35,423 +75,84 @@
 #define __lockfunc fastcall __attribute__((section(".spinlock.text")))
 
 /*
- * If CONFIG_SMP is set, pull in the _raw_* definitions
+ * Pull the raw_spinlock_t and raw_rwlock_t definitions:
  */
-#ifdef CONFIG_SMP
-
-#define assert_spin_locked(x)	BUG_ON(!spin_is_locked(x))
-#include <asm/spinlock.h>
-
-int __lockfunc _spin_trylock(spinlock_t *lock);
-int __lockfunc _read_trylock(rwlock_t *lock);
-int __lockfunc _write_trylock(rwlock_t *lock);
-
-void __lockfunc _spin_lock(spinlock_t *lock)	__acquires(spinlock_t);
-void __lockfunc _read_lock(rwlock_t *lock)	__acquires(rwlock_t);
-void __lockfunc _write_lock(rwlock_t *lock)	__acquires(rwlock_t);
-
-void __lockfunc _spin_unlock(spinlock_t *lock)	__releases(spinlock_t);
-void __lockfunc _read_unlock(rwlock_t *lock)	__releases(rwlock_t);
-void __lockfunc _write_unlock(rwlock_t *lock)	__releases(rwlock_t);
-
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)	__acquires(spinlock_t);
-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)	__acquires(rwlock_t);
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)	__acquires(rwlock_t);
-
-void __lockfunc _spin_lock_irq(spinlock_t *lock)	__acquires(spinlock_t);
-void __lockfunc _spin_lock_bh(spinlock_t *lock)		__acquires(spinlock_t);
-void __lockfunc _read_lock_irq(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _read_lock_bh(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _write_lock_irq(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _write_lock_bh(rwlock_t *lock)		__acquires(rwlock_t);
-
-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)	__releases(spinlock_t);
-void __lockfunc _spin_unlock_irq(spinlock_t *lock)				__releases(spinlock_t);
-void __lockfunc _spin_unlock_bh(spinlock_t *lock)				__releases(spinlock_t);
-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)	__releases(rwlock_t);
-void __lockfunc _read_unlock_irq(rwlock_t *lock)				__releases(rwlock_t);
-void __lockfunc _read_unlock_bh(rwlock_t *lock)					__releases(rwlock_t);
-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)	__releases(rwlock_t);
-void __lockfunc _write_unlock_irq(rwlock_t *lock)				__releases(rwlock_t);
-void __lockfunc _write_unlock_bh(rwlock_t *lock)				__releases(rwlock_t);
-
-int __lockfunc _spin_trylock_bh(spinlock_t *lock);
-int __lockfunc generic_raw_read_trylock(rwlock_t *lock);
-int in_lock_functions(unsigned long addr);
-
-#else
+#include <linux/spinlock_types.h>
 
-#define in_lock_functions(ADDR) 0
+extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock);
 
-#if !defined(CONFIG_PREEMPT) && !defined(CONFIG_DEBUG_SPINLOCK)
-# define _atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic)
-# define ATOMIC_DEC_AND_LOCK
-#endif
-
-#ifdef CONFIG_DEBUG_SPINLOCK
- 
-#define SPINLOCK_MAGIC	0x1D244B3C
-typedef struct {
-	unsigned long magic;
-	volatile unsigned long lock;
-	volatile unsigned int babble;
-	const char *module;
-	char *owner;
-	int oline;
-} spinlock_t;
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0}
-
-#define spin_lock_init(x) \
-	do { \
-		(x)->magic = SPINLOCK_MAGIC; \
-		(x)->lock = 0; \
-		(x)->babble = 5; \
-		(x)->module = __FILE__; \
-		(x)->owner = NULL; \
-		(x)->oline = 0; \
-	} while (0)
-
-#define CHECK_LOCK(x) \
-	do { \
-	 	if ((x)->magic != SPINLOCK_MAGIC) { \
-			printk(KERN_ERR "%s:%d: spin_is_locked on uninitialized spinlock %p.\n", \
-					__FILE__, __LINE__, (x)); \
-		} \
-	} while(0)
-
-#define _raw_spin_lock(x)		\
-	do { \
-	 	CHECK_LOCK(x); \
-		if ((x)->lock&&(x)->babble) { \
-			(x)->babble--; \
-			printk("%s:%d: spin_lock(%s:%p) already locked by %s/%d\n", \
-					__FILE__,__LINE__, (x)->module, \
-					(x), (x)->owner, (x)->oline); \
-		} \
-		(x)->lock = 1; \
-		(x)->owner = __FILE__; \
-		(x)->oline = __LINE__; \
-	} while (0)
-
-/* without debugging, spin_is_locked on UP always says
- * FALSE. --> printk if already locked. */
-#define spin_is_locked(x) \
-	({ \
-	 	CHECK_LOCK(x); \
-		if ((x)->lock&&(x)->babble) { \
-			(x)->babble--; \
-			printk("%s:%d: spin_is_locked(%s:%p) already locked by %s/%d\n", \
-					__FILE__,__LINE__, (x)->module, \
-					(x), (x)->owner, (x)->oline); \
-		} \
-		0; \
-	})
-
-/* with debugging, assert_spin_locked() on UP does check
- * the lock value properly */
-#define assert_spin_locked(x) \
-	({ \
-		CHECK_LOCK(x); \
-		BUG_ON(!(x)->lock); \
-	})
-
-/* without debugging, spin_trylock on UP always says
- * TRUE. --> printk if already locked. */
-#define _raw_spin_trylock(x) \
-	({ \
-	 	CHECK_LOCK(x); \
-		if ((x)->lock&&(x)->babble) { \
-			(x)->babble--; \
-			printk("%s:%d: spin_trylock(%s:%p) already locked by %s/%d\n", \
-					__FILE__,__LINE__, (x)->module, \
-					(x), (x)->owner, (x)->oline); \
-		} \
-		(x)->lock = 1; \
-		(x)->owner = __FILE__; \
-		(x)->oline = __LINE__; \
-		1; \
-	})
-
-#define spin_unlock_wait(x)	\
-	do { \
-	 	CHECK_LOCK(x); \
-		if ((x)->lock&&(x)->babble) { \
-			(x)->babble--; \
-			printk("%s:%d: spin_unlock_wait(%s:%p) owned by %s/%d\n", \
-					__FILE__,__LINE__, (x)->module, (x), \
-					(x)->owner, (x)->oline); \
-		}\
-	} while (0)
-
-#define _raw_spin_unlock(x) \
-	do { \
-	 	CHECK_LOCK(x); \
-		if (!(x)->lock&&(x)->babble) { \
-			(x)->babble--; \
-			printk("%s:%d: spin_unlock(%s:%p) not locked\n", \
-					__FILE__,__LINE__, (x)->module, (x));\
-		} \
-		(x)->lock = 0; \
-	} while (0)
-#else
 /*
- * gcc versions before ~2.95 have a nasty bug with empty initializers.
+ * Pull the __raw*() functions/declarations (UP-nondebug doesnt need them):
  */
-#if (__GNUC__ > 2)
-  typedef struct { } spinlock_t;
-  #define SPIN_LOCK_UNLOCKED (spinlock_t) { }
+#if defined(CONFIG_SMP)
+# include <asm/spinlock.h>
 #else
-  typedef struct { int gcc_is_buggy; } spinlock_t;
-  #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
+# include <linux/spinlock_up.h>
 #endif
 
+#define spin_lock_init(lock)	do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0)
+#define rwlock_init(lock)	do { *(lock) = RW_LOCK_UNLOCKED; } while (0)
+
+#define spin_is_locked(lock)	__raw_spin_is_locked(&(lock)->raw_lock)
+
+/**
+ * spin_unlock_wait - wait until the spinlock gets unlocked
+ * @lock: the spinlock in question.
+ */
+#define spin_unlock_wait(lock)	__raw_spin_unlock_wait(&(lock)->raw_lock)
+
 /*
- * If CONFIG_SMP is unset, declare the _raw_* definitions as nops
+ * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
  */
-#define spin_lock_init(lock)	do { (void)(lock); } while(0)
-#define _raw_spin_lock(lock)	do { (void)(lock); } while(0)
-#define spin_is_locked(lock)	((void)(lock), 0)
-#define assert_spin_locked(lock)	do { (void)(lock); } while(0)
-#define _raw_spin_trylock(lock)	(((void)(lock), 1))
-#define spin_unlock_wait(lock)	(void)(lock)
-#define _raw_spin_unlock(lock) do { (void)(lock); } while(0)
-#endif /* CONFIG_DEBUG_SPINLOCK */
-
-/* RW spinlocks: No debug version */
-
-#if (__GNUC__ > 2)
-  typedef struct { } rwlock_t;
-  #define RW_LOCK_UNLOCKED (rwlock_t) { }
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+# include <linux/spinlock_api_smp.h>
 #else
-  typedef struct { int gcc_is_buggy; } rwlock_t;
-  #define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
+# include <linux/spinlock_api_up.h>
 #endif
 
-#define rwlock_init(lock)	do { (void)(lock); } while(0)
-#define _raw_read_lock(lock)	do { (void)(lock); } while(0)
-#define _raw_read_unlock(lock)	do { (void)(lock); } while(0)
-#define _raw_write_lock(lock)	do { (void)(lock); } while(0)
-#define _raw_write_unlock(lock)	do { (void)(lock); } while(0)
-#define read_can_lock(lock)	(((void)(lock), 1))
-#define write_can_lock(lock)	(((void)(lock), 1))
-#define _raw_read_trylock(lock) ({ (void)(lock); (1); })
-#define _raw_write_trylock(lock) ({ (void)(lock); (1); })
-
-#define _spin_trylock(lock)	({preempt_disable(); _raw_spin_trylock(lock) ? \
-				1 : ({preempt_enable(); 0;});})
-
-#define _read_trylock(lock)	({preempt_disable();_raw_read_trylock(lock) ? \
-				1 : ({preempt_enable(); 0;});})
-
-#define _write_trylock(lock)	({preempt_disable(); _raw_write_trylock(lock) ? \
-				1 : ({preempt_enable(); 0;});})
-
-#define _spin_trylock_bh(lock)	({preempt_disable(); local_bh_disable(); \
-				_raw_spin_trylock(lock) ? \
-				1 : ({preempt_enable_no_resched(); local_bh_enable(); 0;});})
-
-#define _spin_lock(lock)	\
-do { \
-	preempt_disable(); \
-	_raw_spin_lock(lock); \
-	__acquire(lock); \
-} while(0)
-
-#define _write_lock(lock) \
-do { \
-	preempt_disable(); \
-	_raw_write_lock(lock); \
-	__acquire(lock); \
-} while(0)
- 
-#define _read_lock(lock)	\
-do { \
-	preempt_disable(); \
-	_raw_read_lock(lock); \
-	__acquire(lock); \
-} while(0)
-
-#define _spin_unlock(lock) \
-do { \
-	_raw_spin_unlock(lock); \
-	preempt_enable(); \
-	__release(lock); \
-} while (0)
-
-#define _write_unlock(lock) \
-do { \
-	_raw_write_unlock(lock); \
-	preempt_enable(); \
-	__release(lock); \
-} while(0)
-
-#define _read_unlock(lock) \
-do { \
-	_raw_read_unlock(lock); \
-	preempt_enable(); \
-	__release(lock); \
-} while(0)
-
-#define _spin_lock_irqsave(lock, flags) \
-do {	\
-	local_irq_save(flags); \
-	preempt_disable(); \
-	_raw_spin_lock(lock); \
-	__acquire(lock); \
-} while (0)
-
-#define _spin_lock_irq(lock) \
-do { \
-	local_irq_disable(); \
-	preempt_disable(); \
-	_raw_spin_lock(lock); \
-	__acquire(lock); \
-} while (0)
-
-#define _spin_lock_bh(lock) \
-do { \
-	local_bh_disable(); \
-	preempt_disable(); \
-	_raw_spin_lock(lock); \
-	__acquire(lock); \
-} while (0)
-
-#define _read_lock_irqsave(lock, flags) \
-do {	\
-	local_irq_save(flags); \
-	preempt_disable(); \
-	_raw_read_lock(lock); \
-	__acquire(lock); \
-} while (0)
-
-#define _read_lock_irq(lock) \
-do { \
-	local_irq_disable(); \
-	preempt_disable(); \
-	_raw_read_lock(lock); \
-	__acquire(lock); \
-} while (0)
-
-#define _read_lock_bh(lock) \
-do { \
-	local_bh_disable(); \
-	preempt_disable(); \
-	_raw_read_lock(lock); \
-	__acquire(lock); \
-} while (0)
-
-#define _write_lock_irqsave(lock, flags) \
-do {	\
-	local_irq_save(flags); \
-	preempt_disable(); \
-	_raw_write_lock(lock); \
-	__acquire(lock); \
-} while (0)
+#ifdef CONFIG_DEBUG_SPINLOCK
+ extern void _raw_spin_lock(spinlock_t *lock);
+#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+ extern int _raw_spin_trylock(spinlock_t *lock);
+ extern void _raw_spin_unlock(spinlock_t *lock);
+
+ extern void _raw_read_lock(rwlock_t *lock);
+ extern int _raw_read_trylock(rwlock_t *lock);
+ extern void _raw_read_unlock(rwlock_t *lock);
+ extern void _raw_write_lock(rwlock_t *lock);
+ extern int _raw_write_trylock(rwlock_t *lock);
+ extern void _raw_write_unlock(rwlock_t *lock);
+#else
+# define _raw_spin_unlock(lock)		__raw_spin_unlock(&(lock)->raw_lock)
+# define _raw_spin_trylock(lock)	__raw_spin_trylock(&(lock)->raw_lock)
+# define _raw_spin_lock(lock)		__raw_spin_lock(&(lock)->raw_lock)
+# define _raw_spin_lock_flags(lock, flags) \
+		__raw_spin_lock_flags(&(lock)->raw_lock, *(flags))
+# define _raw_read_lock(rwlock)		__raw_read_lock(&(rwlock)->raw_lock)
+# define _raw_write_lock(rwlock)	__raw_write_lock(&(rwlock)->raw_lock)
+# define _raw_read_unlock(rwlock)	__raw_read_unlock(&(rwlock)->raw_lock)
+# define _raw_write_unlock(rwlock)	__raw_write_unlock(&(rwlock)->raw_lock)
+# define _raw_read_trylock(rwlock)	__raw_read_trylock(&(rwlock)->raw_lock)
+# define _raw_write_trylock(rwlock)	__raw_write_trylock(&(rwlock)->raw_lock)
+#endif
 
-#define _write_lock_irq(lock) \
-do { \
-	local_irq_disable(); \
-	preempt_disable(); \
-	_raw_write_lock(lock); \
-	__acquire(lock); \
-} while (0)
-
-#define _write_lock_bh(lock) \
-do { \
-	local_bh_disable(); \
-	preempt_disable(); \
-	_raw_write_lock(lock); \
-	__acquire(lock); \
-} while (0)
-
-#define _spin_unlock_irqrestore(lock, flags) \
-do { \
-	_raw_spin_unlock(lock); \
-	local_irq_restore(flags); \
-	preempt_enable(); \
-	__release(lock); \
-} while (0)
-
-#define _spin_unlock_irq(lock) \
-do { \
-	_raw_spin_unlock(lock); \
-	local_irq_enable(); \
-	preempt_enable(); \
-	__release(lock); \
-} while (0)
-
-#define _spin_unlock_bh(lock) \
-do { \
-	_raw_spin_unlock(lock); \
-	preempt_enable_no_resched(); \
-	local_bh_enable(); \
-	__release(lock); \
-} while (0)
-
-#define _write_unlock_bh(lock) \
-do { \
-	_raw_write_unlock(lock); \
-	preempt_enable_no_resched(); \
-	local_bh_enable(); \
-	__release(lock); \
-} while (0)
-
-#define _read_unlock_irqrestore(lock, flags) \
-do { \
-	_raw_read_unlock(lock); \
-	local_irq_restore(flags); \
-	preempt_enable(); \
-	__release(lock); \
-} while (0)
-
-#define _write_unlock_irqrestore(lock, flags) \
-do { \
-	_raw_write_unlock(lock); \
-	local_irq_restore(flags); \
-	preempt_enable(); \
-	__release(lock); \
-} while (0)
-
-#define _read_unlock_irq(lock)	\
-do { \
-	_raw_read_unlock(lock);	\
-	local_irq_enable();	\
-	preempt_enable();	\
-	__release(lock); \
-} while (0)
-
-#define _read_unlock_bh(lock)	\
-do { \
-	_raw_read_unlock(lock);	\
-	preempt_enable_no_resched();	\
-	local_bh_enable();	\
-	__release(lock); \
-} while (0)
-
-#define _write_unlock_irq(lock)	\
-do { \
-	_raw_write_unlock(lock);	\
-	local_irq_enable();	\
-	preempt_enable();	\
-	__release(lock); \
-} while (0)
-
-#endif /* !SMP */
+#define read_can_lock(rwlock)		__raw_read_can_lock(&(rwlock)->raw_lock)
+#define write_can_lock(rwlock)		__raw_write_can_lock(&(rwlock)->raw_lock)
 
 /*
  * Define the various spin_lock and rw_lock methods.  Note we define these
  * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
  * methods are defined as nops in the case they are not required.
  */
-#define spin_trylock(lock)	__cond_lock(_spin_trylock(lock))
-#define read_trylock(lock)	__cond_lock(_read_trylock(lock))
-#define write_trylock(lock)	__cond_lock(_write_trylock(lock))
+#define spin_trylock(lock)		__cond_lock(_spin_trylock(lock))
+#define read_trylock(lock)		__cond_lock(_read_trylock(lock))
+#define write_trylock(lock)		__cond_lock(_write_trylock(lock))
 
-#define spin_lock(lock)		_spin_lock(lock)
-#define write_lock(lock)	_write_lock(lock)
-#define read_lock(lock)		_read_lock(lock)
+#define spin_lock(lock)			_spin_lock(lock)
+#define write_lock(lock)		_write_lock(lock)
+#define read_lock(lock)			_read_lock(lock)
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 #define spin_lock_irqsave(lock, flags)	flags = _spin_lock_irqsave(lock)
 #define read_lock_irqsave(lock, flags)	flags = _read_lock_irqsave(lock)
 #define write_lock_irqsave(lock, flags)	flags = _write_lock_irqsave(lock)
@@ -470,137 +171,59 @@ do { \
 #define write_lock_irq(lock)		_write_lock_irq(lock)
 #define write_lock_bh(lock)		_write_lock_bh(lock)
 
-#define spin_unlock(lock)	_spin_unlock(lock)
-#define write_unlock(lock)	_write_unlock(lock)
-#define read_unlock(lock)	_read_unlock(lock)
+#define spin_unlock(lock)		_spin_unlock(lock)
+#define write_unlock(lock)		_write_unlock(lock)
+#define read_unlock(lock)		_read_unlock(lock)
 
-#define spin_unlock_irqrestore(lock, flags)	_spin_unlock_irqrestore(lock, flags)
+#define spin_unlock_irqrestore(lock, flags) \
+					_spin_unlock_irqrestore(lock, flags)
 #define spin_unlock_irq(lock)		_spin_unlock_irq(lock)
 #define spin_unlock_bh(lock)		_spin_unlock_bh(lock)
 
-#define read_unlock_irqrestore(lock, flags)	_read_unlock_irqrestore(lock, flags)
-#define read_unlock_irq(lock)			_read_unlock_irq(lock)
-#define read_unlock_bh(lock)			_read_unlock_bh(lock)
+#define read_unlock_irqrestore(lock, flags) \
+					_read_unlock_irqrestore(lock, flags)
+#define read_unlock_irq(lock)		_read_unlock_irq(lock)
+#define read_unlock_bh(lock)		_read_unlock_bh(lock)
 
-#define write_unlock_irqrestore(lock, flags)	_write_unlock_irqrestore(lock, flags)
-#define write_unlock_irq(lock)			_write_unlock_irq(lock)
-#define write_unlock_bh(lock)			_write_unlock_bh(lock)
+#define write_unlock_irqrestore(lock, flags) \
+					_write_unlock_irqrestore(lock, flags)
+#define write_unlock_irq(lock)		_write_unlock_irq(lock)
+#define write_unlock_bh(lock)		_write_unlock_bh(lock)
 
-#define spin_trylock_bh(lock)			__cond_lock(_spin_trylock_bh(lock))
+#define spin_trylock_bh(lock)		__cond_lock(_spin_trylock_bh(lock))
 
 #define spin_trylock_irq(lock) \
 ({ \
 	local_irq_disable(); \
 	_spin_trylock(lock) ? \
-	1 : ({local_irq_enable(); 0; }); \
+	1 : ({ local_irq_enable(); 0;  }); \
 })
 
 #define spin_trylock_irqsave(lock, flags) \
 ({ \
 	local_irq_save(flags); \
 	_spin_trylock(lock) ? \
-	1 : ({local_irq_restore(flags); 0;}); \
+	1 : ({ local_irq_restore(flags); 0; }); \
 })
 
-#ifdef CONFIG_LOCKMETER
-extern void _metered_spin_lock   (spinlock_t *lock);
-extern void _metered_spin_unlock (spinlock_t *lock);
-extern int  _metered_spin_trylock(spinlock_t *lock);
-extern void _metered_read_lock    (rwlock_t *lock);
-extern void _metered_read_unlock  (rwlock_t *lock);
-extern void _metered_write_lock   (rwlock_t *lock);
-extern void _metered_write_unlock (rwlock_t *lock);
-extern int  _metered_read_trylock (rwlock_t *lock);
-extern int  _metered_write_trylock(rwlock_t *lock);
-#endif
-
-/* "lock on reference count zero" */
-#ifndef ATOMIC_DEC_AND_LOCK
-#include <asm/atomic.h>
-extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
-#endif
-
-#define atomic_dec_and_lock(atomic,lock) __cond_lock(_atomic_dec_and_lock(atomic,lock))
-
-/*
- *  bit-based spin_lock()
- *
- * Don't use this unless you really need to: spin_lock() and spin_unlock()
- * are significantly faster.
- */
-static inline void bit_spin_lock(int bitnum, unsigned long *addr)
-{
-	/*
-	 * Assuming the lock is uncontended, this never enters
-	 * the body of the outer loop. If it is contended, then
-	 * within the inner loop a non-atomic test is used to
-	 * busywait with less bus contention for a good time to
-	 * attempt to acquire the lock bit.
-	 */
-	preempt_disable();
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-	while (test_and_set_bit(bitnum, addr)) {
-		while (test_bit(bitnum, addr)) {
-			preempt_enable();
-			cpu_relax();
-			preempt_disable();
-		}
-	}
-#endif
-	__acquire(bitlock);
-}
-
 /*
- * Return true if it was acquired
+ * Pull the atomic_t declaration:
+ * (asm-mips/atomic.h needs above definitions)
  */
-static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
-{
-	preempt_disable();	
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-	if (test_and_set_bit(bitnum, addr)) {
-		preempt_enable();
-		return 0;
-	}
-#endif
-	__acquire(bitlock);
-	return 1;
-}
-
-/*
- *  bit-based spin_unlock()
- */
-static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
-{
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-	BUG_ON(!test_bit(bitnum, addr));
-	smp_mb__before_clear_bit();
-	clear_bit(bitnum, addr);
-#endif
-	preempt_enable();
-	__release(bitlock);
-}
-
-/*
- * Return true if the lock is held.
+#include <asm/atomic.h>
+/**
+ * atomic_dec_and_lock - lock on reaching reference count zero
+ * @atomic: the atomic counter
+ * @lock: the spinlock in question
  */
-static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
-{
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-	return test_bit(bitnum, addr);
-#elif defined CONFIG_PREEMPT
-	return preempt_count();
-#else
-	return 1;
-#endif
-}
-
-#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED
-#define DEFINE_RWLOCK(x) rwlock_t x = RW_LOCK_UNLOCKED
+extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
+#define atomic_dec_and_lock(atomic, lock) \
+		__cond_lock(_atomic_dec_and_lock(atomic, lock))
 
 /**
  * spin_can_lock - would spin_trylock() succeed?
  * @lock: the spinlock in question.
  */
-#define spin_can_lock(lock)		(!spin_is_locked(lock))
+#define spin_can_lock(lock)	(!spin_is_locked(lock))
 
 #endif /* __LINUX_SPINLOCK_H */
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
new file mode 100644
index 00000000000..78e6989ffb5
--- /dev/null
+++ b/include/linux/spinlock_api_smp.h
@@ -0,0 +1,57 @@
+#ifndef __LINUX_SPINLOCK_API_SMP_H
+#define __LINUX_SPINLOCK_API_SMP_H
+
+#ifndef __LINUX_SPINLOCK_H
+# error "please don't include this file directly"
+#endif
+
+/*
+ * include/linux/spinlock_api_smp.h
+ *
+ * spinlock API declarations on SMP (and debug)
+ * (implemented in kernel/spinlock.c)
+ *
+ * portions Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ */
+
+int in_lock_functions(unsigned long addr);
+
+#define assert_spin_locked(x)	BUG_ON(!spin_is_locked(x))
+
+void __lockfunc _spin_lock(spinlock_t *lock)		__acquires(spinlock_t);
+void __lockfunc _read_lock(rwlock_t *lock)		__acquires(rwlock_t);
+void __lockfunc _write_lock(rwlock_t *lock)		__acquires(rwlock_t);
+void __lockfunc _spin_lock_bh(spinlock_t *lock)		__acquires(spinlock_t);
+void __lockfunc _read_lock_bh(rwlock_t *lock)		__acquires(rwlock_t);
+void __lockfunc _write_lock_bh(rwlock_t *lock)		__acquires(rwlock_t);
+void __lockfunc _spin_lock_irq(spinlock_t *lock)	__acquires(spinlock_t);
+void __lockfunc _read_lock_irq(rwlock_t *lock)		__acquires(rwlock_t);
+void __lockfunc _write_lock_irq(rwlock_t *lock)		__acquires(rwlock_t);
+unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+							__acquires(spinlock_t);
+unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
+							__acquires(rwlock_t);
+unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
+							__acquires(rwlock_t);
+int __lockfunc _spin_trylock(spinlock_t *lock);
+int __lockfunc _read_trylock(rwlock_t *lock);
+int __lockfunc _write_trylock(rwlock_t *lock);
+int __lockfunc _spin_trylock_bh(spinlock_t *lock);
+void __lockfunc _spin_unlock(spinlock_t *lock)		__releases(spinlock_t);
+void __lockfunc _read_unlock(rwlock_t *lock)		__releases(rwlock_t);
+void __lockfunc _write_unlock(rwlock_t *lock)		__releases(rwlock_t);
+void __lockfunc _spin_unlock_bh(spinlock_t *lock)	__releases(spinlock_t);
+void __lockfunc _read_unlock_bh(rwlock_t *lock)		__releases(rwlock_t);
+void __lockfunc _write_unlock_bh(rwlock_t *lock)	__releases(rwlock_t);
+void __lockfunc _spin_unlock_irq(spinlock_t *lock)	__releases(spinlock_t);
+void __lockfunc _read_unlock_irq(rwlock_t *lock)	__releases(rwlock_t);
+void __lockfunc _write_unlock_irq(rwlock_t *lock)	__releases(rwlock_t);
+void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+							__releases(spinlock_t);
+void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+							__releases(rwlock_t);
+void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+							__releases(rwlock_t);
+
+#endif /* __LINUX_SPINLOCK_API_SMP_H */
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
new file mode 100644
index 00000000000..cd81cee566f
--- /dev/null
+++ b/include/linux/spinlock_api_up.h
@@ -0,0 +1,80 @@
+#ifndef __LINUX_SPINLOCK_API_UP_H
+#define __LINUX_SPINLOCK_API_UP_H
+
+#ifndef __LINUX_SPINLOCK_H
+# error "please don't include this file directly"
+#endif
+
+/*
+ * include/linux/spinlock_api_up.h
+ *
+ * spinlock API implementation on UP-nondebug (inlined implementation)
+ *
+ * portions Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ */
+
+#define in_lock_functions(ADDR)		0
+
+#define assert_spin_locked(lock)	do { (void)(lock); } while (0)
+
+/*
+ * In the UP-nondebug case there's no real locking going on, so the
+ * only thing we have to do is to keep the preempt counts and irq
+ * flags straight, to supress compiler warnings of unused lock
+ * variables, and to add the proper checker annotations:
+ */
+#define __LOCK(lock) \
+  do { preempt_disable(); __acquire(lock); (void)(lock); } while (0)
+
+#define __LOCK_BH(lock) \
+  do { local_bh_disable(); __LOCK(lock); } while (0)
+
+#define __LOCK_IRQ(lock) \
+  do { local_irq_disable(); __LOCK(lock); } while (0)
+
+#define __LOCK_IRQSAVE(lock, flags) \
+  do { local_irq_save(flags); __LOCK(lock); } while (0)
+
+#define __UNLOCK(lock) \
+  do { preempt_enable(); __release(lock); (void)(lock); } while (0)
+
+#define __UNLOCK_BH(lock) \
+  do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0)
+
+#define __UNLOCK_IRQ(lock) \
+  do { local_irq_enable(); __UNLOCK(lock); } while (0)
+
+#define __UNLOCK_IRQRESTORE(lock, flags) \
+  do { local_irq_restore(flags); __UNLOCK(lock); } while (0)
+
+#define _spin_lock(lock)			__LOCK(lock)
+#define _read_lock(lock)			__LOCK(lock)
+#define _write_lock(lock)			__LOCK(lock)
+#define _spin_lock_bh(lock)			__LOCK_BH(lock)
+#define _read_lock_bh(lock)			__LOCK_BH(lock)
+#define _write_lock_bh(lock)			__LOCK_BH(lock)
+#define _spin_lock_irq(lock)			__LOCK_IRQ(lock)
+#define _read_lock_irq(lock)			__LOCK_IRQ(lock)
+#define _write_lock_irq(lock)			__LOCK_IRQ(lock)
+#define _spin_lock_irqsave(lock, flags)		__LOCK_IRQSAVE(lock, flags)
+#define _read_lock_irqsave(lock, flags)		__LOCK_IRQSAVE(lock, flags)
+#define _write_lock_irqsave(lock, flags)	__LOCK_IRQSAVE(lock, flags)
+#define _spin_trylock(lock)			({ __LOCK(lock); 1; })
+#define _read_trylock(lock)			({ __LOCK(lock); 1; })
+#define _write_trylock(lock)			({ __LOCK(lock); 1; })
+#define _spin_trylock_bh(lock)			({ __LOCK_BH(lock); 1; })
+#define _spin_unlock(lock)			__UNLOCK(lock)
+#define _read_unlock(lock)			__UNLOCK(lock)
+#define _write_unlock(lock)			__UNLOCK(lock)
+#define _spin_unlock_bh(lock)			__UNLOCK_BH(lock)
+#define _write_unlock_bh(lock)			__UNLOCK_BH(lock)
+#define _read_unlock_bh(lock)			__UNLOCK_BH(lock)
+#define _spin_unlock_irq(lock)			__UNLOCK_IRQ(lock)
+#define _read_unlock_irq(lock)			__UNLOCK_IRQ(lock)
+#define _write_unlock_irq(lock)			__UNLOCK_IRQ(lock)
+#define _spin_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
+#define _read_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
+#define _write_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
+
+#endif /* __LINUX_SPINLOCK_API_UP_H */
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
new file mode 100644
index 00000000000..9cb51e07039
--- /dev/null
+++ b/include/linux/spinlock_types.h
@@ -0,0 +1,67 @@
+#ifndef __LINUX_SPINLOCK_TYPES_H
+#define __LINUX_SPINLOCK_TYPES_H
+
+/*
+ * include/linux/spinlock_types.h - generic spinlock type definitions
+ *                                  and initializers
+ *
+ * portions Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ */
+
+#if defined(CONFIG_SMP)
+# include <asm/spinlock_types.h>
+#else
+# include <linux/spinlock_types_up.h>
+#endif
+
+typedef struct {
+	raw_spinlock_t raw_lock;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
+	unsigned int break_lock;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+	unsigned int magic, owner_cpu;
+	void *owner;
+#endif
+} spinlock_t;
+
+#define SPINLOCK_MAGIC		0xdead4ead
+
+typedef struct {
+	raw_rwlock_t raw_lock;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
+	unsigned int break_lock;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+	unsigned int magic, owner_cpu;
+	void *owner;
+#endif
+} rwlock_t;
+
+#define RWLOCK_MAGIC		0xdeaf1eed
+
+#define SPINLOCK_OWNER_INIT	((void *)-1L)
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+# define SPIN_LOCK_UNLOCKED						\
+	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
+				.magic = SPINLOCK_MAGIC,		\
+				.owner = SPINLOCK_OWNER_INIT,		\
+				.owner_cpu = -1 }
+#define RW_LOCK_UNLOCKED						\
+	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
+				.magic = RWLOCK_MAGIC,			\
+				.owner = SPINLOCK_OWNER_INIT,		\
+				.owner_cpu = -1 }
+#else
+# define SPIN_LOCK_UNLOCKED \
+	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED }
+#define RW_LOCK_UNLOCKED \
+	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED }
+#endif
+
+#define DEFINE_SPINLOCK(x)	spinlock_t x = SPIN_LOCK_UNLOCKED
+#define DEFINE_RWLOCK(x)	rwlock_t x = RW_LOCK_UNLOCKED
+
+#endif /* __LINUX_SPINLOCK_TYPES_H */
diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
new file mode 100644
index 00000000000..def2d173a8d
--- /dev/null
+++ b/include/linux/spinlock_types_up.h
@@ -0,0 +1,51 @@
+#ifndef __LINUX_SPINLOCK_TYPES_UP_H
+#define __LINUX_SPINLOCK_TYPES_UP_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+/*
+ * include/linux/spinlock_types_up.h - spinlock type definitions for UP
+ *
+ * portions Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ */
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+
+typedef struct {
+	volatile unsigned int slock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED { 1 }
+
+#else
+
+/*
+ * All gcc 2.95 versions and early versions of 2.96 have a nasty bug
+ * with empty initializers.
+ */
+#if (__GNUC__ > 2)
+typedef struct { } raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED { }
+#else
+typedef struct { int gcc_is_buggy; } raw_spinlock_t;
+#define __RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) { 0 }
+#endif
+
+#endif
+
+#if (__GNUC__ > 2)
+typedef struct {
+	/* no debug version on UP */
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED { }
+#else
+typedef struct { int gcc_is_buggy; } raw_rwlock_t;
+#define __RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) { 0 }
+#endif
+
+#endif /* __LINUX_SPINLOCK_TYPES_UP_H */
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
new file mode 100644
index 00000000000..31accf2f0b1
--- /dev/null
+++ b/include/linux/spinlock_up.h
@@ -0,0 +1,74 @@
+#ifndef __LINUX_SPINLOCK_UP_H
+#define __LINUX_SPINLOCK_UP_H
+
+#ifndef __LINUX_SPINLOCK_H
+# error "please don't include this file directly"
+#endif
+
+/*
+ * include/linux/spinlock_up.h - UP-debug version of spinlocks.
+ *
+ * portions Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ *
+ * In the debug case, 1 means unlocked, 0 means locked. (the values
+ * are inverted, to catch initialization bugs)
+ *
+ * No atomicity anywhere, we are on UP.
+ */
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+
+#define __raw_spin_is_locked(x)		((x)->slock == 0)
+
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+	lock->slock = 0;
+}
+
+static inline void
+__raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+{
+	local_irq_save(flags);
+	lock->slock = 0;
+}
+
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+	char oldval = lock->slock;
+
+	lock->slock = 0;
+
+	return oldval > 0;
+}
+
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+	lock->slock = 1;
+}
+
+/*
+ * Read-write spinlocks. No debug version.
+ */
+#define __raw_read_lock(lock)		do { (void)(lock); } while (0)
+#define __raw_write_lock(lock)		do { (void)(lock); } while (0)
+#define __raw_read_trylock(lock)	({ (void)(lock); 1; })
+#define __raw_write_trylock(lock)	({ (void)(lock); 1; })
+#define __raw_read_unlock(lock)		do { (void)(lock); } while (0)
+#define __raw_write_unlock(lock)	do { (void)(lock); } while (0)
+
+#else /* DEBUG_SPINLOCK */
+#define __raw_spin_is_locked(lock)	((void)(lock), 0)
+/* for sched.c and kernel_lock.c: */
+# define __raw_spin_lock(lock)		do { (void)(lock); } while (0)
+# define __raw_spin_unlock(lock)	do { (void)(lock); } while (0)
+# define __raw_spin_trylock(lock)	({ (void)(lock); 1; })
+#endif /* DEBUG_SPINLOCK */
+
+#define __raw_read_can_lock(lock)	(((void)(lock), 1))
+#define __raw_write_can_lock(lock)	(((void)(lock), 1))
+
+#define __raw_spin_unlock_wait(lock) \
+		do { cpu_relax(); } while (__raw_spin_is_locked(lock))
+
+#endif /* __LINUX_SPINLOCK_UP_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 8d57a2f1226..ff4dc02ce17 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
diff --git a/kernel/sched.c b/kernel/sched.c
index 2632b812cf2..15db82116e1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1511,6 +1511,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_task_flags = prev->flags;
+#ifdef CONFIG_DEBUG_SPINLOCK
+	/* this is a valid case when another task releases the spinlock */
+	rq->lock.owner = current;
+#endif
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 	if (mm)
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0c3f9d8bbe1..0375fcd5921 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -3,7 +3,10 @@
  *
  * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
  *
- * Copyright (2004) Ingo Molnar
+ * Copyright (2004, 2005) Ingo Molnar
+ *
+ * This file contains the spinlock/rwlock implementations for the
+ * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
  */
 
 #include <linux/config.h>
@@ -17,12 +20,12 @@
  * Generic declaration of the raw read_trylock() function,
  * architectures are supposed to optimize this:
  */
-int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
 {
-	_raw_read_lock(lock);
+	__raw_read_lock(lock);
 	return 1;
 }
-EXPORT_SYMBOL(generic_raw_read_trylock);
+EXPORT_SYMBOL(generic__raw_read_trylock);
 
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock)
 }
 EXPORT_SYMBOL(_write_trylock);
 
-#ifndef CONFIG_PREEMPT
+#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
 
 void __lockfunc _read_lock(rwlock_t *lock)
 {
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 
 	local_irq_save(flags);
 	preempt_disable();
-	_raw_spin_lock_flags(lock, flags);
+	_raw_spin_lock_flags(lock, &flags);
 	return flags;
 }
 EXPORT_SYMBOL(_spin_lock_irqsave);
diff --git a/lib/Makefile b/lib/Makefile
index d9c38ba05e7..44a46750690 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -16,6 +16,7 @@ CFLAGS_kobject.o += -DDEBUG
 CFLAGS_kobject_uevent.o += -DDEBUG
 endif
 
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c
index 6658d81e183..2377af057d0 100644
--- a/lib/dec_and_lock.c
+++ b/lib/dec_and_lock.c
@@ -25,8 +25,6 @@
  * this is trivially done efficiently using a load-locked
  * store-conditional approach, for example.
  */
-
-#ifndef ATOMIC_DEC_AND_LOCK
 int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 {
 	spin_lock(lock);
@@ -37,4 +35,3 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 }
 
 EXPORT_SYMBOL(_atomic_dec_and_lock);
-#endif
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index bd2bc5d887b..cb5490ec00f 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -177,8 +177,7 @@ static inline void __lock_kernel(void)
 
 static inline void __unlock_kernel(void)
 {
-	_raw_spin_unlock(&kernel_flag);
-	preempt_enable();
+	spin_unlock(&kernel_flag);
 }
 
 /*
diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c
new file mode 100644
index 00000000000..906ad101eab
--- /dev/null
+++ b/lib/spinlock_debug.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ *
+ * This file contains the spinlock/rwlock implementations for
+ * DEBUG_SPINLOCK.
+ */
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+
+static void spin_bug(spinlock_t *lock, const char *msg)
+{
+	static long print_once = 1;
+	struct task_struct *owner = NULL;
+
+	if (xchg(&print_once, 0)) {
+		if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
+			owner = lock->owner;
+		printk("BUG: spinlock %s on CPU#%d, %s/%d\n",
+			msg, smp_processor_id(), current->comm, current->pid);
+		printk(" lock: %p, .magic: %08x, .owner: %s/%d, .owner_cpu: %d\n",
+			lock, lock->magic,
+			owner ? owner->comm : "<none>",
+			owner ? owner->pid : -1,
+			lock->owner_cpu);
+		dump_stack();
+#ifdef CONFIG_SMP
+		/*
+		 * We cannot continue on SMP:
+		 */
+//		panic("bad locking");
+#endif
+	}
+}
+
+#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
+
+static inline void debug_spin_lock_before(spinlock_t *lock)
+{
+	SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
+	SPIN_BUG_ON(lock->owner == current, lock, "recursion");
+	SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+							lock, "cpu recursion");
+}
+
+static inline void debug_spin_lock_after(spinlock_t *lock)
+{
+	lock->owner_cpu = raw_smp_processor_id();
+	lock->owner = current;
+}
+
+static inline void debug_spin_unlock(spinlock_t *lock)
+{
+	SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
+	SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked");
+	SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
+	SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
+							lock, "wrong CPU");
+	lock->owner = SPINLOCK_OWNER_INIT;
+	lock->owner_cpu = -1;
+}
+
+static void __spin_lock_debug(spinlock_t *lock)
+{
+	int print_once = 1;
+	u64 i;
+
+	for (;;) {
+		for (i = 0; i < loops_per_jiffy * HZ; i++) {
+			cpu_relax();
+			if (__raw_spin_trylock(&lock->raw_lock))
+				return;
+		}
+		/* lockup suspected: */
+		if (print_once) {
+			print_once = 0;
+			printk("BUG: spinlock lockup on CPU#%d, %s/%d, %p\n",
+				smp_processor_id(), current->comm, current->pid,
+					lock);
+			dump_stack();
+		}
+	}
+}
+
+void _raw_spin_lock(spinlock_t *lock)
+{
+	debug_spin_lock_before(lock);
+	if (unlikely(!__raw_spin_trylock(&lock->raw_lock)))
+		__spin_lock_debug(lock);
+	debug_spin_lock_after(lock);
+}
+
+int _raw_spin_trylock(spinlock_t *lock)
+{
+	int ret = __raw_spin_trylock(&lock->raw_lock);
+
+	if (ret)
+		debug_spin_lock_after(lock);
+#ifndef CONFIG_SMP
+	/*
+	 * Must not happen on UP:
+	 */
+	SPIN_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+	return ret;
+}
+
+void _raw_spin_unlock(spinlock_t *lock)
+{
+	debug_spin_unlock(lock);
+	__raw_spin_unlock(&lock->raw_lock);
+}
+
+static void rwlock_bug(rwlock_t *lock, const char *msg)
+{
+	static long print_once = 1;
+
+	if (xchg(&print_once, 0)) {
+		printk("BUG: rwlock %s on CPU#%d, %s/%d, %p\n", msg,
+			smp_processor_id(), current->comm, current->pid, lock);
+		dump_stack();
+#ifdef CONFIG_SMP
+		/*
+		 * We cannot continue on SMP:
+		 */
+		panic("bad locking");
+#endif
+	}
+}
+
+#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
+
+static void __read_lock_debug(rwlock_t *lock)
+{
+	int print_once = 1;
+	u64 i;
+
+	for (;;) {
+		for (i = 0; i < loops_per_jiffy * HZ; i++) {
+			cpu_relax();
+			if (__raw_read_trylock(&lock->raw_lock))
+				return;
+		}
+		/* lockup suspected: */
+		if (print_once) {
+			print_once = 0;
+			printk("BUG: read-lock lockup on CPU#%d, %s/%d, %p\n",
+				smp_processor_id(), current->comm, current->pid,
+					lock);
+			dump_stack();
+		}
+	}
+}
+
+void _raw_read_lock(rwlock_t *lock)
+{
+	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+	if (unlikely(!__raw_read_trylock(&lock->raw_lock)))
+		__read_lock_debug(lock);
+}
+
+int _raw_read_trylock(rwlock_t *lock)
+{
+	int ret = __raw_read_trylock(&lock->raw_lock);
+
+#ifndef CONFIG_SMP
+	/*
+	 * Must not happen on UP:
+	 */
+	RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+	return ret;
+}
+
+void _raw_read_unlock(rwlock_t *lock)
+{
+	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+	__raw_read_unlock(&lock->raw_lock);
+}
+
+static inline void debug_write_lock_before(rwlock_t *lock)
+{
+	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+	RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
+	RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+							lock, "cpu recursion");
+}
+
+static inline void debug_write_lock_after(rwlock_t *lock)
+{
+	lock->owner_cpu = raw_smp_processor_id();
+	lock->owner = current;
+}
+
+static inline void debug_write_unlock(rwlock_t *lock)
+{
+	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+	RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
+	RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
+							lock, "wrong CPU");
+	lock->owner = SPINLOCK_OWNER_INIT;
+	lock->owner_cpu = -1;
+}
+
+static void __write_lock_debug(rwlock_t *lock)
+{
+	int print_once = 1;
+	u64 i;
+
+	for (;;) {
+		for (i = 0; i < loops_per_jiffy * HZ; i++) {
+			cpu_relax();
+			if (__raw_write_trylock(&lock->raw_lock))
+				return;
+		}
+		/* lockup suspected: */
+		if (print_once) {
+			print_once = 0;
+			printk("BUG: write-lock lockup on CPU#%d, %s/%d, %p\n",
+				smp_processor_id(), current->comm, current->pid,
+					lock);
+			dump_stack();
+		}
+	}
+}
+
+void _raw_write_lock(rwlock_t *lock)
+{
+	debug_write_lock_before(lock);
+	if (unlikely(!__raw_write_trylock(&lock->raw_lock)))
+		__write_lock_debug(lock);
+	debug_write_lock_after(lock);
+}
+
+int _raw_write_trylock(rwlock_t *lock)
+{
+	int ret = __raw_write_trylock(&lock->raw_lock);
+
+	if (ret)
+		debug_write_lock_after(lock);
+#ifndef CONFIG_SMP
+	/*
+	 * Must not happen on UP:
+	 */
+	RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+	return ret;
+}
+
+void _raw_write_unlock(rwlock_t *lock)
+{
+	debug_write_unlock(lock);
+	__raw_write_unlock(&lock->raw_lock);
+}
-- 
cgit v1.2.3-70-g09d2


From d79fc0fc6645b0cf5cd980da76942ca6d6300fa4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 10 Sep 2005 00:26:12 -0700
Subject: [PATCH] sched: TASK_NONINTERACTIVE

This patch implements a task state bit (TASK_NONINTERACTIVE), which can be
used by blocking points to mark the task's wait as "non-interactive".  This
does not mean the task will be considered a CPU-hog - the wait will simply
not have an effect on the waiting task's priority - positive or negative
alike.  Right now only pipe_wait() will make use of it, because it's a
common source of not-so-interactive waits (kernel compilation jobs, etc.).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/pipe.c             |  6 +++++-
 include/linux/sched.h |  1 +
 kernel/sched.c        | 11 ++++++++++-
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 2c7a23dde2d..66aa0b938d6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -39,7 +39,11 @@ void pipe_wait(struct inode * inode)
 {
 	DEFINE_WAIT(wait);
 
-	prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE);
+	/*
+	 * Pipes are system-local resources, so sleeping on them
+	 * is considered a noninteractive wait:
+	 */
+	prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
 	up(PIPE_SEM(*inode));
 	schedule();
 	finish_wait(PIPE_WAIT(*inode), &wait);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8a1fcfe80fc..ac70f845b5b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -114,6 +114,7 @@ extern unsigned long nr_iowait(void);
 #define TASK_TRACED		8
 #define EXIT_ZOMBIE		16
 #define EXIT_DEAD		32
+#define TASK_NONINTERACTIVE	64
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
diff --git a/kernel/sched.c b/kernel/sched.c
index 24eed372d28..6da13bba3e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1260,6 +1260,16 @@ out_activate:
 		p->activated = -1;
 	}
 
+	/*
+	 * Tasks that have marked their sleep as noninteractive get
+	 * woken up without updating their sleep average. (i.e. their
+	 * sleep is handled in a priority-neutral manner, no priority
+	 * boost and no penalty.)
+	 */
+	if (old_state & TASK_NONINTERACTIVE)
+		__activate_task(p, rq);
+	else
+		activate_task(p, rq, cpu == this_cpu);
 	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
@@ -1268,7 +1278,6 @@ out_activate:
 	 * the waker guarantees that the freshly woken up task is going
 	 * to be considered on this CPU.)
 	 */
-	activate_task(p, rq, cpu == this_cpu);
 	if (!sync || cpu != this_cpu) {
 		if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
-- 
cgit v1.2.3-70-g09d2


From 216d81bb35fc50923993462cc4fbc7029f9be1a9 Mon Sep 17 00:00:00 2001
From: Domen Puncer <domen@coderock.org>
Date: Sat, 10 Sep 2005 00:27:05 -0700
Subject: [PATCH] janitor: jffs/intrep: list_for_each_entry

Use list_for_each_entry to make code more readable.

Signed-off-by: Maximilian Attems <janitor@sternwelten.at>
Signed-off-by: Domen Puncer <domen@coderock.org>
Cc: <jffs-dev@axis.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jffs/intrep.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 456d7e6e29c..27f199e94cf 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -1701,12 +1701,10 @@ jffs_find_file(struct jffs_control *c, __u32 ino)
 {
 	struct jffs_file *f;
 	int i = ino % c->hash_len;
-	struct list_head *tmp;
 
 	D3(printk("jffs_find_file(): ino: %u\n", ino));
 
-	for (tmp = c->hash[i].next; tmp != &c->hash[i]; tmp = tmp->next) {
-		f = list_entry(tmp, struct jffs_file, hash);
+	list_for_each_entry(f, &c->hash[i], hash) {
 		if (ino != f->ino)
 			continue;
 		D3(printk("jffs_find_file(): Found file with ino "
@@ -2102,13 +2100,12 @@ jffs_foreach_file(struct jffs_control *c, int (*func)(struct jffs_file *))
 	int result = 0;
 
 	for (pos = 0; pos < c->hash_len; pos++) {
-		struct list_head *p, *next;
-		for (p = c->hash[pos].next; p != &c->hash[pos]; p = next) {
-			/* We need a reference to the next file in the
-			   list because `func' might remove the current
-			   file `f'.  */
-			next = p->next;
-			r = func(list_entry(p, struct jffs_file, hash));
+		struct jffs_file *f, *next;
+
+		/* We must do _safe, because 'func' might remove the
+		   current file 'f' from the list.  */
+		list_for_each_entry_safe(f, next, &c->hash[pos], hash) {
+			r = func(f);
 			if (r < 0)
 				return r;
 			result += r;
@@ -2613,9 +2610,8 @@ jffs_print_hash_table(struct jffs_control *c)
 
 	printk("JFFS: Dumping the file system's hash table...\n");
 	for (i = 0; i < c->hash_len; i++) {
-		struct list_head *p;
-		for (p = c->hash[i].next; p != &c->hash[i]; p = p->next) {
-			struct jffs_file *f=list_entry(p,struct jffs_file,hash);
+		struct jffs_file *f;
+		list_for_each_entry(f, &c->hash[i], hash) {
 			printk("*** c->hash[%u]: \"%s\" "
 			       "(ino: %u, pino: %u)\n",
 			       i, (f->name ? f->name : ""),
-- 
cgit v1.2.3-70-g09d2


From fdadd65fbce0ff966cb8e74247d9636f52a7fc7a Mon Sep 17 00:00:00 2001
From: Domen Puncer <domen@coderock.org>
Date: Sat, 10 Sep 2005 00:27:07 -0700
Subject: [PATCH] janitor: fs/namespace.c: list_for_each_entry

Make code more readable with list_for_each_entry.

Signed-off-by: Maximilian Attems <janitor@sternwelten.at>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 34156260c9b..2fa9fdf7d6f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -537,7 +537,6 @@ lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
 static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct vfsmount *res, *p, *q, *r, *s;
-	struct list_head *h;
 	struct nameidata nd;
 
 	res = q = clone_mnt(mnt, dentry);
@@ -546,8 +545,7 @@ static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
 	q->mnt_mountpoint = mnt->mnt_mountpoint;
 
 	p = mnt;
-	for (h = mnt->mnt_mounts.next; h != &mnt->mnt_mounts; h = h->next) {
-		r = list_entry(h, struct vfsmount, mnt_child);
+	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
 		if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry))
 			continue;
 
-- 
cgit v1.2.3-70-g09d2


From 0cdca3f9806a3dbaa07b5e8175000cd513ba92d4 Mon Sep 17 00:00:00 2001
From: Domen Puncer <domen@coderock.org>
Date: Sat, 10 Sep 2005 00:27:07 -0700
Subject: [PATCH] janitor: fs/dcache.c: list_for_each*

First one is list_for_each_entry (thanks maks), second 2 list_for_each_safe.

Signed-off-by: Maximilian Attems <janitor@sternwelten.at>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index a15a2e1f552..7376b61269f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -337,12 +337,10 @@ struct dentry * d_find_alias(struct inode *inode)
  */
 void d_prune_aliases(struct inode *inode)
 {
-	struct list_head *tmp, *head = &inode->i_dentry;
+	struct dentry *dentry;
 restart:
 	spin_lock(&dcache_lock);
-	tmp = head;
-	while ((tmp = tmp->next) != head) {
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_alias);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (!atomic_read(&dentry->d_count)) {
 			__dget_locked(dentry);
@@ -463,10 +461,7 @@ void shrink_dcache_sb(struct super_block * sb)
 	 * superblock to the most recent end of the unused list.
 	 */
 	spin_lock(&dcache_lock);
-	next = dentry_unused.next;
-	while (next != &dentry_unused) {
-		tmp = next;
-		next = tmp->next;
+	list_for_each_safe(tmp, next, &dentry_unused) {
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
@@ -478,10 +473,7 @@ void shrink_dcache_sb(struct super_block * sb)
 	 * Pass two ... free the dentries for this superblock.
 	 */
 repeat:
-	next = dentry_unused.next;
-	while (next != &dentry_unused) {
-		tmp = next;
-		next = tmp->next;
+	list_for_each_safe(tmp, next, &dentry_unused) {
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
-- 
cgit v1.2.3-70-g09d2


From ea0e0a4f53a75ed9d0812352c0410f6fc2a0b62a Mon Sep 17 00:00:00 2001
From: James Lamanna <jlamanna@gmail.com>
Date: Sat, 10 Sep 2005 00:27:16 -0700
Subject: [PATCH] janitor: reiserfs: super.c - vfree() checking cleanups

super.c vfree() checking cleanups.

Signed-off by: James Lamanna <jlamanna@gmail.com>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 6951c35755b..44b02fc02eb 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1934,8 +1934,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 			if (SB_AP_BITMAP(s))
 				brelse(SB_AP_BITMAP(s)[j].bh);
 		}
-		if (SB_AP_BITMAP(s))
-			vfree(SB_AP_BITMAP(s));
+		vfree(SB_AP_BITMAP(s));
 	}
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
-- 
cgit v1.2.3-70-g09d2


From e711700a0e6a6824fcfd5519d6b6982850a648ee Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 10 Sep 2005 00:27:20 -0700
Subject: [PATCH] fs/cramfs/uncompress.c should #include <linux/cramfs_fs.h>

Every file should #include the header with the prototypes of the global
functions it is offering.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cramfs/uncompress.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 5034365b06a..8def89f2c43 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -19,6 +19,7 @@
 #include <linux/errno.h>
 #include <linux/vmalloc.h>
 #include <linux/zlib.h>
+#include <linux/cramfs_fs.h>
 
 static z_stream stream;
 static int initialized;
-- 
cgit v1.2.3-70-g09d2


From 041e0e3b1970c508dc9a95b7dd9dc86271a7d7ac Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan <nacc@us.ibm.com>
Date: Sat, 10 Sep 2005 00:27:23 -0700
Subject: [PATCH] fs: fix-up schedule_timeout() usage

Use schedule_timeout_{,un}interruptible() instead of
set_current_state()/schedule_timeout() to reduce kernel size.  Also use helper
functions to convert between human time units and jiffies rather than constant
HZ division to avoid rounding errors.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cifs/connect.c            |  6 ++----
 fs/jbd/transaction.c         |  3 +--
 fs/lockd/clntproc.c          |  3 +--
 fs/nfs/nfs3proc.c            |  3 +--
 fs/nfs/nfs4proc.c            | 12 ++++--------
 fs/reiserfs/journal.c        |  3 +--
 fs/smbfs/proc.c              |  3 +--
 fs/xfs/linux-2.6/time.h      |  3 +--
 fs/xfs/linux-2.6/xfs_buf.c   |  6 +++---
 fs/xfs/linux-2.6/xfs_super.c | 12 ++++++------
 10 files changed, 21 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3217ac5f6bd..2335f14a158 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3215,10 +3215,8 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	}
 	
 	cifs_sb->tcon = NULL;
-	if (ses) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ / 2);
-	}
+	if (ses)
+		schedule_timeout_interruptible(msecs_to_jiffies(500));
 	if (ses)
 		sesInfoFree(ses);
 
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c6ec66fd876..49bbc2be3d7 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1340,8 +1340,7 @@ int journal_stop(handle_t *handle)
 	if (handle->h_sync) {
 		do {
 			old_handle_count = transaction->t_handle_count;
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			schedule_timeout(1);
+			schedule_timeout_uninterruptible(1);
 		} while (old_handle_count != transaction->t_handle_count);
 	}
 
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 14b3ce87fa2..87332f30141 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -299,8 +299,7 @@ nlmclnt_alloc_call(void)
 			return call;
 		}
 		printk("nlmclnt_alloc_call: failed, waiting for memory\n");
-		current->state = TASK_INTERRUPTIBLE;
-		schedule_timeout(5*HZ);
+		schedule_timeout_interruptible(5*HZ);
 	}
 	return NULL;
 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2681485cf2d..edc95514046 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -34,8 +34,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EJUKEBOX)
 			break;
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
+		schedule_timeout_interruptible(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
 	} while (!signalled());
 	rpc_clnt_sigunmask(clnt, &oldset);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0c5a308e496..9701ca8c942 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2418,14 +2418,11 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 		*timeout = NFS4_POLL_RETRY_MAX;
 	rpc_clnt_sigmask(clnt, &oldset);
 	if (clnt->cl_intr) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(*timeout);
+		schedule_timeout_interruptible(*timeout);
 		if (signalled())
 			res = -ERESTARTSYS;
-	} else {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(*timeout);
-	}
+	} else
+		schedule_timeout_uninterruptible(*timeout);
 	rpc_clnt_sigunmask(clnt, &oldset);
 	*timeout <<= 1;
 	return res;
@@ -2578,8 +2575,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-	current->state = TASK_INTERRUPTIBLE;
-	schedule_timeout(timeout);
+	schedule_timeout_interruptible(timeout);
 	timeout <<= 1;
 	if (timeout > NFS4_LOCK_MAXTIMEOUT)
 		return NFS4_LOCK_MAXTIMEOUT;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index a8e29e9bbbd..4b15761434b 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2868,8 +2868,7 @@ static void let_transaction_grow(struct super_block *sb, unsigned long trans_id)
 	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	unsigned long bcount = journal->j_bcount;
 	while (1) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(1);
+		schedule_timeout_uninterruptible(1);
 		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
 		while ((atomic_read(&journal->j_wcount) > 0 ||
 			atomic_read(&journal->j_jlock)) &&
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 220babe91ef..38ab558835c 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -2397,8 +2397,7 @@ smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
 		if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
 			/* a damn Win95 bug - sometimes it clags if you 
 			   ask it too fast */
-			current->state = TASK_INTERRUPTIBLE;
-			schedule_timeout(HZ/5);
+			schedule_timeout_interruptible(msecs_to_jiffies(200));
 			continue;
                 }
 
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h
index 6c6fd0faa8e..b0d2873ab27 100644
--- a/fs/xfs/linux-2.6/time.h
+++ b/fs/xfs/linux-2.6/time.h
@@ -39,8 +39,7 @@ typedef struct timespec timespec_t;
 
 static inline void delay(long ticks)
 {
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	schedule_timeout(ticks);
+	schedule_timeout_uninterruptible(ticks);
 }
 
 static inline void nanotime(struct timespec *tvp)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 655bf4a78af..e82cf72ac59 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1780,10 +1780,10 @@ xfsbufd(
 			xfsbufd_force_sleep = 0;
 		}
 
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);
+		schedule_timeout_interruptible
+			(xfs_buf_timer_centisecs * msecs_to_jiffies(10));
 
-		age = (xfs_buf_age_centisecs * HZ) / 100;
+		age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
 		spin_lock(&pbd_delwrite_lock);
 		list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
 			PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 0da87bfc999..2302454d8d4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -467,7 +467,7 @@ xfs_flush_inode(
 
 	igrab(inode);
 	xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work);
-	delay(HZ/2);
+	delay(msecs_to_jiffies(500));
 }
 
 /*
@@ -492,7 +492,7 @@ xfs_flush_device(
 
 	igrab(inode);
 	xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work);
-	delay(HZ/2);
+	delay(msecs_to_jiffies(500));
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
 
@@ -520,10 +520,9 @@ xfssyncd(
 	struct vfs_sync_work	*work, *n;
 	LIST_HEAD		(tmp);
 
-	timeleft = (xfs_syncd_centisecs * HZ) / 100;
+	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
 	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		timeleft = schedule_timeout(timeleft);
+		timeleft = schedule_timeout_interruptible(timeleft);
 		/* swsusp */
 		try_to_freeze();
 		if (kthread_should_stop())
@@ -537,7 +536,8 @@ xfssyncd(
 		 */
 		if (!timeleft || list_empty(&vfsp->vfs_sync_list)) {
 			if (!timeleft)
-				timeleft = (xfs_syncd_centisecs * HZ) / 100;
+				timeleft = xfs_syncd_centisecs *
+							msecs_to_jiffies(10);
 			INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list);
 			list_add_tail(&vfsp->vfs_sync_work.w_list,
 					&vfsp->vfs_sync_list);
-- 
cgit v1.2.3-70-g09d2


From 5d46770f5f8bb0eff0a82596860958be13e7baf1 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Mon, 12 Sep 2005 14:33:47 +0100
Subject: NTFS: Change the mount options {u,f,d}mask to always parse the number
 as       an octal number to conform to how chmod(1) works, too.  Thanks to   
    Giuseppe Bilotta and Horst von Brand for pointing out the errors of      
 my ways.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  6 +++++-
 fs/ntfs/super.c   | 14 +++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index e4fd6134244..7f400724289 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -90,7 +90,11 @@ ToDo/Notes:
 	  in the first buffer head instead of a driver global spin lock to
 	  improve scalability.
 	- Minor fix to error handling and error message display in
-	  fs/ntfs/aops.c::ntfs_prepare_nonresident_write(). 
+	  fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
+	- Change the mount options {u,f,d}mask to always parse the number as
+	  an octal number to conform to how chmod(1) works, too.  Thanks to
+	  Giuseppe Bilotta and Horst von Brand for pointing out the errors of
+	  my ways.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index b2b39296126..453d0d51ea4 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -126,6 +126,14 @@ static BOOL parse_options(ntfs_volume *vol, char *opt)
 		if (*v)							\
 			goto needs_val;					\
 	}
+#define NTFS_GETOPT_OCTAL(option, variable)				\
+	if (!strcmp(p, option)) {					\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		variable = simple_strtoul(ov = v, &v, 8);		\
+		if (*v)							\
+			goto needs_val;					\
+	}
 #define NTFS_GETOPT_BOOL(option, variable)				\
 	if (!strcmp(p, option)) {					\
 		BOOL val;						\
@@ -157,9 +165,9 @@ static BOOL parse_options(ntfs_volume *vol, char *opt)
 			*v++ = 0;
 		NTFS_GETOPT("uid", uid)
 		else NTFS_GETOPT("gid", gid)
-		else NTFS_GETOPT("umask", fmask = dmask)
-		else NTFS_GETOPT("fmask", fmask)
-		else NTFS_GETOPT("dmask", dmask)
+		else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
+		else NTFS_GETOPT_OCTAL("fmask", fmask)
+		else NTFS_GETOPT_OCTAL("dmask", dmask)
 		else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
 		else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, TRUE)
 		else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
-- 
cgit v1.2.3-70-g09d2


From 89ecf38c7aee6eb3f6aaf40a6d196ddff4b6d4a8 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Mon, 12 Sep 2005 15:43:03 +0100
Subject: NTFS: Mask out __GFP_HIGHMEM when doing kmalloc() in __ntfs_malloc()
 as it       otherwise causes a BUG().

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog | 3 ---
 fs/ntfs/malloc.h  | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 7f400724289..49eafbdb15c 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -34,9 +34,6 @@ ToDo/Notes:
 	  journals with two different restart pages.  We sanity check both and
 	  either use the only sane one or the more recent one of the two in the
 	  case that both are valid.
-	- Modify fs/ntfs/malloc.h::ntfs_malloc_nofs() to do the kmalloc() based
-	  allocations with __GFP_HIGHMEM, analogous to how the vmalloc() based
-	  allocations are done.
 	- Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
 	  ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
 	  hence cannot fail.
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index 9994e019a3c..3288bcc2c4a 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -45,7 +45,7 @@ static inline void *__ntfs_malloc(unsigned long size,
 	if (likely(size <= PAGE_SIZE)) {
 		BUG_ON(!size);
 		/* kmalloc() has per-CPU caches so is faster for now. */
-		return kmalloc(PAGE_SIZE, gfp_mask);
+		return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
 		/* return (void *)__get_free_page(gfp_mask); */
 	}
 	if (likely(size >> PAGE_SHIFT < num_physpages))
-- 
cgit v1.2.3-70-g09d2